[Snippets] Move BrgemmCopyB repacking logic outside the Subgraph #27007

Draft: wants to merge 6 commits into base: master
@@ -157,6 +157,12 @@ class RuntimeConfigurator {
*/
std::vector<std::vector<size_t>> extract_layouts() const;

static void compute_offsets(const ov::snippets::VectorDims& shape,
ov::snippets::VectorDims& offsets,
size_t offsets_size,
size_t dim_step,
size_t idx_stride);

class MHAParallelWAOptimizer {
public:
MHAParallelWAOptimizer() = default;
10 changes: 10 additions & 0 deletions src/common/snippets/include/snippets/utils/utils.hpp
@@ -305,6 +305,16 @@ void visit_path(const lowered::ExpressionPtr& expr,
std::function<void(lowered::ExpressionPtr)> func,
bool visit_parent_path);

/**
* @brief Checks if layout is planar
*/
inline bool is_planar_layout(const std::vector<size_t>& layout) {
for (size_t i = 0; i < layout.size(); ++i)
if (layout[i] != i)
return false;
return true;
}
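For reference, a minimal usage sketch of the helper above (illustrative only, not part of this change): the layout is planar only when it is the identity permutation.
// utils::is_planar_layout({0, 1, 2, 3});  // true: identity permutation
// utils::is_planar_layout({0, 2, 1, 3});  // false: dims 1 and 2 are swapped
// utils::is_planar_layout({});            // true: an empty layout is trivially planar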

} // namespace utils
} // namespace snippets
} // namespace ov
9 changes: 2 additions & 7 deletions src/common/snippets/src/lowered/expression.cpp
@@ -170,11 +170,6 @@ ExpressionPtr Expression::clone() const {
}

bool Expression::visit_attributes(AttributeVisitor &visitor) {
auto is_planar_layout = [](const std::vector<size_t>& layout) {
for (size_t i = 0; i < layout.size(); ++i)
if (layout[i] != i) return false;
return true;
};
auto subtensor2str = [](const VectorDims& subtensor) {
std::stringstream ss;
for (size_t i = 0; i < subtensor.size(); ++i) {
@@ -203,7 +198,7 @@ bool Expression::visit_attributes(AttributeVisitor &visitor) {
subtensors.emplace_back("in_subtensor_" + std::to_string(i), subtensor2str(subtensor));

const auto& layout = desc->get_layout();
if (!layout.empty() && !is_planar_layout(layout))
if (!layout.empty() && !utils::is_planar_layout(layout))
layouts.emplace_back("in_layout_" + std::to_string(i), layout);

in_reg_types.emplace_back(regTypeToStr(desc->get_reg().type));
@@ -220,7 +215,7 @@ bool Expression::visit_attributes(AttributeVisitor &visitor) {
subtensors.emplace_back("out_subtensor_" + std::to_string(i), subtensor2str(subtensor));

const auto& layout = desc->get_layout();
if (!layout.empty() && !is_planar_layout(layout))
if (!layout.empty() && !utils::is_planar_layout(layout))
layouts.emplace_back("out_layout_" + std::to_string(i), layout);

out_reg_types.emplace_back(regTypeToStr(desc->get_reg().type));
6 changes: 3 additions & 3 deletions src/common/snippets/src/lowered/loop_port.cpp
@@ -30,16 +30,16 @@ std::shared_ptr<LoopPort> LoopPort::clone_with_new_expr(const ExpressionPtr& new
bool operator==(const LoopPort& lhs, const LoopPort& rhs) {
if (&lhs == &rhs)
return true;
return lhs.expr_port == rhs.expr_port && lhs.is_incremented == rhs.is_incremented && lhs.dim_idx == rhs.dim_idx;
return *lhs.expr_port == *rhs.expr_port && lhs.is_incremented == rhs.is_incremented && lhs.dim_idx == rhs.dim_idx;
}

bool operator!=(const LoopPort& lhs, const LoopPort& rhs) {
return !(lhs == rhs);
}

bool operator<(const LoopPort& lhs, const LoopPort& rhs) {
return (lhs.expr_port < rhs.expr_port) ||
(lhs.expr_port == rhs.expr_port &&
return (*lhs.expr_port < *rhs.expr_port) ||
(*lhs.expr_port == *rhs.expr_port &&
(lhs.is_incremented < rhs.is_incremented ||
(lhs.is_incremented == rhs.is_incremented && lhs.dim_idx < rhs.dim_idx)));
}
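For context, a simplified sketch of the effect of dereferencing expr_port before comparison (assuming expr_port is a shared_ptr-like handle, as the new code implies; int stands in for ExpressionPort):
// auto a = std::make_shared<int>(7);
// auto b = std::make_shared<int>(7);   // distinct object, same value
// bool shallow = (a == b);             // false: compares the pointers (old behaviour)
// bool deep    = (*a == *b);           // true: compares the pointed-to values (new behaviour)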
35 changes: 20 additions & 15 deletions src/common/snippets/src/runtime_configurator.cpp
@@ -261,7 +261,7 @@ void RuntimeConfigurator::update_data_offsets(const std::vector<VectorDims>& sha
const std::vector<std::vector<size_t>>& layouts) const {
OPENVINO_ASSERT(shapes.size() == m_io_num, "Number of custom shapes must be 0 or be equal to m_io_num");
OPENVINO_ASSERT(layouts.size() == m_io_num, "Number of custom layouts must be 0 or be equal to m_io_num");
for (size_t i = 0; i < m_io_num; ++i) {
// offsets represent distance between consecutive elements of corresponding dimension.
// If a dim size == 1, then the next dim starts immediately and the stride is 0
// case 1:
@@ -271,26 +271,17 @@ void RuntimeConfigurator::update_data_offsets(const std::vector<VectorDims>& sha
// shape: s0, s1, s2 == 1, s3
// offsets: s1*s3, s3, 0, 1
const auto& shape = shapes[i];
OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!");
if (shape == m_latest_shapes[i])
continue;

const auto& layout = layouts[i];
auto& offsets = m_config->io_data_offsets[i];

offsets.resize(m_config->tensor_rank);
std::fill(offsets.begin(), offsets.end(), 0);
if (utils::is_dynamic_vdims(shape))
return;

size_t dim_step = m_io_data_sizes[i];
offsets[offsets.size() - 1] = dim_step;

OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!");
auto& offsets = m_config->io_data_offsets[i];
const auto idx_stride = m_config->tensor_rank - shape.size();
for (int i = static_cast<int>(shape.size()) - 2; i >= 0; i--) {
dim_step *= shape[i + 1];
offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0;
}
compute_offsets(shape, offsets, m_config->tensor_rank, m_io_data_sizes[i], idx_stride);

const auto& layout = layouts[i];
if (!layout.empty()) {
std::vector<size_t> reordered_offsets(offsets.size());
const auto is_input = i < m_in_num;
@@ -318,6 +309,20 @@ std::vector<std::vector<size_t>> RuntimeConfigurator::extract_layouts() const {
return layouts;
}

void RuntimeConfigurator::compute_offsets(const ov::snippets::VectorDims& shape,
ov::snippets::VectorDims& offsets,
size_t offsets_size,
size_t dim_step,
size_t idx_stride) {
offsets.resize(offsets_size);
std::fill(offsets.begin(), offsets.end(), 0);
offsets[offsets.size() - 1] = dim_step;
for (int i = static_cast<int>(shape.size()) - 2; i >= 0; i--) {
dim_step *= shape[i + 1];
offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0;
}
}
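A worked example of the helper above (hypothetical 4-byte element size, tensor_rank equal to the shape rank, so idx_stride is 0):
// shape   = {2, 3, 1, 8}
// compute_offsets(shape, offsets, /*offsets_size=*/4, /*dim_step=*/4, /*idx_stride=*/0);
// offsets = {96, 32, 0, 4}
//   innermost dim: element size 4; dim of size 1: stride 0; outer dims: 8*4 = 32 and 3*8*4 = 96
// This matches "case 2" in the update_data_offsets comment: offsets s1*s3, s3, 0, 1 (in elements).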

void RuntimeConfigurator::set_kernel_executor_table(std::shared_ptr<KernelExecutorTable> table) const {
OPENVINO_ASSERT(table, "Failed to update Kernel Executor Table: passed table is missed");
m_config->kernel_executor_table = std::move(table);
@@ -4,9 +4,14 @@

#include "emitters/snippets/cpu_runtime_configurator.hpp"

#include "memory_desc/cpu_blocked_memory_desc.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/utils/utils.hpp"

#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "transformations/snippets/x64/op/brgemm_utils.hpp"
#include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp"
namespace ov {
namespace intel_cpu {

@@ -36,10 +41,36 @@ std::string CPURuntimeConfig::to_string() const {
CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigurator(std::make_shared<CPURuntimeConfig>()) {
}

void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
RuntimeConfigurator::initialization(linear_ir);
if (linear_ir->is_dynamic()) {
loopPortsAdjuster = BrgemmCopyBLoopPortsAdjuster(linear_ir, this);
}
}

void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
RuntimeConfigurator::update(linear_ir);
update_requested_descs(linear_ir);
m_config->master_shape = linear_ir->get_master_shape();
update_loop_info(linear_ir);

if (!m_optimizer.optimize()) {
// If the optimization was not applied, offsets are updated using shapes from descriptors
auto shapes = extract_shapes();
update_data_offsets(shapes, extract_layouts());
m_latest_shapes = std::move(shapes);
}
if (linear_ir->is_dynamic())
loopPortsAdjuster.optimize();

// The KernelExecutor table should be updated before `update_buffer_scratchpad_size`
// because `ComputeAllocationSize` depends on subtensors which are updated in the table
get_kernel_executor_table()->update_state(linear_ir);
update_buffer_scratchpad_size(linear_ir);

if (linear_ir->is_dynamic()) {
update_loop_args(linear_ir);
}
adjust_offsets_from_descs(linear_ir);
}

void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) {
@@ -73,5 +104,92 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::Linea
}
}

CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
ov::intel_cpu::CPURuntimeConfigurator *configurator) {
const auto& pass = std::make_shared<intel_cpu::pass::AdjustBrgemmCopyBLoopPorts>();
pass->run(*linear_ir);
const auto& affected_uni_loops = pass->get_affected_loops();
const auto& loop_map = linear_ir->get_loop_manager()->get_map();
for (const auto& p : loop_map) {
if (const auto& exp_loop = ov::as_type_ptr<snippets::lowered::ExpandedLoopInfo>(p.second)) {
const auto& uni_loop = exp_loop->get_unified_loop_info();
if (affected_uni_loops.count(uni_loop))
m_affected_uni2exp_map[uni_loop].push_back(exp_loop);
}
}
}

void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() {
for (const auto& p : m_affected_uni2exp_map) {
const auto& uni_loop = p.first;
const auto& exp_loops = p.second;
snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info;
if (intel_cpu::pass::AdjustBrgemmCopyBLoopPorts::update_loop_info(uni_loop)) {
initialized_info[uni_loop] = get_loop_runtime_params(uni_loop);
for (const auto& exp_loop : exp_loops)
update_expanded_loop_info(exp_loop, initialized_info);
}
}
}

void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const {
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
auto& optimal_descs = cpu_config->m_in_requested_descs;
optimal_descs.resize(m_in_num);
const auto& params = linear_ir->get_parameters();
OPENVINO_ASSERT(params.size() == m_in_num);
for (size_t i = 0; i < m_in_num; ++i) {
const auto& param = params[i];
auto consumers = param->get_output_port_connector(0)->get_consumers();
const bool brgemm_with_extracted_repacking =
std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) {
auto brgemm = ov::as_type_ptr<ov::intel_cpu::BrgemmCPU>(port.get_expr()->get_node());
return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type());
});
if (brgemm_with_extracted_repacking) {
const auto& desc = param->get_output_port_descriptor(0);
const auto& shape = desc->get_shape();
const auto& K = *++shape.rbegin();
const auto& N = *shape.rbegin();

const auto& precision = param->get_node()->get_output_element_type(0);
const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision);
const auto n_block = brgemm_utils::repacking::compute_inner_n_block(precision);
// Firstly, batch dims are set
VectorDims requested_blocked_shape(shape.begin(), shape.end() - m_config->tile_rank);
// Then, the blocked dims are formed
requested_blocked_shape.insert(
requested_blocked_shape.end(),
{snippets::utils::div_up(K, vnni_factor), snippets::utils::div_up(N, n_block), n_block, vnni_factor});
// Please note: only planar layout is supported for now
const VectorDims order{0, 1, 2, 3, 3, 2};
auto cpu_desc = std::make_shared<ov::intel_cpu::CpuBlockedMemoryDesc>(precision,
Shape(shape),
requested_blocked_shape,
order);
optimal_descs[i] = MemoryDescUtils::convertToDnnlMemoryDesc(cpu_desc);
}
}
}
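To make the requested blocked descriptor concrete, a sketch with assumed values: bf16 weights, vnni_factor == 2, an inner N block of 32, and tile_rank == 2 (the real values come from brgemm_utils and the runtime config):
// input B shape           = {1, 16, 384, 64}                                    // ..., K = 384, N = 64
// batch dims              = {1, 16}                                             // shape without the last tile_rank dims
// appended blocked dims   = {div_up(384, 2), div_up(64, 32), 32, 2} = {192, 2, 32, 2}
// requested_blocked_shape = {1, 16, 192, 2, 32, 2}, order = {0, 1, 2, 3, 3, 2}
// i.e. B is requested in a VNNI-friendly [K/vnni, N/n_block, n_block, vnni] blocking per batch.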
void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const {
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
auto& optimal_descs = cpu_config->m_in_requested_descs;
for (size_t i = 0; i < m_in_num; ++i) {
const auto& optimal_desc = optimal_descs[i];
if (optimal_desc) {
// It is assumed that shape is planar
const auto& parameter = linear_ir->get_parameters()[i];
const auto& original_shape = parameter->get_output_port_descriptor(0)->get_shape();
const auto& blocked_shape = optimal_desc->as<DnnlBlockedMemoryDesc>()->getBlockDims();

ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1);
shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end());
auto& offsets = m_config->io_data_offsets[i];
compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[i], 0);
OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(parameter->get_output_port_descriptor(0)->get_layout()));
}
}
}

} // namespace intel_cpu
} // namespace ov
@@ -9,6 +9,8 @@
#include "snippets/lowered/port_descriptor.hpp"
#include "emitters/snippets/jit_snippets_call_args.hpp"

#include "memory_desc/cpu_memory_desc.h"

namespace ov {
namespace intel_cpu {

Expand All @@ -22,6 +24,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig {
#endif

std::vector<jit_snippets_call_args::loop_args_t> loop_args = {};
std::vector<MemoryDescPtr> m_in_requested_descs = {};
};

class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
@@ -44,13 +47,31 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
* @param linear_ir LinearIR
*/
void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override;
void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override;
/**
* @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig
* @param linear_ir LinearIR
*/
void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const;

void update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const;
void adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const;

static const size_t rank6D;

class BrgemmCopyBLoopPortsAdjuster {
public:
BrgemmCopyBLoopPortsAdjuster() = default;
BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
ov::intel_cpu::CPURuntimeConfigurator *configurator);

void optimize();

private:
using UniLoopInfoPtr = std::shared_ptr<snippets::lowered::UnifiedLoopInfo>;
using ExpLoopInfoPtr = std::shared_ptr<snippets::lowered::ExpandedLoopInfo>;
std::unordered_map<UniLoopInfoPtr, std::vector<ExpLoopInfoPtr>> m_affected_uni2exp_map;
} loopPortsAdjuster;
};

} // namespace intel_cpu
@@ -286,7 +286,7 @@ void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::Expression
OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config");
// In case of data repacking LDB is chosen in accordance with repacking buffer size
if (with_repacking(brgemm_node->get_type()))
LDB = brgemm_utils::repacking::compute_out_leading_dim(N, brgemm_node->get_input_element_type(1));
LDB = brgemm_utils::repacking::compute_out_leading_dim(LDB, brgemm_node->get_input_element_type(1));

config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta);
}
Expand All @@ -303,6 +303,7 @@ void BrgemmKernelExecutor::execute(const BrgemmKernelExecutor* executor, call_ar
}

cpu::x64::brgemm_kernel_params_t brgemm_p;
size_t is_with_comp = config.get_beta() == 0 && config.is_with_comp();

brgemm_p.batch = nullptr; // default value
brgemm_p.ptr_A = args->A;
@@ -311,8 +312,8 @@ void BrgemmKernelExecutor::execute(const BrgemmKernelExecutor* executor, call_ar
brgemm_p.ptr_D = args->C;
brgemm_p.ptr_buf = args->scratch;
brgemm_p.ptr_bias = nullptr;
brgemm_p.do_post_ops = static_cast<size_t>(config.is_with_comp());
brgemm_p.do_apply_comp = static_cast<size_t>(config.is_with_comp());
brgemm_p.do_post_ops = is_with_comp;
brgemm_p.do_apply_comp = is_with_comp;
brgemm_p.skip_accm = 0;
brgemm_p.BS = 1; // default value
OV_CPU_JIT_EMITTER_ASSERT(kernel->compiled_kernel, "has nullptr kernel");
6 changes: 0 additions & 6 deletions src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -17,13 +17,7 @@
#include <common/primitive_hashing_utils.hpp>
#include <shape_inference/shape_inference_pass_through.hpp>

#include "convert.h"
#include "cpu/x64/cpu_isa_traits.hpp"
#include "nodes/common/cpu_convert.h"
#include "nodes/common/cpu_memcpy.h"
#include "nodes/common/reorder_prim.h"
#include "openvino/core/parallel.hpp"
#include "shape_inference/shape_inference_pass_through.hpp"
#include "utils/precision_support.h"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/transpose_list.hpp"