Commit 3300a1c
[Snippets][WIP] Move CopyB repacking out from Subgraph
v-Golubev committed Oct 16, 2024
1 parent cdce459 commit 3300a1c
Showing 18 changed files with 240 additions and 54 deletions.
6 changes: 6 additions & 0 deletions src/common/snippets/include/snippets/runtime_configurator.hpp
@@ -157,6 +157,12 @@ class RuntimeConfigurator {
*/
std::vector<std::vector<size_t>> extract_layouts() const;

static void compute_offsets(const ov::snippets::VectorDims& shape,
ov::snippets::VectorDims& offsets,
size_t offsets_size,
size_t dim_step,
size_t idx_stride);

class MHAParallelWAOptimizer {
public:
MHAParallelWAOptimizer() = default;
10 changes: 10 additions & 0 deletions src/common/snippets/include/snippets/utils/utils.hpp
@@ -305,6 +305,16 @@ void visit_path(const lowered::ExpressionPtr& expr,
std::function<void(lowered::ExpressionPtr)> func,
bool visit_parent_path);

/**
* @brief Checks if layout is planar
*/
inline bool is_planar_layout(const std::vector<size_t>& layout) {
for (size_t i = 0; i < layout.size(); ++i)
if (layout[i] != i)
return false;
return true;
}

} // namespace utils
} // namespace snippets
} // namespace ov
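
For reference, a layout here is planar exactly when it is the identity permutation. A minimal standalone sketch of the new helper's behavior (the function body is copied from the hunk above; the main driver is illustrative):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// Same logic as the new ov::snippets::utils::is_planar_layout helper.
bool is_planar_layout(const std::vector<size_t>& layout) {
    for (size_t i = 0; i < layout.size(); ++i)
        if (layout[i] != i)
            return false;
    return true;
}

int main() {
    std::cout << is_planar_layout({0, 1, 2, 3}) << '\n';  // 1: identity permutation
    std::cout << is_planar_layout({0, 2, 3, 1}) << '\n';  // 0: transposed layout
    std::cout << is_planar_layout({}) << '\n';            // 1: empty layout is treated as planar
}
```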
9 changes: 2 additions & 7 deletions src/common/snippets/src/lowered/expression.cpp
@@ -170,11 +170,6 @@ ExpressionPtr Expression::clone() const {
}

bool Expression::visit_attributes(AttributeVisitor &visitor) {
auto is_planar_layout = [](const std::vector<size_t>& layout) {
for (size_t i = 0; i < layout.size(); ++i)
if (layout[i] != i) return false;
return true;
};
auto subtensor2str = [](const VectorDims& subtensor) {
std::stringstream ss;
for (size_t i = 0; i < subtensor.size(); ++i) {
@@ -203,7 +198,7 @@ bool Expression::visit_attributes(AttributeVisitor &visitor) {
subtensors.emplace_back("in_subtensor_" + std::to_string(i), subtensor2str(subtensor));

const auto& layout = desc->get_layout();
if (!layout.empty() && !is_planar_layout(layout))
if (!layout.empty() && !utils::is_planar_layout(layout))
layouts.emplace_back("in_layout_" + std::to_string(i), layout);

in_reg_types.emplace_back(regTypeToStr(desc->get_reg().type));
@@ -220,7 +215,7 @@ bool Expression::visit_attributes(AttributeVisitor &visitor) {
subtensors.emplace_back("out_subtensor_" + std::to_string(i), subtensor2str(subtensor));

const auto& layout = desc->get_layout();
if (!layout.empty() && !is_planar_layout(layout))
if (!layout.empty() && !utils::is_planar_layout(layout))
layouts.emplace_back("out_layout_" + std::to_string(i), layout);

out_reg_types.emplace_back(regTypeToStr(desc->get_reg().type));
34 changes: 20 additions & 14 deletions src/common/snippets/src/runtime_configurator.cpp
@@ -271,26 +271,18 @@ void RuntimeConfigurator::update_data_offsets(const std::vector<VectorDims>& sha
// shape: s0, s1, s2 == 1, s3
// offsets: s1*s3, s3, 0, 1
const auto& shape = shapes[i];
OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!");
if (shape == m_latest_shapes[i])
continue;

const auto& layout = layouts[i];
auto& offsets = m_config->io_data_offsets[i];

offsets.resize(m_config->tensor_rank);
std::fill(offsets.begin(), offsets.end(), 0);
if (utils::is_dynamic_vdims(shape))
return;

size_t dim_step = m_io_data_sizes[i];
offsets[offsets.size() - 1] = dim_step;

OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!");
auto& offsets = m_config->io_data_offsets[i];
const auto idx_stride = m_config->tensor_rank - shape.size();
for (int i = static_cast<int>(shape.size()) - 2; i >= 0; i--) {
dim_step *= shape[i + 1];
offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0;
}
compute_offsets(shape, offsets, m_config->tensor_rank, m_io_data_sizes[i], idx_stride);

std::cout << "offsets[" << i << "] = " << ov::PartialShape(offsets) << std::endl;
const auto& layout = layouts[i];
if (!layout.empty()) {
std::vector<size_t> reordered_offsets(offsets.size());
const auto is_input = i < m_in_num;
@@ -318,6 +310,20 @@ std::vector<std::vector<size_t>> RuntimeConfigurator::extract_layouts() const {
return layouts;
}

void RuntimeConfigurator::compute_offsets(const ov::snippets::VectorDims& shape,
ov::snippets::VectorDims& offsets,
size_t offsets_size,
size_t dim_step,
size_t idx_stride) {
offsets.resize(offsets_size);
std::fill(offsets.begin(), offsets.end(), 0);
offsets[offsets.size() - 1] = dim_step;
for (int i = static_cast<int>(shape.size()) - 2; i >= 0; i--) {
dim_step *= shape[i + 1];
offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0;
}
}

void RuntimeConfigurator::set_kernel_executor_table(std::shared_ptr<KernelExecutorTable> table) const {
OPENVINO_ASSERT(table, "Failed to update Kernel Executor Table: passed table is missing");
m_config->kernel_executor_table = std::move(table);
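
As a worked example of the extracted helper: for the shape from the comment above (s0=2, s1=3, s2=1, s3=4) and a 1-byte element, the resulting offsets are {s1*s3, s3, 0, 1} = {12, 4, 0, 1}, with broadcast dimensions (size 1) mapped to a zero stride. A self-contained sketch, written as a free function rather than a RuntimeConfigurator member but otherwise mirroring the logic above:

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

using VectorDims = std::vector<size_t>;

void compute_offsets(const VectorDims& shape, VectorDims& offsets,
                     size_t offsets_size, size_t dim_step, size_t idx_stride) {
    offsets.resize(offsets_size);
    std::fill(offsets.begin(), offsets.end(), 0);
    // The innermost dimension advances by one element (dim_step = element size).
    offsets[offsets.size() - 1] = dim_step;
    for (int i = static_cast<int>(shape.size()) - 2; i >= 0; i--) {
        dim_step *= shape[i + 1];
        // Dimensions of size 1 are broadcast, so their stride is zero.
        offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0;
    }
}

int main() {
    VectorDims offsets;
    compute_offsets({2, 3, 1, 4}, offsets, 4, 1, 0);
    for (size_t o : offsets)
        std::cout << o << ' ';  // 12 4 0 1 == s1*s3, s3, 0, 1
}
```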
@@ -4,7 +4,9 @@

#include "emitters/snippets/cpu_runtime_configurator.hpp"

#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "snippets/lowered/loop_manager.hpp"
#include "transformations/snippets/x64/op/brgemm_cpu.hpp"
#include "snippets/utils/utils.hpp"

#include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp"
@@ -48,6 +50,7 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI
}

void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) {
update_requested_descs(linear_ir);
m_config->master_shape = linear_ir->get_master_shape();
update_loop_info(linear_ir);

@@ -68,6 +71,7 @@
if (linear_ir->is_dynamic()) {
update_loop_args(linear_ir);
}
adjust_offsets_from_descs(linear_ir);
}

void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) {
@@ -143,5 +147,59 @@ void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() {
}
}

void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const {
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
auto& optimal_descs = cpu_config->m_in_requested_descs;
optimal_descs.resize(m_in_num);
const auto& params = linear_ir->get_parameters();
OPENVINO_ASSERT(params.size() == m_in_num);
for (size_t i = 0; i < m_in_num; ++i) {
// TODO: remove
if (i != 1) continue;
const auto& param = params[i];
const auto consumers = param->get_output_port_connector(0)->get_consumers();
OPENVINO_ASSERT(consumers.size() == 1);
const auto& consumer = consumers.begin()->get_expr();
// TODO: this logic should be more flexible
if (ov::is_type<ov::intel_cpu::BrgemmCPU>(consumer->get_node())) {
const auto& shape = param->get_output_port_descriptor(0)->get_shape();
VectorDims normalized_dims(3, 1);
*normalized_dims.rbegin() = *shape.rbegin();
*++normalized_dims.rbegin() = *++shape.rbegin();
normalized_dims[0] = std::accumulate(shape.begin(), shape.end() - 2, static_cast<Dim>(1), std::multiplies<Dim>());

const auto data_type = DnnlExtensionUtils::ElementTypeToDataType(param->get_node()->get_output_element_type(0));
// TODO: tag must be selected based on Brgemm params (inner block + vnni factor?)
const auto tag = dnnl::memory::format_tag::aCB16b64c2b;
optimal_descs[i] = std::make_shared<DnnlBlockedMemoryDesc>(Shape(normalized_dims), data_type, tag);
}
}
}
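
The shape normalization above can be shown in isolation: every dimension in front of the trailing K and N axes is folded into a single batch dimension, so a weight tensor of any rank becomes {batch, K, N} before the blocked descriptor is built. A standalone sketch (the helper name normalize_to_3d is hypothetical; an input rank of at least 2 is assumed):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

using Dim = size_t;
using VectorDims = std::vector<Dim>;

// Fold all leading dims into one batch dim; keep the trailing K and N as-is.
VectorDims normalize_to_3d(const VectorDims& shape) {
    VectorDims normalized_dims(3, 1);
    *normalized_dims.rbegin() = *shape.rbegin();      // N
    *++normalized_dims.rbegin() = *++shape.rbegin();  // K
    normalized_dims[0] = std::accumulate(shape.begin(), shape.end() - 2,
                                         static_cast<Dim>(1), std::multiplies<Dim>());
    return normalized_dims;
}

int main() {
    for (Dim d : normalize_to_3d({2, 5, 64, 32}))
        std::cout << d << ' ';  // 10 64 32
}
```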
void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const {
const auto& cpu_config = ov::as_type_ptr<CPURuntimeConfig>(m_config);
auto& optimal_descs = cpu_config->m_in_requested_descs;
for (size_t i = 0; i < m_in_num; ++i) {
const auto& optimal_desc = optimal_descs[i];
if (optimal_desc) {
// It is assumed that shape is planar
const auto& parameter = linear_ir->get_parameters()[i];
const auto& original_shape = parameter->get_output_port_descriptor(0)->get_shape();
const auto& blocked_shape = optimal_desc->as<DnnlBlockedMemoryDesc>()->getBlockDims();

ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1);
// Parallel work amount is copied from original shape
shape_for_offset.insert(shape_for_offset.end(), original_shape.begin(), original_shape.end() - m_config->tile_rank);
// Only first dim is batch, the rest are repacked KN
shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin() + 1, blocked_shape.end());
std::cout << "shape_for_offset = " << ov::PartialShape(shape_for_offset) << std::endl;

auto& offsets = m_config->io_data_offsets[i];
compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[i], 0);
std::cout << "offsets[*] = " << ov::PartialShape(offsets) << std::endl;
OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(parameter->get_output_port_descriptor(0)->get_layout()));
}
}
}

} // namespace intel_cpu
} // namespace ov
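
To unpack adjust_offsets_from_descs: the shape fed to compute_offsets is stitched from three pieces: leading ones padding up to the kernel's tensor rank, the parallel batch part of the original planar shape, and the dims of the blocked descriptor minus its batch dim. A sketch with purely illustrative values (make_shape_for_offset is a hypothetical stand-in for the inlined logic above):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

using VectorDims = std::vector<size_t>;

VectorDims make_shape_for_offset(const VectorDims& original_shape,
                                 const VectorDims& blocked_shape,
                                 size_t tensor_rank, size_t tile_rank) {
    // Leading 1s pad the shape up to the kernel's tensor rank.
    VectorDims shape_for_offset(tensor_rank - original_shape.size(), 1);
    // Parallel (batch) dims are copied from the original planar shape.
    shape_for_offset.insert(shape_for_offset.end(),
                            original_shape.begin(), original_shape.end() - tile_rank);
    // Only the first blocked dim is batch; the rest describe the repacked KN tile.
    shape_for_offset.insert(shape_for_offset.end(),
                            blocked_shape.begin() + 1, blocked_shape.end());
    return shape_for_offset;
}

int main() {
    // Illustrative only: original {2, 64, 32}, blocked {2, 1, 1, 16, 64, 2},
    // tensor_rank = 6, tile_rank = 2 (as for Brgemm).
    for (size_t d : make_shape_for_offset({2, 64, 32}, {2, 1, 1, 16, 64, 2}, 6, 2))
        std::cout << d << ' ';
}
```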
@@ -9,6 +9,8 @@
#include "snippets/lowered/port_descriptor.hpp"
#include "emitters/snippets/jit_snippets_call_args.hpp"

#include "memory_desc/cpu_memory_desc.h"

namespace ov {
namespace intel_cpu {

@@ -22,6 +24,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig {
#endif

std::vector<jit_snippets_call_args::loop_args_t> loop_args = {};
std::vector<MemoryDescPtr> m_in_requested_descs = {};
};

class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
@@ -51,6 +54,9 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator {
*/
void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const;

void update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const;
void adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const;

static const size_t rank6D;

class BrgemmCopyBLoopPortsAdjuster {
@@ -315,11 +315,7 @@ void BrgemmCopyBKernelExecutor::execute(const BrgemmCopyBKernelExecutor* executo
auto kernel = executor->get_kernel();
OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr kernel");
OV_CPU_JIT_EMITTER_ASSERT(args, "has nullptr call args");
auto s = args->src;
auto d = args->tr_src;
std::cerr << s << " : " << d << "\n";
(*kernel)(args);
std::cerr << "\n";
}

} // namespace intel_cpu
6 changes: 0 additions & 6 deletions src/plugins/intel_cpu/src/nodes/reorder.cpp
@@ -17,13 +17,7 @@
#include <common/primitive_hashing_utils.hpp>
#include <shape_inference/shape_inference_pass_through.hpp>

#include "convert.h"
#include "cpu/x64/cpu_isa_traits.hpp"
#include "nodes/common/cpu_convert.h"
#include "nodes/common/cpu_memcpy.h"
#include "nodes/common/reorder_prim.h"
#include "openvino/core/parallel.hpp"
#include "shape_inference/shape_inference_pass_through.hpp"
#include "utils/precision_support.h"
#include "nodes/executors/executor.hpp"
#include "nodes/executors/transpose_list.hpp"
32 changes: 29 additions & 3 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -3,6 +3,10 @@
//
#include "subgraph.h"

#include "nodes/reorder.h"
#include "nodes/common/reorder_prim.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "common/primitive_hashing_utils.hpp"
#include "dnnl_extension_utils.h"
#include "onednn/dnnl.h"
@@ -35,6 +39,7 @@
#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp"
#include "transformations/snippets/x64/pass/remove_converts.hpp"
#include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp"
#include "transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp"
#include "transformations/snippets/x64/pass/enforce_precision.hpp"
#include "transformations/snippets/x64/shape_inference.hpp"
#include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp"
@@ -79,7 +84,7 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor {
const BufferScratchpadAllocator& allocator)
: SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {}

void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
const auto& callable = m_schedule->get_callable<kernel>();

auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) {
@@ -127,7 +132,7 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor {
reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset();
}

void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) override {
const auto& callable = m_schedule->get_callable<dynamic_kernel>();

OPENVINO_ASSERT(data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!");
@@ -648,6 +653,9 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() {
}
SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::Before, ov::snippets::pass::PropagatePrecision,
ov::intel_cpu::pass::BrgemmToBrgemmCPU);
if (!std::getenv("REFERENCE"))
SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU,
ov::intel_cpu::pass::MoveBrgemmRepackingOut);
SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(Place::PipelineEnd, ov::intel_cpu::pass::RemoveConverts);
SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineEnd, ov::intel_cpu::pass::MulAddToFMA);

@@ -846,7 +854,7 @@ bool Subgraph::created() const {

void Subgraph::execute(dnnl::stream strm) {
OPENVINO_ASSERT(execPtr, "Can't execute Subgraph node. Primitive wasn't created");
execPtr->exec(srcMemPtrs, dstMemPtrs);
execPtr->execute(strm, srcMemPtrs, dstMemPtrs);
}

void Subgraph::executeDynamicImpl(dnnl::stream strm) {
@@ -895,6 +903,8 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<Subgraph::Sub
OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!");
m_buffer_scratchpad = allocator(static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size);

m_in_requested_descs = snippet_config->m_in_requested_descs;

#if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
const auto target = std::dynamic_pointer_cast<const CPUTargetMachine>(snippet_attrs->snippet->get_generator()->get_target_machine());
enabled_segfault_detector = target && target->debug_config.enable_segfault_detector;
@@ -970,6 +980,22 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function<void(jit_sni
});
}

void Subgraph::SubgraphExecutor::execute(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs, std::vector<MemoryPtr>& outMemPtrs) {
repack_inputs(strm, inMemPtrs);
exec_impl(inMemPtrs, outMemPtrs);
}

void Subgraph::SubgraphExecutor::repack_inputs(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs) {
OPENVINO_ASSERT(inMemPtrs.size() == m_in_requested_descs.size());
for (size_t i = 0; i < m_in_requested_descs.size(); ++i) {
if (m_in_requested_descs[i]) {
auto repacked_memory = std::make_shared<Memory>(strm.get_engine(), m_in_requested_descs[i]);
repacked_memory->load(*inMemPtrs[i]);
inMemPtrs[i] = repacked_memory;
}
}
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
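
The executor split introduced here follows the non-virtual-interface pattern: the public execute() repacks any input whose requested descriptor is set and then forwards the (possibly substituted) memory to the protected virtual exec_impl(). A toy self-contained analogue, where plain structs stand in for the plugin's Memory/MemoryDescPtr and Memory::load() is modeled as a simple copy:

```cpp
#include <iostream>
#include <memory>
#include <optional>
#include <string>
#include <vector>

struct Buffer {
    std::string layout;
    std::vector<float> data;
};
using BufferPtr = std::shared_ptr<Buffer>;

class Executor {
public:
    explicit Executor(std::vector<std::optional<std::string>> requested)
        : m_requested(std::move(requested)) {}
    virtual ~Executor() = default;

    // Non-virtual entry point: repack first, then run the kernel.
    void execute(std::vector<BufferPtr>& ins) {
        repack_inputs(ins);
        exec_impl(ins);
    }

protected:
    virtual void exec_impl(const std::vector<BufferPtr>& ins) = 0;

private:
    void repack_inputs(std::vector<BufferPtr>& ins) {
        for (size_t i = 0; i < m_requested.size(); ++i) {
            if (!m_requested[i] || ins[i]->layout == *m_requested[i])
                continue;
            // Stand-in for Memory::load(): copy into a buffer with the
            // requested layout and substitute it for the original input.
            ins[i] = std::make_shared<Buffer>(Buffer{*m_requested[i], ins[i]->data});
        }
    }
    std::vector<std::optional<std::string>> m_requested;
};

struct PrintExecutor : Executor {
    using Executor::Executor;
    void exec_impl(const std::vector<BufferPtr>& ins) override {
        for (const auto& in : ins)
            std::cout << in->layout << '\n';
    }
};

int main() {
    std::vector<BufferPtr> ins{std::make_shared<Buffer>(Buffer{"planar", {1, 2}}),
                               std::make_shared<Buffer>(Buffer{"planar", {3, 4}})};
    PrintExecutor ex({std::nullopt, std::string("blocked16x64")});
    ex.execute(ins);  // prints: planar, then blocked16x64
}
```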
9 changes: 8 additions & 1 deletion src/plugins/intel_cpu/src/nodes/subgraph.h
@@ -129,9 +129,11 @@ class Subgraph::SubgraphExecutor {
const BufferScratchpadAllocator& allocator);
virtual ~SubgraphExecutor() = default;

virtual void exec(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;
void execute(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs, std::vector<MemoryPtr>& outMemPtrs);

protected:
virtual void exec_impl(const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) = 0;

void parallel_for6d(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
const std::function<void(jit_snippets_call_args&, const size_t*)>& caller);
void parallel_forNd(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
@@ -164,6 +166,11 @@
bool enabled_segfault_detector = false;
inline void segfault_detector();
#endif

private:
void repack_inputs(dnnl::stream strm, std::vector<MemoryPtr>& inMemPtrs);

std::vector<MemoryDescPtr> m_in_requested_descs = {};
};

} // namespace node
@@ -68,13 +68,10 @@ void BrgemmCPU::custom_constructor_validate_and_infer_types(std::vector<size_t>
INTERNAL_OP_SCOPE(BrgemmCPU_constructor_validate_and_infer_types);
validate_inputs();

// During the ctor call, BrgemmCPU doesn't know its port descriptors,
// so we use port descs from source inputs
const auto brgemm_copy = with_repacking(m_type) ? get_brgemm_copy() : nullptr;
// This shape inference can use get_input_partial_shape(1) in all cases
const auto planar_input_shapes =
std::vector<ov::PartialShape>{ snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_a),
brgemm_copy ? snippets::utils::get_planar_pshape(brgemm_copy->input(0))
: snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) };
snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) };
auto output_shape = infer_output_partial_shape(planar_input_shapes);
set_output_type(0, get_output_type(), snippets::utils::get_planar_pshape(output_shape, layout_c));

@@ -141,6 +138,8 @@ std::shared_ptr<BrgemmCopyB> BrgemmCPU::get_brgemm_copy() const {
return brgemm_copy_b;
}
}
std::cout << "[ INFO ] get_brgemm_copy didn't find copy_B\n";
return nullptr;
OPENVINO_THROW("BrgemmCopyB hasn't been found!");
}

@@ -18,7 +18,7 @@ enum class BRGEMM_TYPE {
STAND_ALONE, // No extra requirements, used for f32|f32
WITH_AMX, // i8|i8 or bf16|bf16 on AMX system - needs BrgemmCopyB and scratchpad
WITH_COMPENSATIONS, // i8|i8 (non-AMX system) - needs BrgemmCopyB for data repacking and compensations
REPACKING_ONLY // u8|i8 or bf16|bf16 (non-AMX system) - needs BrgemmCopyB on second input for data repacking
REPACKING_ONLY, // low precision or some specific f32 cases - needs BrgemmCopyB on second input for data repacking
};

dnnl::impl::cpu::x64::cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx);