diff --git a/src/common/transformations/include/ov_ops/fully_connected.hpp b/src/common/transformations/include/ov_ops/fully_connected.hpp
new file mode 100644
index 00000000000000..a71abcd42d54b3
--- /dev/null
+++ b/src/common/transformations/include/ov_ops/fully_connected.hpp
@@ -0,0 +1,44 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/core/node.hpp"
+#include "openvino/op/op.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+class TRANSFORMATIONS_API FullyConnected : public ov::op::Op {
+public:
+    OPENVINO_OP("FullyConnected", "ie_internal_opset");
+
+    FullyConnected() = default;
+
+    FullyConnected(const ov::Output<Node>& A,
+                   const ov::Output<Node>& B,
+                   const ov::Output<Node>& bias,
+                   const ov::element::Type output_type = ov::element::undefined);
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override;
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
+
+    virtual std::shared_ptr<Node> fuse_bias(const ov::Output<Node>& bias) const;
+
+    ov::element::Type get_output_type() const {
+        return m_output_type;
+    }
+
+protected:
+    ov::element::Type m_output_type;
+};
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
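Editorial note: a minimal sketch (not part of the patch) of how this internal op is meant to be built. The bias input is positional and mandatory, so a caller without a real bias is expected to pass a dummy input such as the Placeholder op introduced below; names and shapes here are illustrative only.

```cpp
// Editorial sketch: constructing the internal FullyConnected op directly.
#include "openvino/op/parameter.hpp"
#include "ov_ops/fully_connected.hpp"
#include "ov_ops/placeholder.hpp"

std::shared_ptr<ov::op::internal::FullyConnected> make_fc_example() {
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 128});
    // Weights are expected in [O, I] layout: shape inference mimics MatMul with transpose_b=true.
    auto weights = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{256, 128});
    auto bias = std::make_shared<ov::op::internal::Placeholder>();  // no real bias available
    return std::make_shared<ov::op::internal::FullyConnected>(data, weights, bias, ov::element::undefined);
}
```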
diff --git a/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp
new file mode 100644
index 00000000000000..9b93f6501056b4
--- /dev/null
+++ b/src/common/transformations/include/ov_ops/fully_connected_quantized.hpp
@@ -0,0 +1,77 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/core/node.hpp"
+#include "openvino/op/op.hpp"
+#include "ov_ops/fully_connected.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+class TRANSFORMATIONS_API FullyConnectedQuantized : public ov::op::internal::FullyConnected {
+public:
+    OPENVINO_OP("FullyConnectedQuantized", "gpu_opset");
+
+    FullyConnectedQuantized() = default;
+
+    FullyConnectedQuantized(const ov::Output<Node>& X,
+                            const ov::Output<Node>& W,
+                            const ov::Output<Node>& bias,
+                            const ov::Output<Node>& weight_scales,
+                            const ov::Output<Node>& weight_zero_points,
+                            const ov::Output<Node>& input_scales,
+                            const ov::Output<Node>& input_zero_points,
+                            const ov::Output<Node>& output_scales,
+                            const ov::Output<Node>& output_zero_points,
+                            const ov::element::Type output_type = ov::element::undefined);
+
+    FullyConnectedQuantized(const ov::Output<Node>& X,
+                            const ov::Output<Node>& W,
+                            const ov::Output<Node>& bias,
+                            const ov::Output<Node>& weight_scales,
+                            const ov::Output<Node>& weight_zero_points,
+                            const ov::Output<Node>& input_scales,
+                            const ov::element::Type output_type = ov::element::undefined);
+
+    FullyConnectedQuantized(const ov::Output<Node>& X,
+                            const ov::Output<Node>& W,
+                            const ov::Output<Node>& bias,
+                            const ov::Output<Node>& weight_scales,
+                            const ov::Output<Node>& weight_zero_points,
+                            const ov::element::Type output_type = ov::element::undefined);
+
+    FullyConnectedQuantized(const ov::Output<Node>& X,
+                            const ov::Output<Node>& W,
+                            const ov::Output<Node>& bias,
+                            const ov::Output<Node>& weight_scales,
+                            const ov::element::Type output_type = ov::element::undefined);
+
+    // FullyConnectedQuantized(const ov::Output<Node>& X,
+    //                         const ov::Output<Node>& W,
+    //                         const ov::Output<Node>& bias,
+    //                         const ov::Output<Node>& weight_scales,
+    //                         const ov::Output<Node>& weight_zero_points,
+    //                         const ov::Output<Node>& input_scales,
+    //                         const ov::Output<Node>& input_zero_points,
+    //                         const ov::element::Type output_type = ov::element::undefined);
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override;
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
+
+    std::shared_ptr<Node> fuse_bias(const ov::Output<Node>& bias) const override final;
+
+    ov::element::Type get_output_type() const {
+        return m_output_type;
+    }
+};
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
diff --git a/src/common/transformations/include/ov_ops/placeholder.hpp b/src/common/transformations/include/ov_ops/placeholder.hpp
new file mode 100644
index 00000000000000..6a5c1e236389ce
--- /dev/null
+++ b/src/common/transformations/include/ov_ops/placeholder.hpp
@@ -0,0 +1,27 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/op/op.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+class TRANSFORMATIONS_API Placeholder : public ov::op::Op {
+public:
+    OPENVINO_OP("Placeholder", "ie_internal_opset");
+
+    Placeholder();
+
+    bool visit_attributes(ov::AttributeVisitor& visitor) override;
+    void validate_and_infer_types() override;
+    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
+};
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp
new file mode 100644
index 00000000000000..8e2aa617f29dcd
--- /dev/null
+++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_compressed.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/graph_rewrite.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace pass {
+
+class TRANSFORMATIONS_API ConvertFullyConnectedToFullyConnectedCompressed;
+
+}  // namespace pass
+}  // namespace ov
+
+class ov::pass::ConvertFullyConnectedToFullyConnectedCompressed : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedCompressed", "0");
+    ConvertFullyConnectedToFullyConnectedCompressed(bool convert_u4zp_to_u8 = false);
+};
diff --git a/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized.hpp b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized.hpp
new file mode 100644
index 00000000000000..9107f7333fc4df
--- /dev/null
+++ b/src/common/transformations/include/transformations/op_conversions/convert_fc_to_quantized.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/graph_rewrite.hpp"
+#include "transformations_visibility.hpp"
+
+namespace ov {
+namespace pass {
+
+class TRANSFORMATIONS_API ConvertFullyConnectedToFullyConnectedQuantized;
+
+}  // namespace pass
+}  // namespace ov
+
+class ov::pass::ConvertFullyConnectedToFullyConnectedQuantized : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("ConvertFullyConnectedToFullyConnectedQuantized", "0");
+    ConvertFullyConnectedToFullyConnectedQuantized();
+};
diff --git a/src/common/transformations/src/ov_ops/fully_connected.cpp b/src/common/transformations/src/ov_ops/fully_connected.cpp
new file mode 100644
index 00000000000000..1938e2becac890
--- /dev/null
+++ b/src/common/transformations/src/ov_ops/fully_connected.cpp
@@ -0,0 +1,62 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ov_ops/fully_connected.hpp"
+
+#include <memory>
+
+#include "matmul_shape_inference.hpp"
+#include "ov_ops/placeholder.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+FullyConnected::FullyConnected(const ov::Output<Node>& A,
+                               const ov::Output<Node>& B,
+                               const ov::Output<Node>& bias,
+                               const ov::element::Type output_type)
+    : Op({A, B, bias}),
+      m_output_type(output_type) {
+    validate_and_infer_types();
+}
+
+std::shared_ptr<ov::Node> FullyConnected::clone_with_new_inputs(const ov::OutputVector& new_args) const {
+    check_new_args_count(this, new_args);
+
+    return std::make_shared<FullyConnected>(new_args.at(0), new_args.at(1), new_args.at(2), m_output_type);
+}
+
+std::shared_ptr<ov::Node> FullyConnected::fuse_bias(const ov::Output<Node>& bias) const {
+    return std::make_shared<FullyConnected>(input_value(0), input_value(1), bias, m_output_type);
+}
+
+void FullyConnected::validate_and_infer_types() {
+    const auto input_size = get_input_size();
+    NODE_VALIDATION_CHECK(this,
+                          input_size >= 3,
+                          "Number of inputs is incorrect. Current value is: ",
+                          input_size,
+                          ", expected at least 3.");
+
+    ov::op::v0::MatMul op;
+    op.set_transpose_a(false);
+    op.set_transpose_b(true);
+
+    auto out_shapes =
+        ov::op::v0::shape_infer(&op,
+                                std::vector<ov::PartialShape>{get_input_partial_shape(0), get_input_partial_shape(1)});
+
+    auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type;
+    set_output_type(0, output_type, out_shapes[0]);
+}
+
+bool FullyConnected::visit_attributes(ov::AttributeVisitor& visitor) {
+    visitor.on_attribute("output_type", m_output_type);
+    return true;
+}
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
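Editorial note: a quick check (not part of the patch) of the shape semantics implemented above. Since `validate_and_infer_types` delegates to MatMul shape inference with `transpose_b=true`, an `[M, K]` activation and `[N, K]` weight yield an `[M, N]` output, and an undefined `output_type` falls back to the element type of input 0. `make_fc_example` is the hypothetical helper from the earlier sketch.

```cpp
// Editorial sketch: A = [1, 128], B = [256, 128]  ->  output [1, 256].
auto fc = make_fc_example();  // hypothetical helper defined in the sketch above
OPENVINO_ASSERT(fc->get_output_shape(0) == (ov::Shape{1, 256}));
OPENVINO_ASSERT(fc->get_output_element_type(0) == ov::element::f32);  // undefined output_type -> input 0 type
```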
diff --git a/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp
new file mode 100644
index 00000000000000..b632ac8f113d0c
--- /dev/null
+++ b/src/common/transformations/src/ov_ops/fully_connected_quantized.cpp
@@ -0,0 +1,150 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ov_ops/fully_connected_quantized.hpp"
+
+#include "matmul_shape_inference.hpp"
+#include "ov_ops/placeholder.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+FullyConnectedQuantized::FullyConnectedQuantized(const ov::Output<Node>& X,
+                                                 const ov::Output<Node>& W,
+                                                 const ov::Output<Node>& bias,
+                                                 const ov::Output<Node>& weight_scales,
+                                                 const ov::Output<Node>& weight_zero_points,
+                                                 const ov::Output<Node>& input_scales,
+                                                 const ov::Output<Node>& input_zero_points,
+                                                 const ov::Output<Node>& output_scales,
+                                                 const ov::Output<Node>& output_zero_points,
+                                                 const ov::element::Type output_type)
+    : FullyConnected(X, W, bias, output_type) {
+    set_argument(3, weight_scales);
+    set_argument(4, weight_zero_points);
+    set_argument(5, input_scales);
+    set_argument(6, input_zero_points);
+    set_argument(7, output_scales);
+    set_argument(8, output_zero_points);
+    validate_and_infer_types();
+}
+
+FullyConnectedQuantized::FullyConnectedQuantized(const ov::Output<Node>& X,
+                                                 const ov::Output<Node>& W,
+                                                 const ov::Output<Node>& bias,
+                                                 const ov::Output<Node>& weight_scales,
+                                                 const ov::Output<Node>& weight_zero_points,
+                                                 const ov::Output<Node>& input_scales,
+                                                 const ov::element::Type output_type)
+    : FullyConnected(X, W, bias, output_type) {
+    set_argument(3, weight_scales);
+    set_argument(4, weight_zero_points);
+    set_argument(5, input_scales);
+    validate_and_infer_types();
+}
+
+FullyConnectedQuantized::FullyConnectedQuantized(const ov::Output<Node>& X,
+                                                 const ov::Output<Node>& W,
+                                                 const ov::Output<Node>& bias,
+                                                 const ov::Output<Node>& weight_scales,
+                                                 const ov::Output<Node>& weight_zero_points,
+                                                 const ov::element::Type output_type)
+    : FullyConnected(X, W, bias, output_type) {
+    set_argument(3, weight_scales);
+    set_argument(4, weight_zero_points);
+}
+
+FullyConnectedQuantized::FullyConnectedQuantized(const ov::Output<Node>& X,
+                                                 const ov::Output<Node>& W,
+                                                 const ov::Output<Node>& bias,
+                                                 const ov::Output<Node>& weight_scales,
+                                                 const ov::element::Type output_type)
+    : FullyConnected(X, W, bias, output_type) {
+    set_argument(3, weight_scales);
+}
+
+std::shared_ptr<ov::Node> FullyConnectedQuantized::clone_with_new_inputs(const ov::OutputVector& new_args) const {
+    check_new_args_count(this, new_args);
+
+    return std::make_shared<FullyConnectedQuantized>(new_args.at(0),
+                                                     new_args.at(1),
+                                                     new_args.at(2),
+                                                     new_args.at(3),
+                                                     new_args.at(4),
+                                                     new_args.at(5),
+                                                     new_args.at(6),
+                                                     new_args.at(7),
+                                                     new_args.at(8),
+                                                     m_output_type);
+}
+
+std::shared_ptr<ov::Node> FullyConnectedQuantized::fuse_bias(const ov::Output<Node>& bias) const {
+    switch (get_input_size()) {
+    case 9:
+        return std::make_shared<FullyConnectedQuantized>(input_value(0),
+                                                         input_value(1),
+                                                         bias,
+                                                         input_value(3),
+                                                         input_value(4),
+                                                         input_value(5),
+                                                         input_value(6),
+                                                         input_value(7),
+                                                         input_value(8),
+                                                         get_output_type());
+    case 6:
+        return std::make_shared<FullyConnectedQuantized>(input_value(0),
+                                                         input_value(1),
+                                                         bias,
+                                                         input_value(3),
+                                                         input_value(4),
+                                                         input_value(5),
+                                                         get_output_type());
+    case 5:
+        return std::make_shared<FullyConnectedQuantized>(input_value(0),
+                                                         input_value(1),
+                                                         bias,
+                                                         input_value(3),
+                                                         input_value(4),
+                                                         get_output_type());
+    case 4:
+        return std::make_shared<FullyConnectedQuantized>(input_value(0),
+                                                         input_value(1),
+                                                         bias,
+                                                         input_value(3),
+                                                         get_output_type());
+    }
+
+    OPENVINO_THROW("Unsupported number of inputs: ", get_input_size());
+}
+
+// @todo finalize validate_and_infer_types
+void FullyConnectedQuantized::validate_and_infer_types() {
+    const auto input_size = get_input_size();
+    NODE_VALIDATION_CHECK(this,
+                          input_size >= 3,
+                          "Number of inputs is incorrect. Current value is: ",
+                          input_size,
+                          ", expected at least 3.");
+
+    ov::op::v0::MatMul op;
+    op.set_transpose_a(false);
+    op.set_transpose_b(true);
+
+    auto out_shapes =
+        ov::op::v0::shape_infer(&op,
+                                std::vector<ov::PartialShape>{get_input_partial_shape(0), get_input_partial_shape(1)});
+
+    auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type;
+    set_output_type(0, output_type, out_shapes[0]);
+}
+
+bool FullyConnectedQuantized::visit_attributes(ov::AttributeVisitor& visitor) {
+    visitor.on_attribute("output_type", m_output_type);
+    return true;
+}
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
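Editorial note: the quantization inputs are purely positional, so a caller that only has some of them fills the remaining slots with Placeholder, exactly as ConvertFullyConnectedToFullyConnectedQuantized does further down. A hypothetical helper as a sketch:

```cpp
// Editorial sketch (not part of the patch): 6-input form with Placeholder for
// the quantization parameters the caller does not have.
#include "ov_ops/fully_connected_quantized.hpp"
#include "ov_ops/placeholder.hpp"

std::shared_ptr<ov::Node> make_fcq_example(const ov::Output<ov::Node>& x,
                                           const ov::Output<ov::Node>& w,
                                           const ov::Output<ov::Node>& weight_scales) {
    auto ph = [] { return std::make_shared<ov::op::internal::Placeholder>(); };
    return std::make_shared<ov::op::internal::FullyConnectedQuantized>(
        x, w, /*bias=*/ph(), weight_scales, /*weight_zero_points=*/ph(), /*input_scales=*/ph(), ov::element::f32);
}
```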
diff --git a/src/common/transformations/src/ov_ops/placeholder.cpp b/src/common/transformations/src/ov_ops/placeholder.cpp
new file mode 100644
index 00000000000000..17e4c3224a4ad6
--- /dev/null
+++ b/src/common/transformations/src/ov_ops/placeholder.cpp
@@ -0,0 +1,34 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ov_ops/placeholder.hpp"
+
+#include "transformations/rt_info/fused_names_attribute.hpp"
+
+namespace ov {
+namespace op {
+namespace internal {
+
+Placeholder::Placeholder() : ov::op::Op() {
+    validate_and_infer_types();
+    // set_friendly_name(get_name());
+    // get_rt_info().emplace(FusedNames::get_type_info_static(), FusedNames{get_friendly_name()});
+}
+
+bool Placeholder::visit_attributes(ov::AttributeVisitor& visitor) {
+    return true;
+}
+
+void Placeholder::validate_and_infer_types() {
+    set_output_type(0, ov::element::undefined, ov::PartialShape{});
+}
+
+std::shared_ptr<ov::Node> Placeholder::clone_with_new_inputs(const ov::OutputVector& new_args) const {
+    check_new_args_count(this, new_args);
+    return std::make_shared<Placeholder>();
+}
+
+}  // namespace internal
+}  // namespace op
+}  // namespace ov
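Editorial note: Placeholder is a zero-input op whose single output has undefined element type and an empty shape; this is exactly what the edge.cpp and ir_deserializer.cpp changes later in this diff test for to recognize "no data" inputs. A minimal sketch:

```cpp
// Editorial sketch (not part of the patch): Placeholder output is typeless and rank-0.
auto ph = std::make_shared<ov::op::internal::Placeholder>();
OPENVINO_ASSERT(ph->get_output_element_type(0) == ov::element::undefined);
OPENVINO_ASSERT(ph->get_output_partial_shape(0).rank().get_length() == 0);
```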
diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp
new file mode 100644
index 00000000000000..eb9653b4bd62fa
--- /dev/null
+++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_compressed.cpp
@@ -0,0 +1,191 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "transformations/op_conversions/convert_fc_to_compressed.hpp"
+
+#include <memory>
+
+#include "openvino/core/rt_info.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/convert.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/op/reshape.hpp"
+#include "openvino/op/subtract.hpp"
+#include "openvino/op/transpose.hpp"
+#include "openvino/pass/pattern/op/or.hpp"
+#include "openvino/pass/pattern/op/pattern.hpp"
+#include "openvino/pass/pattern/op/wrap_type.hpp"
+#include "ov_ops/fully_connected.hpp"
+#include "ov_ops/fully_connected_quantized.hpp"
+#include "transformations/utils/utils.hpp"
+
+ov::pass::ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyConnectedCompressed(
+    bool convert_u4zp_to_u8) {
+    using namespace ov::pass::pattern;
+
+    auto compressed_constant = [](const ov::Output<ov::Node>& output) {
+        return (output.get_element_type() == ov::element::u8 || output.get_element_type() == ov::element::i8 ||
+                output.get_element_type() == ov::element::u4 || output.get_element_type() == ov::element::i4 ||
+                output.get_element_type() == ov::element::nf4 || output.get_element_type() == ov::element::f4e2m1);
+        // output.get_target_inputs().size() == 1;
+        // output.get_target_inputs().size() > 0;
+    };
+
+    auto reshape_3d_to_2d = [](const ov::Output<ov::Node>& output) {
+        auto in_ps = output.get_node()->get_input_partial_shape(0);
+        auto out_ps = output.get_node()->get_output_partial_shape(0);
+        return in_ps.rank().is_static() && out_ps.rank().is_static() && in_ps.size() == 3 && out_ps.size() == 2;
+    };
+
+    auto weights_m = wrap_type<ov::op::v0::Constant>(compressed_constant);
+    auto convert_m = wrap_type<ov::op::v0::Convert>({weights_m});
+
+    // auto sub_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
+    auto sub_const_m = wrap_type<ov::op::v0::Constant>();
+    auto sub_convert_const_m = wrap_type<ov::op::v0::Convert>({sub_const_m});
+    auto sub_with_convert_m = wrap_type<ov::op::v1::Subtract>({convert_m, sub_convert_const_m});
+    auto sub_no_convert_m = wrap_type<ov::op::v1::Subtract>({convert_m, sub_const_m});
+    auto subtract_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{sub_with_convert_m, sub_no_convert_m});
+
+    // auto mul_const_m = wrap_type<ov::op::v0::Constant>(consumers_count(1));
+    auto mul_const_m = wrap_type<ov::op::v0::Constant>();
+    auto mul_convert_const_m = wrap_type<ov::op::v0::Convert>({mul_const_m});
+    auto mul_scale_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{mul_const_m, mul_convert_const_m});
+
+    auto mul_with_sub_m = wrap_type<ov::op::v1::Multiply>({subtract_m, mul_scale_m});
+    auto mul_no_sub_m = wrap_type<ov::op::v1::Multiply>({convert_m, mul_scale_m});
+    auto mul_m = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{mul_with_sub_m, mul_no_sub_m});
+
+    auto reshape_const_m = wrap_type<ov::op::v0::Constant>();
+    auto reshape_m = wrap_type<ov::op::v1::Reshape>({mul_m, reshape_const_m}, reshape_3d_to_2d);
+
+    auto transpose_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{reshape_m, mul_m});
+    auto transpose_const_m = wrap_type<ov::op::v0::Constant>();
+    auto transpose_m = wrap_type<ov::op::v1::Transpose>({transpose_input, transpose_const_m});
+
+    auto data_m = any_input();
+    auto bias_m = any_input();
+    auto weights_input_m = std::make_shared<ov::pass::pattern::op::Or>(ov::OutputVector{reshape_m, transpose_m, mul_m});
+    auto fully_connected_m = wrap_type<ov::op::internal::FullyConnected>({data_m, weights_input_m, bias_m});
+
+    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+        OPENVINO_ASSERT(pattern_map.count(fully_connected_m));
+        OPENVINO_ASSERT(pattern_map.count(mul_const_m));
+        OPENVINO_ASSERT(pattern_map.count(weights_m));
+        OPENVINO_ASSERT(pattern_map.count(bias_m));
+        OPENVINO_ASSERT(pattern_map.count(convert_m));
+        auto fc = std::dynamic_pointer_cast<ov::op::internal::FullyConnected>(
+            pattern_map.at(fully_connected_m).get_node_shared_ptr());
+        if (!fc || transformation_callback(fc)) {
+            return false;
+        }
+
+        bool has_transpose = pattern_map.count(transpose_m);
+        auto scale_shape = pattern_map.at(mul_const_m).get_shape();
+        bool grouped = std::count_if(scale_shape.begin(), scale_shape.end(), [](size_t d) {
+                           return d > 1;
+                       }) > 1;
+
+        auto weights_shape = fc->get_input_shape(1);
+        const auto IC = *(weights_shape.rbegin());
+        const auto OC = *(weights_shape.rbegin() + 1);
+
+        const auto G = grouped ? (has_transpose ? *(scale_shape.rbegin() + 2) : *(scale_shape.rbegin() + 1)) : 1;
+
+        if (IC % G != 0 || IC / G < 4 || OC == 1) {
+            return false;
+        }
+
+        auto reshape_const_to_2d = [has_transpose, grouped](std::shared_ptr<ov::Node> node) {
+            auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
+            OPENVINO_ASSERT(constant != nullptr);
+            ov::Shape current_shape = constant->get_shape();
+            if (current_shape.size() <= 2)
+                return constant;
+
+            OPENVINO_ASSERT(current_shape.size() == 3);
+
+            auto new_shape = (has_transpose || !grouped)
+                                 ? ov::Shape{current_shape[0] * current_shape[1], current_shape[2]}
+                                 : ov::Shape{current_shape[0], current_shape[1] * current_shape[2]};
+
+            return std::make_shared<ov::op::v0::Constant>(*constant, new_shape);
+        };
+
+        auto convert_u4const_to_u8 = [convert_u4zp_to_u8](std::shared_ptr<ov::Node> node) {
+            auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
+            if (constant->get_element_type() != ov::element::u4 || !convert_u4zp_to_u8)
+                return std::dynamic_pointer_cast<ov::Node>(constant);
+            return std::dynamic_pointer_cast<ov::Node>(std::make_shared<ov::op::v0::Convert>(node, ov::element::u8));
+        };
+
+        const ov::Output<ov::Node>& fc_input_a = fc->input(0).get_source_output();
+        const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr());
+        std::shared_ptr<ov::Node> optional_zero_point = nullptr;
+
+        const bool with_zero_point =
+            pattern_map.count(sub_no_convert_m) > 0 || pattern_map.count(sub_with_convert_m) > 0;
+        if (with_zero_point) {
+            // WA: Convert ZP to u8 for OneDNN case to avoid u4 reorder
+            optional_zero_point =
+                convert_u4const_to_u8(reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr()));
+        }
+
+        std::shared_ptr<ov::Node> fc_input_b = reshape_const_to_2d(pattern_map.at(weights_m).get_node_shared_ptr());
+        std::shared_ptr<ov::Node> fc_input_scale = scale;
+        std::shared_ptr<ov::Node> fc_input_zp = optional_zero_point;
+        std::shared_ptr<ov::Node> fc_input_bias = pattern_map.at(bias_m).get_node_shared_ptr();
+        std::vector<std::shared_ptr<ov::Node>> result_nodes = {};
+        if (has_transpose) {
+            const auto& transpose = pattern_map.at(transpose_m).get_node_shared_ptr();
+            std::shared_ptr<ov::Node> transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
+            if (ov::shape_size(transpose_const->get_shape()) != fc_input_b->get_output_partial_shape(0).size()) {
+                std::vector<int32_t> new_order(fc_input_b->get_output_partial_shape(0).size());
+                std::iota(new_order.begin(), new_order.end(), 0);
+                std::swap(new_order[new_order.size() - 1], new_order[new_order.size() - 2]);
+                transpose_const =
+                    std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{new_order.size()}, new_order);
+            }
+
+            fc_input_b = transpose->clone_with_new_inputs({fc_input_b->output(0), transpose_const});
+            ov::disable_constant_folding(fc_input_b);
+            result_nodes.push_back(fc_input_b);
+            fc_input_scale = transpose->clone_with_new_inputs({scale->output(0), transpose_const});
+            ov::disable_constant_folding(fc_input_scale);
+            result_nodes.push_back(fc_input_scale);
+            if (with_zero_point && ov::shape_size(optional_zero_point->output(0).get_shape()) > 1) {
+                fc_input_zp = transpose->clone_with_new_inputs({optional_zero_point->output(0), transpose_const});
+                ov::disable_constant_folding(fc_input_zp);
+                result_nodes.push_back(fc_input_zp);
+            }
+        }
+
+        std::shared_ptr<ov::Node> new_fc = nullptr;
+        if (with_zero_point) {
+            new_fc = std::make_shared<ov::op::internal::FullyConnectedQuantized>(fc_input_a,
+                                                                                 fc_input_b,
+                                                                                 fc_input_bias,
+                                                                                 fc_input_scale,
+                                                                                 fc_input_zp,
+                                                                                 fc->get_output_type());
+        } else {
+            new_fc = std::make_shared<ov::op::internal::FullyConnectedQuantized>(fc_input_a,
+                                                                                 fc_input_b,
+                                                                                 fc_input_bias,
+                                                                                 fc_input_scale,
+                                                                                 fc->get_output_type());
+        }
+
+        result_nodes.push_back(new_fc);
+        new_fc->set_friendly_name(fc->get_friendly_name());
+        ov::copy_runtime_info(m.get_matched_nodes(), result_nodes);
+        ov::replace_node(fc, new_fc);
+        return true;
+    };
+
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(fully_connected_m,
+                                                          "ConvertFullyConnectedToFullyConnectedCompressed");
+    this->register_matcher(m, callback);
+}
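Editorial note: this pass matches the weight-decompression subgraph Constant(u8/i8/u4/i4/nf4/f4e2m1) → Convert → optional Subtract(zero point) → Multiply(scale) → optional Reshape/Transpose → FullyConnected. A minimal usage sketch (not part of the patch):

```cpp
// Editorial sketch: running the conversion on a model via the pass manager.
#include "openvino/core/model.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/op_conversions/convert_fc_to_compressed.hpp"

void run_compressed_conversion(const std::shared_ptr<ov::Model>& model) {
    ov::pass::Manager manager;
    // convert_u4zp_to_u8=true avoids u4 zero-point reorders on OneDNN (see the WA comment above)
    manager.register_pass<ov::pass::ConvertFullyConnectedToFullyConnectedCompressed>(true);
    manager.run_passes(model);
}
```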
diff --git a/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized.cpp b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized.cpp
new file mode 100644
index 00000000000000..3664ba3f039785
--- /dev/null
+++ b/src/common/transformations/src/transformations/op_conversions/convert_fc_to_quantized.cpp
@@ -0,0 +1,81 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "transformations/op_conversions/convert_fc_to_quantized.hpp"
+
+#include <memory>
+
+#include "openvino/core/rt_info.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/multiply.hpp"
+#include "openvino/pass/pattern/op/label.hpp"
+#include "openvino/pass/pattern/op/pattern.hpp"
+#include "openvino/pass/pattern/op/wrap_type.hpp"
+#include "ov_ops/fully_connected.hpp"
+#include "ov_ops/fully_connected_quantized.hpp"
+#include "ov_ops/placeholder.hpp"
+#include "transformations/utils/utils.hpp"
+
+ov::pass::ConvertFullyConnectedToFullyConnectedQuantized::ConvertFullyConnectedToFullyConnectedQuantized() {
+    using namespace ov::pass::pattern;
+
+    auto quantized_weights = [](const ov::Output<ov::Node>& output) {
+        return output.get_element_type() == ov::element::i8;
+    };
+
+    auto quantized_activations = [](const ov::Output<ov::Node>& output) {
+        return output.get_element_type() == ov::element::u8 || output.get_element_type() == ov::element::i8;
+    };
+
+    auto activations_m = pattern::any_input(quantized_activations);
+    auto weights_m = wrap_type<ov::op::v0::Constant>(quantized_weights);
+    // auto bias_m = wrap_type<ov::op::v0::Constant>();
+    auto bias_m = pattern::any_input();
+
+    auto fully_connected_m = wrap_type<ov::op::internal::FullyConnected>({activations_m, weights_m, bias_m});
+    auto dequantization_scales_m = wrap_type<ov::op::v0::Constant>();
+    auto multiply_m = wrap_type<ov::op::v1::Multiply>({fully_connected_m, dequantization_scales_m});
+
+    ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+
+        auto fc_output = pattern_map.at(fully_connected_m);
+        auto activations = pattern_map.at(activations_m);
+        auto weights = pattern_map.at(weights_m);
+        auto bias = pattern_map.at(bias_m);
+        auto multiply = pattern_map.at(multiply_m);
+        auto dequantization_scales = pattern_map.at(dequantization_scales_m);
+
+        const auto& fc_output_shape = fc_output.get_partial_shape();
+        const auto& multiply_output_shape = multiply.get_partial_shape();
+
+        if (*fc_output_shape.rbegin() != *multiply_output_shape.rbegin()) {
+            return false;
+        }
+
+        auto fc_node = std::dynamic_pointer_cast<ov::op::internal::FullyConnected>(
+            pattern_map.at(fully_connected_m).get_node_shared_ptr());
+
+        auto fc_quantized = std::make_shared<ov::op::internal::FullyConnectedQuantized>(
+            activations,
+            weights,
+            bias,
+            std::make_shared<ov::op::internal::Placeholder>(),
+            std::make_shared<ov::op::internal::Placeholder>(),
+            std::make_shared<ov::op::internal::Placeholder>(),
+            std::make_shared<ov::op::internal::Placeholder>(),
+            dequantization_scales,
+            std::make_shared<ov::op::internal::Placeholder>(),
+            fc_node->get_output_type());
+
+        auto multiply_node = multiply.get_node_shared_ptr();
+        fc_quantized->set_friendly_name(multiply_node->get_friendly_name());
+
+        ov::copy_runtime_info({multiply_node, fc_node}, fc_quantized);
+        ov::replace_node(multiply_node, fc_quantized);
+
+        return true;
+    };
+
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(multiply_m, "ConvertFullyConnectedToFullyConnectedQuantized");
+    this->register_matcher(m, callback);
+}
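Editorial note: a sketch of the model fragment this pass matches — an internal FullyConnected on quantized activations and i8 weights, followed by a Multiply with constant dequantization scales. The Multiply is folded in as the `output_scales` input (position 7) of FullyConnectedQuantized, with Placeholder for every other quantization input. Names and shapes below are illustrative only.

```cpp
// Editorial sketch (hypothetical fragment, not part of the patch).
std::shared_ptr<ov::Node> make_quantized_fc_pattern() {
    auto x = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{1, 64});
    auto w = ov::op::v0::Constant::create(ov::element::i8, ov::Shape{32, 64}, {0});
    auto bias = std::make_shared<ov::op::internal::Placeholder>();
    auto fc = std::make_shared<ov::op::internal::FullyConnected>(x, w, bias, ov::element::f32);
    auto dq = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1, 32}, {0.01f});
    // The pass replaces this Multiply with FullyConnectedQuantized(x, w, bias, ..., dq, ...).
    return std::make_shared<ov::op::v1::Multiply>(fc, dq);
}
```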
diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
index 156481fb893227..ee88a47f8b228e 100644
--- a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
+++ b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -24,7 +24,8 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
     auto compressed_constant = [](const ov::Output<ov::Node>& output) {
         return (output.get_element_type() == ov::element::u8 || output.get_element_type() == ov::element::i8 ||
                 output.get_element_type() == ov::element::u4 || output.get_element_type() == ov::element::i4) &&
-               output.get_target_inputs().size() == 1 &&
+               // output.get_target_inputs().size() == 1 &&
+               output.get_target_inputs().size() > 0 &&
                (output.get_shape().size() == 2 || output.get_shape().size() == 3);
     };
diff --git a/src/core/src/pass/graph_rewrite.cpp b/src/core/src/pass/graph_rewrite.cpp
index 00eafa873043c3..249f2faa9e1c72 100644
--- a/src/core/src/pass/graph_rewrite.cpp
+++ b/src/core/src/pass/graph_rewrite.cpp
@@ -280,11 +280,12 @@ void ov::pass::MatcherPass::register_matcher(const std::shared_ptr<ov::pass::pattern::Matcher>& m,
     m_handler = [m, callback](const std::shared_ptr<Node>& node) -> bool {
+        OPENVINO_DEBUG("[MATCHER] ", m->get_name(), " matching ", node);
         if (m->match(node->output(0))) {
-            OPENVINO_DEBUG("Matcher ", m->get_name(), " matched ", node);
+            OPENVINO_DEBUG("[MATCHER] ", m->get_name(), " matched ", node);
             OV_PASS_CALLBACK(m);
             const bool status = callback(*m.get());
-            OPENVINO_DEBUG("Matcher ", m->get_name(), " callback ", (status ? "succeded" : "failed"));
+            OPENVINO_DEBUG("[MATCHER] ", m->get_name(), " callback ", (status ? "succeeded" : "failed"));
             // explicitly clear Matcher state because it holds pointers to matched nodes
             m->clear_state();
             return status;
diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp
index 68900b150514bc..8f2c02ad575d89 100644
--- a/src/frontends/ir/src/ir_deserializer.cpp
+++ b/src/frontends/ir/src/ir_deserializer.cpp
@@ -9,6 +9,7 @@
 
 #include "openvino/core/except.hpp"
 #include "openvino/core/meta_data.hpp"
+#include "openvino/core/type.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/loop.hpp"
@@ -830,7 +831,9 @@ std::shared_ptr<ov::Node> ov::XmlDeserializer::create_node(const std::vector<ov::Output<ov::Node>>& inputs,
-        if (ov::element::Type_t::undefined == inputs[i].get_element_type())
+        if (!ov::as_type_ptr<ov::op::internal::Placeholder>(inputs[i].get_node_shared_ptr()) &&
+            ov::element::Type_t::undefined == inputs[i].get_element_type())
             OPENVINO_THROW(params.type,
                            " layer ",
                            params.name,
diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp
index d3245312a16efc..f69ae6eecfd0a1 100644
--- a/src/plugins/intel_cpu/src/cpu_types.cpp
+++ b/src/plugins/intel_cpu/src/cpu_types.cpp
@@ -35,12 +35,14 @@ static const TypeToNameMap& get_type_to_name_tbl() {
     static const TypeToNameMap type_to_name_tbl = {
         {"Constant", Type::Input},
         {"Parameter", Type::Input},
+        {"Placeholder", Type::Input},
         {"Result", Type::Output},
         {"Eye", Type::Eye},
         {"Convolution", Type::Convolution},
         {"GroupConvolution", Type::Convolution},
         {"MatMul", Type::MatMul},
         {"FullyConnected", Type::FullyConnected},
+        {"FullyConnectedQuantized", Type::FullyConnected},
         {"MaxPool", Type::Pooling},
         {"AvgPool", Type::Pooling},
         {"AdaptiveMaxPool", Type::AdaptivePooling},
diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
index 2f82fbe553ae19..bdea332e74b001 100644
--- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
+++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
@@ -11,21 +11,68 @@
 
 #include
 #include
 
+#include "cpu_types.h"
 #include "memory_desc/dnnl_blocked_memory_desc.h"
 #include "openvino/core/type/element_type.hpp"
+#include "utils/cpu_utils.hpp"
 #include "utils/debug_capabilities.h"
 
 namespace ov {
 namespace intel_cpu {
 
+static std::vector<float> getDeQuantizedScales(const MemoryArgs& memory) {
+    if (!memory.count(ARG_DST | ARG_ATTR_SCALES))
+        return {};
+
+    auto scalesBlob = memory.at(ARG_DST | ARG_ATTR_SCALES);
+
+    auto scalesData = static_cast<const float*>(scalesBlob->getData());
+
+    if (!scalesData)
+        return {};
+
+    auto dstShape = memory.at(ARG_DST)->getShape();
+    auto dqScalesShape = memory.at(ARG_DST | ARG_ATTR_SCALES)->getShape();
+
+    auto scalesDims = getNormalizedDimsBySize(dqScalesShape.getDims(), dstShape.getDims().size());
+
+    auto scaleSize = std::accumulate(scalesDims.begin(), scalesDims.end(), std::size_t(1), std::multiplies<size_t>());
+
+    std::vector<float> DQScales(scaleSize, 1.0);
+
+    // OPENVINO_ASSERT(scaleSize == 1 || DQScales.size() == 1 || DQScales.size() == scaleSize,
+    //                 "set invalid scales size , DQScales vector size: ",
+    //                 DQScales.size(),
+    //                 ", scale data size: ",
+    //                 scaleSize);
+
+    if (scaleSize > DQScales.size())
+        DQScales.resize(scaleSize, DQScales[0]);
+    if (1 == scaleSize) {
+        std::transform(DQScales.begin(), DQScales.end(), DQScales.begin(), [=](float val) {
+            return (scalesData[0] * val);
+        });
+    } else {
+        for (size_t i = 0; i < DQScales.size(); i++) {
+            DQScales[i] *= scalesData[i];
+        }
+    }
+    if (std::all_of(DQScales.begin(), DQScales.end(), [&](float val) {
+            return (val == DQScales[0]);
+        }))
+        DQScales.resize(1);
+
+    return DQScales;
+}
+
 DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps,
                                          const dnnl::engine& engine,
                                          const VectorDims& outputDims,
                                          const size_t indexOfOutputChannelDim,
                                          const bool isInt8,
                                          const int weiScaleMaskPerChannel,
-                                         const std::vector<float>& DQScales,
-                                         const bool hasBias,
+                                         const bool weightsWithBatch,
+                                         const MemoryArgs& memory,
                                          const dnnl::memory::data_type outDataType)
     : engine(engine),
       postOps(postOps),
@@ -33,12 +80,14 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps,
       idxOC(indexOfOutputChannelDim),
       isINT8(isInt8),
       weightScaleMaskPerChannel(weiScaleMaskPerChannel),
+      weightsWithBatch(weightsWithBatch),
       outDataType(outDataType) {
     OPENVINO_ASSERT(idxOC >= 0 && static_cast<size_t>(idxOC) < outputDims.size());
    OC = outputDims[idxOC];
     dimsPerOC = dimsPerTensor = VectorDims(outputDims.size(), 1);
     dimsPerOC[idxOC] = OC;
 
+    const auto& DQScales = getDeQuantizedScales(memory);  // generalise dq scales, so extra logic is necessary here.
     if (isINT8) {
         wei_scale_values = DQScales.empty() ? std::vector<float>{1.0} : DQScales;
@@ -49,6 +98,7 @@ DnnlPostOpsComposer::DnnlPostOpsComposer(const PostOps& postOps,
             updateWeiScales();
         // If having the bias, attr weight scale can't be updated for further ops-ops optimization.
         // ONEDNN 3.x quantization for scheme: QuantizedInput * QuantizedWeight * DQScale + Bias.
+        const bool hasBias = !memory.at(ARG_BIAS)->getDesc().empty();
         weightScaleAvailable = !hasBias;
     } else if (!DQScales.empty()) {
         // DQ scale is fused but swiching back to non-INT8 for execution in some cases.
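Editorial note: the interesting behaviour in `getDeQuantizedScales` above is the final collapse — when every per-channel dequantization scale is equal, the vector shrinks to a single element so oneDNN can use a per-tensor scale mask. An isolated illustration of that step (not part of the patch):

```cpp
// Editorial sketch: standalone reimplementation of the collapsing logic.
#include <algorithm>
#include <vector>

std::vector<float> normalize_dq_scales(std::vector<float> scales) {
    if (!scales.empty() &&
        std::all_of(scales.begin(), scales.end(), [&](float v) { return v == scales[0]; }))
        scales.resize(1);  // all channels equal -> treat as one per-tensor scale
    return scales;
}
// normalize_dq_scales({0.5f, 0.5f, 0.5f}) -> {0.5f}; {0.5f, 0.25f} stays per-channel.
```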
@@ -325,9 +375,9 @@ static OptimizedFormula updateOptimizedFormula(const FakeQuantizePostOp& postOp,
 }
 
 bool DnnlPostOpsComposer::appendAttrPostOps(const FakeQuantizePostOp& postOp,
-                                             bool isLastPostOp,
-                                             bool doRounding,
-                                             bool allowBinary) {
+                                            bool isLastPostOp,
+                                            bool doRounding,
+                                            bool allowBinary) {
     DEBUG_LOG("isLastPostOp=",
               isLastPostOp,
               ", outDataType=",
@@ -541,9 +591,9 @@ bool DnnlPostOpsComposer::appendShift(const std::vector<float>& shift, bool allowBinary) {
 }
 
 bool DnnlPostOpsComposer::appendLinear(const std::vector<float>& scale,
-                                        const std::vector<float>& shift,
-                                        bool isLastPostOp,
-                                        bool allowBinary) {
+                                       const std::vector<float>& shift,
+                                       bool isLastPostOp,
+                                       bool allowBinary) {
     if (scale.size() == 1 && shift.size() == 1) {
         if (shift[0] == 0.0f)
             return appendScale(scale, isLastPostOp, allowBinary);
@@ -594,20 +644,45 @@ void DnnlPostOpsComposer::appendClip(const std::vector<float>& low, const std::vector<float>& high) {
 
 static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr,
                                             bool needTranspose,
                                             ov::element::Type dstPrc,
-                                            const dnnl::engine& engine) {
+                                            const dnnl::engine& engine,
+                                            bool weightsWithBatch) {
     auto shape = paramsPtr->getShape().getStaticDims();
     if (shape.size() == 1 && shape[0] == 1) {
         shape.push_back(1);
     }
+
     if (shape.size() != 2 && shape.size() != 3)
-        OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape");
+        OPENVINO_THROW("DnnlPostOpsComposer cannot prepack decompression params with invalid shape");
+
+    size_t OC = 0;
+    size_t G = 0;
+    // if (weightsWithBatch) {
+    //     OC = needTranspose ? shape[shape.size() - 2] : shape[shape.size() - 1];
+    //     G = needTranspose ? shape[shape.size() - 1] : shape[shape.size() - 2];
+    // } else {
+    //     OC = needTranspose ? shape.front() : shape.back();
+    //     G = needTranspose ? shape[1] : shape.front();
+    // }
+    if (weightsWithBatch) {
+        OC = shape[shape.size() - 2];
+        G = shape[shape.size() - 1];
+    } else {
+        OC = shape.front();
+        G = shape[1];
+    }
 
-    Shape dstShape = needTranspose ? Shape({shape[0], shape[1]}) : Shape({shape[shape.size() - 1], shape[0]});
-    DnnlBlockedMemoryDesc dstMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(dstPrc), dnnl::memory::format_tag::io);
-    auto dstMem = std::make_shared<Memory>(engine, dstMemoryDesc);
+    Shape dstShape = Shape({OC, G});
+    DnnlBlockedMemoryDesc dstMemoryDesc(dstShape,
+                                        DnnlExtensionUtils::ElementTypeToDataType(dstPrc),
+                                        dnnl::memory::format_tag::io);
+    auto dstMem = std::make_shared<Memory>(engine, dstMemoryDesc);
 
     auto srcFormat = needTranspose ? dnnl::memory::format_tag::oi : dnnl::memory::format_tag::io;
-    DnnlBlockedMemoryDesc srcMemoryDesc(dstShape, DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()), srcFormat);
+
+    DnnlBlockedMemoryDesc srcMemoryDesc(
+        dstShape,
+        DnnlExtensionUtils::ElementTypeToDataType(paramsPtr->getDescPtr()->getPrecision()),
+        srcFormat);
     auto srcMem = std::make_shared<Memory>(engine, srcMemoryDesc, paramsPtr->getData());
 
     dstMem->load(*srcMem);
@@ -615,25 +690,32 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr,
     return dstMem;
 }
 
-void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision) {
+void DnnlPostOpsComposer::appendDecompressionScales(const MemoryCPtr& scales_ptr,
+                                                    bool needTranspose,
+                                                    ov::element::Type dstPrecision) {
     if (scales_ptr == nullptr)
         return;
 
-    auto scalesMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine);
+    auto scalesMem = prepackDecompressionParams(scales_ptr, needTranspose, dstPrecision, engine, weightsWithBatch);
     attr.set_scales_dims(DNNL_ARG_WEIGHTS,
-                         DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision));
+                         DnnlExtensionUtils::convertToDnnlDims(scalesMem->getStaticDims()),
+                         DnnlExtensionUtils::ElementTypeToDataType(dstPrecision));
     cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] = std::move(scalesMem);
     dnnlArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
         cpuArgs[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS]->getPrimitive();
 }
 
-void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr, bool needTranspose, ov::element::Type dstPrecision) {
+void DnnlPostOpsComposer::appendDecompressionZeroPoints(const MemoryCPtr& zero_points_ptr,
+                                                        bool needTranspose,
+                                                        ov::element::Type dstPrecision) {
     if (zero_points_ptr == nullptr)
         return;
 
-    auto zeroPointsMem = prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine);
+    auto zeroPointsMem =
+        prepackDecompressionParams(zero_points_ptr, needTranspose, dstPrecision, engine, weightsWithBatch);
     attr.set_zero_points_dims(DNNL_ARG_WEIGHTS,
-                              DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()), DnnlExtensionUtils::ElementTypeToDataType(dstPrecision));
+                              DnnlExtensionUtils::convertToDnnlDims(zeroPointsMem->getStaticDims()),
+                              DnnlExtensionUtils::ElementTypeToDataType(dstPrecision));
     cpuArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem;
     dnnlArgs[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS] = zeroPointsMem->getPrimitive();
 }
diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.h b/src/plugins/intel_cpu/src/dnnl_postops_composer.h
index c07ec0f608b6db..091b3ed9470205 100644
--- a/src/plugins/intel_cpu/src/dnnl_postops_composer.h
+++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.h
@@ -27,8 +27,8 @@ class DnnlPostOpsComposer {
                         const size_t indexOfOutputChannelDim,
                         const bool isINT8,
                         const int weiScaleMaskPerChannel,
-                        const std::vector<float>& DQScales,
-                        const bool hasBias,
+                        const bool weightsWithBatch,
+                        const MemoryArgs& memory,
                         const dnnl::memory::data_type outDataType);
     DnnlPrimitiveAttrs compose();
     void appendDecompressionScales(const MemoryCPtr& scales_ptr, bool needTranspose, ov::element::Type dstPrecision);
@@ -59,6 +59,7 @@ class DnnlPostOpsComposer {
     size_t idxOC;
     const bool isINT8;  // only INT8 primitive support scales
     const int weightScaleMaskPerChannel;
+    bool weightsWithBatch;
     bool weightScaleAvailable = false;
     const dnnl::memory::data_type outDataType;
diff --git a/src/plugins/intel_cpu/src/edge.cpp b/src/plugins/intel_cpu/src/edge.cpp
index c314718bb82416..70a69c27d47989 100644
--- a/src/plugins/intel_cpu/src/edge.cpp
+++ b/src/plugins/intel_cpu/src/edge.cpp
@@ -5,6 +5,7 @@
 #include "edge.h"
 #include "node.h"
 #include "dnnl_extension_utils.h"
+#include "openvino/core/type/element_type.hpp"
 #include "openvino/util/pp.hpp"
 
 using namespace dnnl;
@@ -212,6 +213,10 @@ Edge::ReorderStatus Edge::needReorder() {
     bool optimized = false;
     auto inputPortDesc = getInputPortDesc();
     auto outPortDesc = getOutputPortDesc();
+
+    if (inputPortDesc->getMemDesc()->getPrecision() == element::undefined)
+        return ReorderStatus::No;
+
     // Check whether the child node may accept the parent produced tensor
     if (!outPortDesc->isCompatible(*inputPortDesc)) {
         // Performance optimization which exploit the fact that some tensors do not need actual data reordering to be read using different descriptors
@@ -411,6 +416,9 @@ const MemoryDesc& Edge::getOutputDesc() const {
 }
 
 const MemoryDesc& Edge::getDesc() const {
+    if (getInputDesc().getPrecision() == element::undefined)
+        return getInputDesc();
+
     if (!getInputDesc().isCompatible(getOutputDesc()))
         OPENVINO_THROW("Cannot get descriptor for edge: ", getParent()->getName(), "->", getChild()->getName());
diff --git a/src/plugins/intel_cpu/src/extension.cpp b/src/plugins/intel_cpu/src/extension.cpp
index d5a8801ffedeac..85ff582663dec8 100644
--- a/src/plugins/intel_cpu/src/extension.cpp
+++ b/src/plugins/intel_cpu/src/extension.cpp
@@ -7,10 +7,13 @@
 #include "openvino/core/op_extension.hpp"
 #include "ov_ops/augru_cell.hpp"
 #include "ov_ops/augru_sequence.hpp"
+#include "ov_ops/fully_connected.hpp"
+#include "ov_ops/fully_connected_quantized.hpp"
 #include "ov_ops/gather_compressed.hpp"
 #include "ov_ops/multiclass_nms_ie_internal.hpp"
 #include "ov_ops/nms_ie_internal.hpp"
 #include "ov_ops/nms_static_shape_ie.hpp"
+#include "ov_ops/placeholder.hpp"
 #include "ov_ops/rotary_positional_embeddings.hpp"
 #include "ov_ops/type_relaxed.hpp"
 #include "snippets/op/subgraph.hpp"
@@ -82,6 +85,9 @@ class TypeRelaxedExtension : public ov::OpExtension<ov::op::TypeRelaxed<Op>> {
     OP_EXTENSION(ov::op::internal::AUGRUSequence)                                \
     OP_EXTENSION(ov::op::internal::NmsStaticShapeIE<ov::op::v8::MulticlassNms>)  \
     OP_EXTENSION(ov::op::internal::RoPE)                                         \
+    OP_EXTENSION(ov::op::internal::FullyConnected)                               \
+    OP_EXTENSION(ov::op::internal::FullyConnectedQuantized)                      \
+    OP_EXTENSION(ov::op::internal::Placeholder)                                  \
     OP_EXTENSION_X64(ov::intel_cpu::MHANode)                                     \
     OP_EXTENSION_X64(ov::intel_cpu::InteractionNode)                             \
     OP_EXTENSION_X64(ov::intel_cpu::LLMMLPNode)                                  \
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 6b3175e24d9dcb..77f49505e80ee8 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -67,10 +67,6 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph &graph) {
     FuseConvMatmulFCDeconvAndDQScales(graph);
     graph.RemoveDroppedNodes();
 
-    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndWeightsDecompression");
-    FuseFCAndWeightsDecompression(graph);
-    graph.RemoveDroppedNodes();
-
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseConvolutionAndBias");
     FuseConvolutionMatMulDeconvAndBias(graph);
     graph.RemoveDroppedNodes();
@@ -212,9 +208,8 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) {
         auto parentNode = node->getParentEdgeAt(0)->getParent();
         auto scaleNode = node->getParentEdgeAt(1)->getParent();
         if (!(parentNode->getType() == Type::Convolution
-              || parentNode->getType() == Type::MatMul
-              || parentNode->getType() == Type::Deconvolution
-              || parentNode->getType() == Type::FullyConnected))
+              || parentNode->getType() == Type::MatMul
+              || parentNode->getType() == Type::Deconvolution))
             return false;
         if (!scaleNode->isConstant())
             return false;
@@ -288,257 +283,6 @@ void GraphOptimizer::FuseConvMatmulFCDeconvAndDQScales(Graph &graph) {
     }
 }
 
-void GraphOptimizer::FuseFCAndWeightsDecompression(Graph &graph) {
-    std::set<ov::element::Type> supportedWeightsPrecisions{
-        ov::element::u8, ov::element::i8, ov::element::nf4, ov::element::u4, ov::element::i4, ov::element::f4e2m1};
-    const std::set<ov::element::Type> supportedDataPrecisions{ov::element::f32, ov::element::bf16};
-    auto expectedNode = [](NodePtr node, Type expectedType) {
-        return node->getType() == expectedType && node->getChildEdges().size() == 1;
-    };
-
-#define SKIP_FUSION_FOR_NODE(node)                                                           \
-    DEBUG_LOG("FuseFCAndWeightsDecompression can't be applied for node ", node->getName()); \
-    continue
-
-    if (!impl::cpu::x64::mayiuse(impl::cpu::x64::avx2))
-        return;
-
-    auto& graphNodes = graph.GetNodes();
-    for (size_t i = 0; i < graphNodes.size(); i++) {
-        const auto fcNode = std::dynamic_pointer_cast<node::FullyConnected>(graphNodes[i]);
-        if (fcNode == nullptr)
-            continue;
-
-        auto parent = fcNode->getParentEdgeAt(1)->getParent();
-        const bool withTranspose = parent->getType() == Type::Transpose;
-        const NodePtr transposeNode = withTranspose ? parent : nullptr;
-        if (transposeNode)
-            parent = transposeNode->getParentEdgeAt(0)->getParent();
-        // Compressed weights can be shared between several FC layers
-        const bool is_shared_decompression = parent->getChildEdges().size() > 1;
-
-        const bool withReshape = parent->getType() == Type::Reshape;
-        const auto reshapeNode = withReshape ? parent : nullptr;
-        if (reshapeNode) {
-            parent = reshapeNode->getParentEdgeAt(0)->getParent();
-        }
-
-        const auto multiplyNode = parent;
-        if (multiplyNode->getType() != Type::Eltwise || multiplyNode->getAlgorithm() != Algorithm::EltwiseMultiply ||
-            !multiplyNode->isConstant()) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-
-        CPU_GRAPH_OPTIMIZER_SCOPE(FuseFCAndWeightsDecompression);
-        const auto mulParent1 = multiplyNode->getParentEdgeAt(1)->getParent();
-        NodePtr multiplyParent, multiplyConvertNode, multiplyConstNode;
-        multiplyParent = mulParent1;
-        if (multiplyParent->getType() == Type::Convert) {
-            multiplyConvertNode = multiplyParent;
-            multiplyParent = multiplyConvertNode->getParentEdgeAt(0)->getParent();
-        }
-        multiplyConstNode = multiplyParent;
-        if (multiplyConstNode->getType() != Type::Input) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-        const bool withMultiplyConvert = multiplyConvertNode != nullptr;
-
-        const auto mulParent0 = multiplyNode->getParentEdgeAt(0)->getParent();
-        const bool withSubtract = mulParent0->getAlgorithm() == Algorithm::EltwiseSubtract;
-        NodePtr subtractNode, subtractConvertNode, subtractConstNode;
-        if (withSubtract) {
-            subtractNode = mulParent0;
-            if (!expectedNode(subtractNode, Type::Eltwise)) {
-                SKIP_FUSION_FOR_NODE(fcNode);
-            }
-            auto subtractParent = subtractNode->getParentEdgeAt(1)->getParent();
-            if (subtractParent->getType() == Type::Convert) {
-                subtractConvertNode = subtractParent;
-                subtractParent = subtractConvertNode->getParentEdgeAt(0)->getParent();
-            }
-            subtractConstNode = subtractParent;
-            if (subtractConstNode->getType() != Type::Input) {
-                SKIP_FUSION_FOR_NODE(fcNode);
-            }
-        }
-
-        const bool withSubtractConvert = subtractConvertNode != nullptr;
-        const auto convertNode = withSubtract ? subtractNode->getParentEdgeAt(0)->getParent() : mulParent0;
-        if (!expectedNode(convertNode, Type::Convert)) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-        const auto weightsNode = convertNode->getParentEdgeAt(0)->getParent();
-        if (weightsNode->getType() != Type::Input) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-
-        // Precision limitations
-        if (supportedDataPrecisions.find(fcNode->getOriginalInputPrecisionAtPort(0)) == supportedDataPrecisions.end()) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-        if (supportedWeightsPrecisions.find(weightsNode->getOriginalOutputPrecisionAtPort(0)) == supportedWeightsPrecisions.end()) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-        if (withSubtract &&
-            !one_of(subtractConstNode->getOriginalOutputPrecisionAtPort(0), weightsNode->getOriginalOutputPrecisionAtPort(0), ov::element::f32)) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-
-        // Shape limitations
-        const auto weightsShape = weightsNode->getOutputShapeAtPort(0);
-        if (weightsShape != multiplyNode->getOutputShapeAtPort(0)) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-        if (reshapeNode && (reshapeNode->getInputShapeAtPort(0).getRank() != 3 || reshapeNode->getOutputShapeAtPort(0).getRank() != 2)) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-
-        VectorDims decompressionConstShape;
-        const auto fcInputWeightsShape = fcNode->getInputShapeAtPort(1);
-        int groupNum = 1;
-        // Ordinary case: one decompression group
-        if (fcInputWeightsShape.getRank() == weightsShape.getRank()) {
-            const auto& out_channels = fcInputWeightsShape.getDims()[0];
-            decompressionConstShape = withTranspose ? VectorDims{1, out_channels} : VectorDims{out_channels, 1};
-        } else {
-            // Group decompression case: last 3 dimension (there could be also prepending '1's in the beginning) of weights shape must be:
-            // [N, G, O], if transpose = true
-            // [O, N, G], otherwise.
-            // O - output channels
-            // N - number of groups
-            // G - group size
-            const auto& weights_dims = weightsShape.getStaticDims();
-            const auto& N = withTranspose ? *(weights_dims.rbegin() + 2) : *(weights_dims.rbegin() + 1);
-            const auto& O = withTranspose ? *weights_dims.rbegin() : *(weights_dims.rbegin() + 2);
-            // Group decompression is applied by O and N dims
-            decompressionConstShape = withTranspose ? VectorDims{N, 1, O} : VectorDims{O, N, 1};
-            groupNum = N;
-        }
-
-        auto check_decompression_shape = [&decompressionConstShape](const VectorDims& shape_to_check) {
-            if (shape_to_check.size() > decompressionConstShape.size())
-                return false;
-            if (std::all_of(shape_to_check.begin(), shape_to_check.end(), [](Dim x) { return x == 1; }))
-                return true;
-            const auto comparison_start_pos = decompressionConstShape.size() - shape_to_check.size();
-            // in case of different ranks shapes are compared taking into account ranks numpy broadcasting
-            return std::equal(shape_to_check.begin(), shape_to_check.end(), decompressionConstShape.begin() + comparison_start_pos);
-        };
-        if (!check_decompression_shape(multiplyConstNode->getOutputShapeAtPort(0).getDims())) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-        if (withSubtract && !check_decompression_shape(subtractConstNode->getOutputShapeAtPort(0).getDims())) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-
-        const size_t OC = fcInputWeightsShape.getDims()[0];
-        const size_t IC = fcInputWeightsShape.getDims()[1];
-        // HW specific shape limitations
-        if (impl::cpu::x64::mayiuse(impl::cpu::x64::avx512_core_amx) &&
-            fcNode->getOriginalInputPrecisionAtPort(0) == ov::element::bf16) {
-            // OneDNN AMX IP implementation has limited shapes support due to performance considerations. As a current solution conditions below are copied
-            // from OneDNN to make sure correct IP impl will be used since fallback one doesn't support weights decompression feature.
-            size_t simdWidth = 16;
-            size_t vnniFactor = 2;
-            size_t maxSize = 512;
-            auto amxRow = vnniFactor * simdWidth;
-
-            if ((IC <= amxRow && OC <= amxRow) || (IC <= maxSize && OC <= maxSize && IC % amxRow != 0)) {
-                SKIP_FUSION_FOR_NODE(fcNode);
-            }
-        }
-
-        // OneDNN IP primitive provides limited decompression params support
-        if (IC % groupNum != 0 || IC / groupNum < 4 || OC == 1) {
-            SKIP_FUSION_FOR_NODE(fcNode);
-        }
-
-        // Fusion processing
-        auto *multiplyInputNode = dynamic_cast<node::Input*>(multiplyConstNode.get());
-        OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", multiplyConstNode->getName(), " to Input node.");
-        fcNode->fuseDecompressionMultiply(multiplyInputNode->getMemoryPtr());
-
-        if (withSubtract) {
-            auto *subtractInputNode = dynamic_cast<node::Input*>(subtractConstNode.get());
-            OPENVINO_ASSERT(multiplyInputNode, "Cannot cast ", subtractConstNode->getName(), " to Input node.");
-            fcNode->fuseDecompressionSubtract(subtractInputNode->getMemoryPtr());
-        }
-
-        fcNode->addOriginalLayer(multiplyNode->getOriginalLayers());
-        fcNode->addOriginalLayer(convertNode->getOriginalLayers());
-        if (withSubtract)
-            fcNode->addOriginalLayer(subtractNode->getOriginalLayers());
-        if (withSubtractConvert)
-            fcNode->addOriginalLayer(subtractConvertNode->getOriginalLayers());
-        if (withMultiplyConvert)
-            fcNode->addOriginalLayer(multiplyConvertNode->getOriginalLayers());
-
-        const auto& weightsPrecision = weightsNode->getOriginalOutputPrecisionAtPort(0);
-        if (withTranspose) {
-            transposeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision);
-            transposeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision);
-        }
-        if (withReshape) {
-            reshapeNode->setOriginalInputPrecisionAtPort(0, weightsPrecision);
-            reshapeNode->setOriginalOutputPrecisionAtPort(0, weightsPrecision);
-        }
-        fcNode->setOriginalInputPrecisionAtPort(1, weightsPrecision);
-
-        // If decompression subgraph is shared with other nodes, it mustn't be removed.
-        // In this case, the current FC is reconnected to the weights
-        if (is_shared_decompression) {
-            const auto weights_out_edge = weightsNode->getChildEdges()[0].lock();
-            const auto fc_weights_path_edge = withTranspose ? transposeNode->getParentEdgeAt(0)
-                                                            : fcNode->getParentEdgeAt(1);
-            const auto inNum = weights_out_edge->getInputNum();
-            const auto outNum = fc_weights_path_edge->getOutputNum();
-            graph.RemoveEdge(fc_weights_path_edge);
-            // In case of shared group decompression, Reshape node has to be copied for the current FC
-            if (withReshape) {
-                const auto& reshapeOutShape = reshapeNode->getOutputShapeAtPort(0).getStaticDims();
-                auto reshapeConst = std::make_shared<ov::op::v0::Constant>(ov::element::i32,
-                                                                           ov::Shape{reshapeOutShape.size()},
-                                                                           reshapeOutShape);
-                auto reshapeDummyInput = std::make_shared<ov::op::v0::Parameter>(reshapeNode->getOriginalInputPrecisionAtPort(0),
-                                                                                 reshapeNode->getInputShapeAtPort(0).toPartialShape());
-                const auto reshape = std::make_shared<ov::op::v1::Reshape>(reshapeDummyInput, reshapeConst, false);
-                reshape->set_friendly_name(reshapeNode->getName() + "_copy");
-                const auto cpuReshape = std::make_shared<node::Reshape>(reshape, graph.getGraphContext());
-                graph.InsertNode(weightsNode, withTranspose ? transposeNode : fcNode, cpuReshape, inNum, outNum, false);
-                const auto cpuReshapeConst = std::make_shared<node::Input>(reshapeConst, graph.getGraphContext());
-                graph.AddNode(cpuReshapeConst);
-                graph.CreateEdge(cpuReshapeConst, cpuReshape, 0, 1);
-            } else {
-                graph.CreateEdge(weightsNode, withTranspose ? transposeNode : fcNode, inNum, outNum);
-            }
-        } else {
-            // If decompression subgraph is not shared with other nodes, it can be removed
-            if (withSubtract)
-                graph.RemoveEdge(subtractNode->getParentEdgeAt(1));
-            if (withSubtractConvert) {
-                // SubtractConvert is removed only if there are no other consumers (e.g. CompressedGather)
-                const auto& restChilds = subtractConvertNode->getChildEdges();
-                if (restChilds.empty())
-                    graph.RemoveEdge(subtractConvertNode->getParentEdgeAt(0));
-            }
-            graph.RemoveEdge(multiplyNode->getParentEdgeAt(1));
-            if (withMultiplyConvert) {
-                // MultiplyConvert is removed only if there are no other consumers (e.g. CompressedGather)
-                const auto& restChilds = multiplyConvertNode->getChildEdges();
-                if (restChilds.empty())
-                    graph.RemoveEdge(multiplyConvertNode->getParentEdgeAt(0));
-            }
-
-            graph.DropNode(convertNode);
-            if (withSubtract)
-                graph.DropNode(subtractNode);
-            graph.DropNode(multiplyNode);
-        }
-        DEBUG_LOG("FuseFCAndWeightsDecompression finished for node ", fcNode->getName());
-    }
-#undef SKIP_FUSION_FOR_NODE
-}
-
 void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) {
     auto& graphNodes = graph.GetNodes();
 
@@ -552,7 +296,7 @@ void GraphOptimizer::FuseConvolutionMatMulDeconvAndBias(Graph &graph) {
             return false;
 
         if (!deconv)
-            return (one_of(node->getType(), Type::Convolution, Type::MatMul, Type::FullyConnected) &&
+            return (one_of(node->getType(), Type::Convolution, Type::MatMul) &&
                     node->getParentEdges().size() == 2);
         else
             return deconv->canFuseBias();
@@ -980,9 +724,8 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
     auto isSuitablePattern = [](NodePtr parent) {
         bool res = true && parent->getType() == Type::Transpose
                         && parent->getChildEdges().size() == 1
-                        && parent->getChildEdgeAt(0)->getOutputNum() == 1
+                        && one_of(parent->getChildEdgeAt(0)->getOutputNum(), 1, 3, 4)
                         && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
-                        && parent->getOutputShapeAtPort(0).getRank() == 2
                         && parent->isConstant();
         return res;
     };
diff --git a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h
index 4b641669262591..1575841cb2be9e 100644
--- a/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h
+++ b/src/plugins/intel_cpu/src/memory_desc/empty_memory_desc.h
@@ -59,7 +59,9 @@ class EmptyMemoryDesc : public MemoryDesc {
     }
 
     MemoryDescPtr cloneWithNewPrecision(const ov::element::Type prec) const override {
-        OPENVINO_THROW("Clone an empty memory desc with any precision (", prec, ") is prohibited");
+        OPENVINO_ASSERT(prec == ov::element::undefined,
+                        "Clone an empty memory desc with defined precision: ", prec, " is prohibited");
+        return clone();
     }
 
 private:
diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index 31c4a0d2a5b54d..991cec4ec8b3e8 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -5,6 +5,7 @@
 #include "node.h"
 #include "cpu_types.h"
 #include "edge.h"
+#include "openvino/core/type/element_type.hpp"
 #include "partitioned_mem_mgr.h"
 
 #include
@@ -1542,7 +1543,7 @@ bool Node::isInputTensorAtPortEmpty(size_t port) const {
     auto edge = getParentEdgeAt(port);
     if (one_of(edge->getStatus(), Edge::Status::Allocated, Edge::Status::Validated)) {
         auto&& mem = edge->getMemory();
-        if (mem.isDefined()) {
+        if (mem.isDefined() && !mem.getDesc().empty()) {
             return mem.getShape().hasZeroDims();
         }
     }
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
index 6f464abf33d036..f64a1da2af3377 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -53,8 +53,9 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory,
                                      const PostOps &postOps) {
     DEBUG_LOG("ACLFullyConnectedExecutor: prepack weights");
     const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims();
-    const auto N = wgtDims[0];
-    const auto K = wgtDims[1];
+    const auto N = std::accumulate(wgtDims.begin(), wgtDims.end() - 1, Dim{1}, std::multiplies<Dim>());
+    const auto K = wgtDims.back();
+    const VectorDims wgtDims2D = {N, K};
 
     auto create = [&]() {
         MemoryPtr final_ptr = memory.at(ARG_WEI);
@@ -91,9 +92,10 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory,
         memoryArgs[ARG_WEI] = final_ptr;
         if (memory.at(ARG_SRC_0)->getShape().isDynamic()) {
             const auto& inShape = memory.at(ARG_SRC_0)->getShape();
-            const auto& wShape = final_ptr->getShape();
-            const auto& inDymmyDims = makeDummyInputDims(inShape, wShape);
-            const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank());
+            const Shape wShape2D{wgtDims2D};
+            // const auto& wShape = final_ptr->getShape();
+            const auto& inDymmyDims = makeDummyInputDims(inShape, wShape2D);
+            const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape2D.getStaticDims(), memory.at(ARG_DST)->getShape().getRank());
             memoryArgs[ARG_SRC_0] = std::make_shared<Memory>(context->getEngine(),
                                                              memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims));
             memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
@@ -121,11 +123,11 @@ static MemoryPtr prepareWeightMemory(const MemoryArgs &memory,
         }
         // Transpose weights
         if (!aclfcAttrs.weightsNonTransposed) {
-            auto reverse_weights_dims = memory.at(ARG_WEI)->getStaticDims();
-            if (reverse_weights_dims.size() == 3) {
-                reverse_weights_dims = VectorDims(
-                    {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]});
-            }
+            auto reverse_weights_dims = wgtDims2D;
+            // if (reverse_weights_dims.size() == 3) {
+            //     reverse_weights_dims = VectorDims(
+            //         {reverse_weights_dims[0] * reverse_weights_dims[1], reverse_weights_dims[2]});
+            // }
             std::reverse(reverse_weights_dims.begin(), reverse_weights_dims.end());
             MemoryArgs memoryArgs;
             memoryArgs[ARG_SRC_0] = final_ptr;
@@ -215,8 +217,8 @@ bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
 static void updateFCTensorsShapes(ACLShapes& aclMemoryShapes) {
     if (aclMemoryShapes[ACLArgs::ACL_WEI].num_dimensions() == 3U) {
         aclMemoryShapes[ACLArgs::ACL_WEI] = arm_compute::TensorShape(
-            {aclMemoryShapes[ACLArgs::ACL_WEI][0] * aclMemoryShapes[ACLArgs::ACL_WEI][1],
-             aclMemoryShapes[ACLArgs::ACL_WEI][2]});
+            {aclMemoryShapes[ACLArgs::ACL_WEI][0],
+             aclMemoryShapes[ACLArgs::ACL_WEI][1] * aclMemoryShapes[ACLArgs::ACL_WEI][2]});
     }
 
     if (one_of(aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions(), 3U, 4U)) {
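Editorial note: both the ACL change above and the oneDNN changes below rely on the same idea — a 3D (batched) weights tensor `[B, N, K]` is flattened to a 2D `[B*N, K]` matrix before transposition or prepacking. A standalone illustration (not part of the patch):

```cpp
// Editorial sketch of the weight-shape collapsing used by the executors.
#include <functional>
#include <numeric>
#include <vector>

template <typename T>
static std::vector<T> to2D(const std::vector<T>& dims) {
    // product of all leading dims becomes the row count, last dim stays as columns
    return {std::accumulate(dims.begin(), dims.end() - 1, T{1}, std::multiplies<T>()),
            dims.back()};
}
// to2D<size_t>({4, 32, 64}) == {128, 64}
```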
a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_convolution_primitive.cpp @@ -158,7 +158,7 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const ConvAttrs& attrs, auto outputDataType = DnnlExtensionUtils::ElementTypeToDataType(dstDesc->getPrecision()); DnnlPostOpsComposer - dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, {}, attrs.withBias, outputDataType); + dnnlpoc(postOps, context->getEngine(), dims, 1, isINT8, 1 << 0, weiDesc->getShape().getRank() == 3, memory, outputDataType); return dnnlpoc.compose(); } diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp index fcb70d4753b2ce..160d2960775e25 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "nodes/executors/executor.hpp" #include "nodes/executors/fullyconnected_config.hpp" #include "nodes/executors/memory_arguments.hpp" +#include "utils/cpu_utils.hpp" #include "utils/debug_capabilities.h" namespace ov { @@ -108,6 +110,11 @@ std::shared_ptr DnnlFCPrimitive::create(const MemoryArgs& memor return primitive; } +template +static std::vector normalizeDimsTo2D(const std::vector& dims) { + return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies()), dims[dims.size() - 1]}; +} + DnnlMemoryDescPtr DnnlFCPrimitive::makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc, const DnnlMemoryDescPtr dstDesc, bool weightsNonTransposed) { @@ -115,9 +122,11 @@ DnnlMemoryDescPtr DnnlFCPrimitive::makeTransposedWeightDescriptor(const DnnlMemo return srcDesc; const auto& weiDesc = srcDesc->getDnnlDesc(); - const auto reorderedWeiDesc = - dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; - const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims()); + auto wDims = weiDesc.get_dims(); + dnnl::memory::dim batchDim = std::accumulate(wDims.begin(), wDims.end() - 1, 1, std::multiplies()); + dnnl::memory::dims dims2D{batchDim, wDims.back()}; + + const auto transposedWeiDesc = dnnl::memory::desc{dims2D, weiDesc.get_data_type(), dnnl::memory::format_tag::ba}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -140,12 +149,11 @@ bool DnnlFCPrimitive::useWeightsDecompressionImpl(const ov::element::Type inputT return false; } -bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose) { +static bool useDynamicQuantizationImpl(size_t dqGroupSize, + const MemoryDescPtr srcDesc, + const MemoryDescPtr weightsDesc, + const MemoryArgs& memory, + bool needTranspose) { if (dqGroupSize == 0) return false; @@ -155,6 +163,8 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (srcDesc->getPrecision() != ov::element::f32) return false; + + MemoryCPtr zpPtr = memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS) ? memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS) : nullptr; // For dynamic quantization, VNNI accumulation requires weight to be unsigned. 
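// ---- Editorial note: illustration only, not part of the patch ----
// makeTransposedWeightDescriptor above now folds the weights to 2D first and
// expresses "transposed" purely through the oneDNN format tag `ba` (swapped
// strides), instead of building a `ba` descriptor and reshaping it to the
// destination dims. A sketch of the same idea in isolation, assuming oneDNN
// v3 headers and a hypothetical function name:
#include "oneapi/dnnl/dnnl.hpp"
#include <functional>
#include <numeric>

dnnl::memory::desc transposed2DWeightDesc(const dnnl::memory::desc& weiDesc) {
    const auto wDims = weiDesc.get_dims();
    // Fold all outer dims into one: {B, O, K} -> {B * O, K}
    const dnnl::memory::dim outer = std::accumulate(
        wDims.begin(), wDims.end() - 1, dnnl::memory::dim{1}, std::multiplies<dnnl::memory::dim>());
    // Logical shape {outer, K}; `ba` marks the physical layout as its transpose.
    return dnnl::memory::desc({outer, wDims.back()}, weiDesc.get_data_type(), dnnl::memory::format_tag::ba);
}
// -------------------------------------------------------------------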
// To support dynamic quantization with weights symmetrically quantized as i8/i4 // w/o zero-point, we will transform weight to u8/u4 weight with zp 128/8. @@ -177,11 +187,15 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, if (weightsDesc->getPrecision() == ov::element::u4) { int ic = weightsDesc->getShape().getStaticDims()[1]; int minGroupSize = INT_MAX; + + MemoryCPtr scalesPtr = memory.count(ARG_WEI | ARG_ATTR_SCALES) ? memory.at(ARG_WEI | ARG_ATTR_SCALES) : nullptr; + if (scalesPtr && scalesPtr->getShape().getRank() == 3) { auto scalesDims = scalesPtr->getShape().getStaticDims(); auto groupsNum = needTranspose ? scalesDims[1] : scalesDims[0]; minGroupSize = ic / groupsNum; } + if (zpPtr && zpPtr->getShape().getRank() == 3) { auto zpDims = zpPtr->getShape().getStaticDims(); int groupsNum = needTranspose ? zpDims[1] : zpDims[0]; @@ -196,11 +210,6 @@ bool DnnlFCPrimitive::useDynamicQuantizationImpl(size_t dqGroupSize, return true; } -template -static std::vector normalizeDimsTo2D(const std::vector& dims) { - return {std::accumulate(dims.begin(), dims.end() - 1, (T)1, std::multiplies()), dims[dims.size() - 1]}; -} - static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, const PostOps& postOps, const MemoryArgs& memory, @@ -223,21 +232,23 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const FCAttrs& attrs, dims.size() - 1, isINT8, 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), + weiDesc->getShape().getRank() == 3, + memory, outputDataType); - if (attrs.decompressionMultiplyPtr) { - auto dstPrc = attrs.decompressionMultiplyPtr->getPrecision(); + if (memory.count(ARG_WEI | ARG_ATTR_SCALES)) { + auto dstPrc = memory.at(ARG_WEI | ARG_ATTR_SCALES)->getPrecision(); if (dstPrc != f8e8m0 || useDynamicQuantization) dstPrc = ov::element::f32; - dnnlpoc.appendDecompressionScales(attrs.decompressionMultiplyPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionScales(memory.at(ARG_WEI | ARG_ATTR_SCALES), !attrs.weightsNonTransposed, dstPrc); } - if (attrs.decompressionSubtractPtr) { + + if (memory.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { auto dstPrc = useDynamicQuantization ? ov::element::u8 : ov::element::f32; - dnnlpoc.appendDecompressionZeroPoints(attrs.decompressionSubtractPtr, !attrs.weightsNonTransposed, dstPrc); + dnnlpoc.appendDecompressionZeroPoints(memory.at(ARG_WEI | ARG_ATTR_ZERO_POINTS), !attrs.weightsNonTransposed, dstPrc); } + if (useDynamicQuantization) { auto wei_precision = weiDesc->getPrecision(); bool is_symmetric_weights = (wei_precision == ov::element::i8) || (wei_precision == ov::element::i4); @@ -276,12 +287,13 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons const bool useWeightsDecompression) { const auto normalizedInputDesc = normalizeDescriptor(inputDesc); const auto normalizedOutputDesc = normalizeDescriptor(outputDesc); + const auto normalizedWeightDesc = normalizeDescriptor(weightDesc); const auto indt = normalizedInputDesc.get_data_type(); auto wdt = indt; if (useWeightsDecompression) { - wdt = weightDesc.get_data_type(); + wdt = normalizedWeightDesc.get_data_type(); // dynamic quantization with symmetric quantized weights needs unsigned weights uint64_t dynQuantGroupSize = 0; @@ -297,8 +309,8 @@ static dnnl::inner_product_forward::primitive_desc createDescriptorInternal(cons } const dnnl::memory::desc weightsDesc = - useSparseWeights ? 
dnnl::memory::desc().sparse_desc(weightDesc.get_dims(), wdt) - : dnnl::memory::desc(weightDesc.get_dims(), wdt, memory::format_tag::any); + useSparseWeights ? dnnl::memory::desc().sparse_desc(normalizedWeightDesc.get_dims(), wdt) + : dnnl::memory::desc(normalizedWeightDesc.get_dims(), wdt, memory::format_tag::any); return dnnl::inner_product_forward::primitive_desc(engine, dnnl::prop_kind::forward_inference, @@ -387,8 +399,7 @@ DnnlShapeAgnosticDataPtr DnnlFCPrimitive::createShapeAgnosticData(const FCAttrs& useWeightsDecompression && useDynamicQuantizationImpl(attrs.dynamicQuantizationGroupSize, srcDesc, weiDesc, - attrs.decompressionMultiplyPtr, - attrs.decompressionSubtractPtr, + memory, !attrs.weightsNonTransposed); const auto postOpData = createPrimitiveAttrs(attrs, postOps, memory, context, useDynamicQuantization); diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp index 5295b9655066cc..21247f149ca69f 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_fullyconnected_primitive.hpp @@ -75,13 +75,6 @@ class DnnlFCPrimitive { const DnnlShapeAgnosticDataPtr& shapeAgnosticData); private: - static bool useDynamicQuantizationImpl(size_t dqGroupSize, - const MemoryDescPtr srcDesc, - const MemoryDescPtr weightsDesc, - MemoryCPtr scalesPtr, - MemoryCPtr zpPtr, - bool needTranspose); - dnnl::stream m_stream; dnnl::primitive_desc m_primDesc; impl_desc_type m_implType; diff --git a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp index 1b8646c858e532..2e4bf0556486af 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/dnnl/dnnl_matmul_primitive.cpp @@ -104,10 +104,11 @@ DnnlMemoryDescPtr DnnlMatMulPrimitive::makeTransposedWeightDescriptor(const Dnnl const auto& weiDesc = srcDesc->getDnnlDesc(); auto wDims = weiDesc.get_dims(); auto wDataType = weiDesc.get_data_type(); - std::swap(wDims[wDims.size() - 1], wDims[wDims.size() - 2]); + dnnl::memory::dim batchDim = std::accumulate(wDims.begin(), wDims.end() - 1, 1, std::multiplies()); + dnnl::memory::dims dims2D{wDims.back(), batchDim}; const auto format = weightsNonTransposed ? 
dnnl::memory::format_tag::ab : dnnl::memory::format_tag::ba; - const auto transposedWeiDesc = dnnl::memory::desc{wDims, wDataType, format}; + const auto transposedWeiDesc = dnnl::memory::desc{dims2D, wDataType, format}; return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc); } @@ -134,8 +135,8 @@ static DnnlPrimitiveAttrs createPrimitiveAttrs(const MatMulAttrs& attrs, dims.size() - 1, isINT8, 1 << 0, - attrs.dequantizationScales, - !memory.at(ARG_BIAS)->getDesc().empty(), + weiDesc->getShape().getRank() == 3, + memory, outputDataType); return dnnlpoc.compose(); @@ -262,7 +263,7 @@ DnnlShapeAgnosticDataPtr DnnlMatMulPrimitive::createShapeAgnosticData(const FCAt const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& biasDesc = memory.at(ARG_BIAS)->getDescPtr(); auto dstDesc = memory.at(ARG_DST)->getDescPtr(); - MatMulAttrs mmAttrs{false, false, attrs.dequantizationScales}; + MatMulAttrs mmAttrs{false, false}; const auto postOpData = createPrimitiveAttrs(mmAttrs, postOps, memory, context, false); diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp index ad6479597c6971..1bdbd9f369937b 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_config.hpp @@ -21,10 +21,11 @@ struct FCAttrs { bool sparseWeights = false; // @todo only memory descriptors should be a part of attributes // actual memory should be passed into "execute" or "prepareMemory" calls - std::vector dequantizationScales; + // std::vector dequantizationScales; + // @todo should be passed as an additional memory input? - MemoryCPtr decompressionSubtractPtr; - MemoryCPtr decompressionMultiplyPtr; + // MemoryCPtr decompressionSubtractPtr; + // MemoryCPtr decompressionMultiplyPtr; uint64_t dynamicQuantizationGroupSize; ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown; }; diff --git a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp index 5a8b1ef78b6dbb..2fde49d10e0a58 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/fullyconnected_implementations.cpp @@ -439,8 +439,7 @@ const std::vector>& getImplementations() { const ExecutorContext::CPtr context, std::shared_ptr shareAgnosticData) const { MatMulAttrs matMulAttrs{false, - false, - attrs.dequantizationScales}; + false}; auto primitive = DefaultInstantiator{}( memory, diff --git a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp index 9e484b24a2940e..e42bf3138bce91 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/matmul_config.hpp @@ -12,7 +12,6 @@ namespace intel_cpu { struct MatMulAttrs { bool transposeA; bool transposeB; - std::vector dequantizationScales; }; using MatMulConfig = executor::Config; diff --git a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp index c04ca39e845ee1..9959188b9a8cf4 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/memory_arguments.hpp @@ -24,6 +24,10 @@ using MemoryArgs = std::unordered_map; #define ARG_WEI_0 33 #define ARG_WEI 
ARG_WEI_0 #define ARG_BIAS 41 +/// Scaling factors provided at execution time. +#define ARG_ATTR_SCALES 4096 +/// Zero points provided at execution time. +#define ARG_ATTR_ZERO_POINTS 8192 } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp index a03bfe2649413a..8fd945b773f262 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mlas/mlas_gemm.cpp @@ -23,6 +23,10 @@ using namespace executor; using namespace dnnl; using namespace ov::element; +static Dim batchDim(const VectorDims& dims) { + return std::accumulate(dims.begin(), dims.end() - 1, 1, std::multiplies()); +} + static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, const ExecutorContext::CPtr context, const bool weightsTransposed) { @@ -31,14 +35,15 @@ static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory, // Weights are transposed by MatMulConstTransposesExtraction // K is the IC of weight // the weight is reshaped to [-1, K] in ConvertMatMulToFC - const auto K = wgtDims[1]; - const auto N = wgtDims[0]; + Dim K = wgtDims.back(); + Dim N = batchDim(wgtDims); auto packedBsize = mlas_sgemm_pack_get_size(N, K); auto create = [&]() { float* weightPtr = weightsMemory->getDataAs(); size_t ldb = weightsTransposed ? K : N; + MemoryPtr _ptr = std::make_shared(context->getEngine(), intel_cpu::CpuBlockedMemoryDesc(i8, intel_cpu::Shape{packedBsize})); float* prepackedDst = _ptr->getDataAs(); @@ -66,21 +71,10 @@ bool MlasGemmExecutor::supports(const FCConfig& config) { DEBUG_LOG("MlasGemmExecutor: PostOps are not supported"); return false; } - const auto& weiDesc = config.descs.at(ARG_WEI); - const auto& dstDesc = config.descs.at(ARG_DST); - // MLAS cannot support weight dims > 2, e.g. 
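// ---- Editorial note: illustration only, not part of the patch ----
// ARG_ATTR_SCALES / ARG_ATTR_ZERO_POINTS are high bits OR-ed onto a base
// argument id, so a single flat MemoryArgs map can carry per-argument
// attributes. The base ids visible here (33, 41, ...) stay below 4096, so
// the composed keys never collide with plain arguments:
//
//   ARG_WEI | ARG_ATTR_SCALES       == 33 | 4096 == 4129
//   ARG_WEI | ARG_ATTR_ZERO_POINTS  == 33 | 8192 == 8225
//
// Typical lookup, mirroring how the executors in this patch consume them:
//
//   MemoryCPtr scales = memory.count(ARG_WEI | ARG_ATTR_SCALES)
//                           ? memory.at(ARG_WEI | ARG_ATTR_SCALES)
//                           : nullptr;
// -------------------------------------------------------------------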
[1,64,9,9] * [10,64,9,9] - const auto& weightsDims = weiDesc->getShape().getStaticDims(); - if (weightsDims.size() > 2) { - if (!std::all_of(weightsDims.begin() + 2, weightsDims.end(), [](const Dim dim) { - return dim == 1; - })) { - DEBUG_LOG("MlasGemmExecutor: weights dims > 2 are not supported"); - return false; - } - } + const auto& dstDesc = config.descs.at(ARG_DST); - if (config.attrs.withBias) { + if (!config.descs.at(ARG_BIAS)->empty()) { const auto& biaDesc = config.descs.at(ARG_BIAS); const auto& biasDims = biaDesc->getShape().getStaticDims(); const auto& outDims = dstDesc->getShape().getDims(); @@ -108,24 +102,17 @@ MlasGemmExecutor::MlasGemmExecutor(const FCAttrs& attrs, const ExecutorContext::CPtr context) : m_attrs(attrs), m_memoryArgs(memory), - packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)) {} + packedWeights(prepareWeightMemory(memory.at(ARG_WEI), context, !attrs.weightsNonTransposed)), + N(batchDim(memory.at(ARG_WEI)->getStaticDims())), + K(memory.at(ARG_WEI)->getStaticDims().back()) +{} bool MlasGemmExecutor::update(const MemoryArgs& memory) { - const auto& weiDesc = memory.at(ARG_WEI)->getDescPtr(); const auto& dstDesc = memory.at(ARG_DST)->getDescPtr(); - const auto& wgtDims = weiDesc->getShape().getStaticDims(); - // Weights are transposed by MatMulConstTransposesExtraction - // K is the IC of weight - // the weight is reshaped to [-1, K] in ConvertMatMulToFC - K = wgtDims[1]; - N = wgtDims[0]; const auto& outDims = dstDesc->getShape().getStaticDims(); - if (outDims.size() > 2) { - M = std::accumulate(outDims.begin(), outDims.end() - 1, 1, std::multiplies()); - } else { - M = outDims[0]; - } + M = outDims.size() > 2 ? batchDim(outDims) : outDims[0]; + return true; } diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp index 5d2b6fd9b50212..3cd09658fbae2e 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp @@ -21,6 +21,9 @@ #include "nodes/executors/fullyconnected_config.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/runtime/threading/cpu_message.hpp" +#include "ov_ops/fully_connected.hpp" +#include "ov_ops/fully_connected_quantized.hpp" +#include "ov_ops/placeholder.hpp" #include "post_ops.hpp" #include "shape_inference/custom/fullyconnected.hpp" #include "transformations/cpu_opset/common/op/fully_connected.hpp" @@ -39,21 +42,22 @@ namespace node { bool FullyConnected::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { - const auto fc = std::dynamic_pointer_cast(op); + const auto fcQuantized = std::dynamic_pointer_cast(op); + if (fcQuantized) { + return true; + } + + const auto fc = std::dynamic_pointer_cast(op); if (!fc) { errorMessage = "Only legacy FullyConnected operation is supported"; return false; } if (fc->get_input_size() == 3 && - std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(BIAS_ID)) == nullptr) { + (std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(BIAS)) == nullptr && + std::dynamic_pointer_cast(fc->get_input_node_shared_ptr(BIAS)) == nullptr)) { errorMessage = "Only Constant operation on 'bias' input is supported"; return false; } - const auto weightRank = fc->get_input_partial_shape(WEIGHTS_ID).size(); - if (weightRank != 2) { - errorMessage = "Doesn't support 'weight' input with rank: " + std::to_string(weightRank); - return false; - } } catch (...) 
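// ---- Editorial note: illustration only, not part of the patch ----
// With the MLAS refactor above, N and K are derived once from the weight
// shape in the constructor (N folds all leading dims, K is the innermost),
// and only M is recomputed per update() from the output dims. Sketch with a
// hypothetical helper name:
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

static std::size_t foldLeadingDims(const std::vector<std::size_t>& dims) {
    return std::accumulate(dims.begin(), dims.end() - 1, std::size_t{1}, std::multiplies<std::size_t>());
}

// e.g. weights {512, 128} -> N = 512, K = 128 (fixed at construction);
// output {8, 16, 512}     -> M = 8 * 16 = 128 (recomputed per update() call)
// -------------------------------------------------------------------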
{ return false; } @@ -79,6 +83,27 @@ FullyConnected::FullyConnected(const std::shared_ptr& op, const GraphC initTensorParallelConfig(context); if (!isSupportedOperation(op, errorMessage)) OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); + + argToInput[ARG_SRC] = DATA; + argToInput[ARG_WEI] = WEIGHTS; + argToInput[ARG_BIAS] = BIAS; + const auto fcQuantized = std::dynamic_pointer_cast(op); + if (fcQuantized) { + if (fcQuantized->get_input_size() > OUTPUT_SCALES && + fcQuantized->input(OUTPUT_SCALES).get_element_type() != ov::element::undefined) { + argToInput[ARG_DST | ARG_ATTR_SCALES] = OUTPUT_SCALES; + } + + if (fcQuantized->get_input_size() > WEIGHT_SCALES && + fcQuantized->input(WEIGHT_SCALES).get_element_type() != ov::element::undefined) { + argToInput[ARG_WEI | ARG_ATTR_SCALES] = WEIGHT_SCALES; + } + + if (fcQuantized->get_input_size() > WEIGHT_ZERO_POINTS && + fcQuantized->input(WEIGHT_ZERO_POINTS).get_element_type() != ov::element::undefined) { + argToInput[ARG_WEI | ARG_ATTR_ZERO_POINTS] = WEIGHT_ZERO_POINTS; + } + } } bool FullyConnected::canBeExecutedInInt8() const { @@ -364,31 +389,13 @@ static bool useSparseWeightsDecompression(const NodePtr& weightsInput, return sparseRate >= minSparseRate; } -void FullyConnected::needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales) { - if (tp_cfg.enable_tensor_parallel) { - auto split_parts = [](int len, int n) { - int average = len / n; - std::vector parts(n, average); - parts.back() = len - average * (n - 1); - return parts; - }; - auto DQScales = getDQScales(); - auto split_lens = split_parts(DQScales.size(), tp_cfg.w_size); - auto split_offset = tp_cfg.w_rank * split_lens[0]; - std::vector newDQScales(split_lens[tp_cfg.w_rank]); - std::copy(DQScales.begin() + split_offset, DQScales.begin() + split_offset + split_lens[tp_cfg.w_rank], newDQScales.begin()); - dequantizationScales = newDQScales; - } -} - void FullyConnected::initSupportedPrimitiveDescriptors() { - attrs.withBias = getOriginalInputsNumber() == 3; + attrs.withBias = getOriginalInputPrecisionAtPort(BIAS) != ov::element::undefined; + // attrs.dequantizationScales = getDQScales(); + // needUpdateDQScaleForTensorParallel(attrs.dequantizationScales); - attrs.dequantizationScales = getDQScales(); - needUpdateDQScaleForTensorParallel(attrs.dequantizationScales); - - attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS_ID)->getParent(), - getOriginalInputPrecisionAtPort(DATA_ID), + attrs.sparseWeights = useSparseWeightsDecompression(getParentEdgeAt(WEIGHTS)->getParent(), + getOriginalInputPrecisionAtPort(DATA), context->getConfig().fcSparseWeiDecompressionRate); attrs.dynamicQuantizationGroupSize = context->getConfig().fcDynamicQuantizationGroupSize; attrs.modelType = context->getConfig().modelType; @@ -404,6 +411,10 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { VecMemoryDescs srcDescs; const auto& creatorsMap = BlockedDescCreator::getCommonCreators(); for (size_t i = 0; i < srcTypes.size(); i++) { + if (srcTypes[i] == element::undefined) { + srcDescs.push_back(MemoryDescUtils::makeEmptyDesc()); + continue; + } const auto srcDesc = creatorsMap.at(LayoutType::ncsp)->createSharedDesc(srcTypes[i], getInputShapeAtPort(i)); srcDescs.push_back(srcDesc); } @@ -415,23 +426,32 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { } MemoryDescArgs descs{ - {ARG_SRC, srcDescs[0]}, - {ARG_WEI, srcDescs[1]}, - {ARG_BIAS, attrs.withBias ? 
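// ---- Editorial note: illustration only, not part of the patch ----
// The constructor above builds argToInput conditionally: an optional input
// (output scales, weight scales, weight zero points) participates only if
// the port exists and its element type is defined; a Placeholder input
// surfaces here as element type "undefined". The guard pattern, isolated:
//
//   if (fcQuantized->get_input_size() > WEIGHT_SCALES &&
//       fcQuantized->input(WEIGHT_SCALES).get_element_type() != ov::element::undefined) {
//       argToInput[ARG_WEI | ARG_ATTR_SCALES] = WEIGHT_SCALES;
//   }
//
// so downstream code can simply test argToInput.count(ARG_WEI | ARG_ATTR_SCALES)
// instead of re-deriving which optional ports are populated.
// -------------------------------------------------------------------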
srcDescs[2] : MemoryDescUtils::makeEmptyDesc()}, + {ARG_SRC, srcDescs[DATA]}, + {ARG_WEI, srcDescs[WEIGHTS]}, + {ARG_BIAS, srcDescs[BIAS]}, {ARG_DST, dstDescs[0]}, }; - needUpdateScaleForTensorParallel(); - needUpdateZeroPointForTensorParallel(); - auto executionContext = std::make_shared(context, getImplPriority(), privateWeightCache); factory = std::make_shared>(attrs, postOps, executionContext, descs); const auto nodeDescriptors = factory->getProperMemoryDescriptors(descs); NodeConfig nodeConfig; - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_SRC)); - nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_WEI)); - if (attrs.withBias) nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_BIAS)); + nodeConfig.inConfs.resize(srcDescs.size()); + + for (const auto& desc : nodeDescriptors) { + if (argToInput.count(desc.first)) { + nodeConfig.inConfs[argToInput[desc.first]] = desc.second; + } + } + + for (size_t i = 3; i < srcDescs.size(); i++) { + nodeConfig.inConfs[i] = srcDescs[i]; + } + + // nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_SRC)); + // nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_WEI)); + // nodeConfig.inConfs.emplace_back(nodeDescriptors.at(ARG_BIAS)); const int inPlace = canBeInPlace() ? 0 : -1; nodeConfig.outConfs.emplace_back(nodeDescriptors.at(ARG_DST), BlockedMemoryDesc::FULL_MASK, inPlace); @@ -441,11 +461,11 @@ void FullyConnected::initSupportedPrimitiveDescriptors() { void FullyConnected::needSplitMemoryForTensorParallel() { if (tp_cfg.enable_tensor_parallel) { - auto src = getSrcMemoryAtPort(DATA_ID); - auto wgt = getSrcMemoryAtPort(WEIGHTS_ID); + auto src = getSrcMemoryAtPort(DATA); + auto wgt = getSrcMemoryAtPort(WEIGHTS); auto dst = getDstMemoryAtPort(0); // src - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); + memory[ARG_SRC] = getSrcMemoryAtPort(DATA); // wgt // split N direction tp_cfg.cached_splited_weight = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), wgt, 0, tp_cfg.w_rank, tp_cfg.w_size) @@ -453,7 +473,7 @@ void FullyConnected::needSplitMemoryForTensorParallel() { memory[ARG_WEI] = tp_cfg.cached_splited_weight; // bias if (attrs.withBias) { - auto bias = getSrcMemoryAtPort(BIAS_ID); + auto bias = getSrcMemoryAtPort(BIAS); auto select_bias = split_horizontal(context->getEngine(), bias, 0, tp_cfg.w_rank, tp_cfg.w_size); tp_cfg.cached_splited_bias = select_bias; } else { @@ -463,6 +483,33 @@ void FullyConnected::needSplitMemoryForTensorParallel() { // dst memory[ARG_DST] = getDstMemoryAtPort(0); tp_cfg.cached_dst = split_horizontal(context->getEngine(), dst, -1, tp_cfg.w_rank, tp_cfg.w_size, false); + + memory[ARG_DST | ARG_ATTR_SCALES] = split_horizontal(context->getEngine(), memory[ARG_DST | ARG_ATTR_SCALES], 0, tp_cfg.w_rank, tp_cfg.w_size); + // auto split_parts = [](int len, int n) { + // int average = len / n; + // std::vector parts(n, average); + // parts.back() = len - average * (n - 1); + // return parts; + // }; + // auto DQScales = getDQScales(); + // auto split_lens = split_parts(DQScales.size(), tp_cfg.w_size); + // auto split_offset = tp_cfg.w_rank * split_lens[0]; + // std::vector newDQScales(split_lens[tp_cfg.w_rank]); + // std::copy(DQScales.begin() + split_offset, DQScales.begin() + split_offset + split_lens[tp_cfg.w_rank], newDQScales.begin()); + // dequantizationScales = newDQScales; + + auto scale_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_SCALES]); + memory[ARG_WEI | ARG_ATTR_SCALES] = attrs.weightsNonTransposed ? 
split_vertical(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + + auto zeropoint_mem = std::const_pointer_cast(memory[ARG_WEI | ARG_ATTR_ZERO_POINTS]); + auto element_num = zeropoint_mem->getSize() / zeropoint_mem->getPrecision().size(); + if (element_num == 1) { + tp_cfg.cached_zeropoint = zeropoint_mem; + } else { + tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) + : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); + } } } @@ -471,7 +518,7 @@ void FullyConnected::needUpdateTensorParalelConfig() { // 1. weight shape is dynamic // 2. last dim can be splited. if (tp_cfg.enable_tensor_parallel) { - auto shape = getSrcMemoryAtPort(WEIGHTS_ID)->getShape(); + auto shape = getSrcMemoryAtPort(WEIGHTS)->getShape(); if (shape.isDynamic()) { tp_cfg.enable_tensor_parallel = false; } else if (shape.getDims()[0] < static_cast(tp_cfg.w_size)) { @@ -479,15 +526,31 @@ void FullyConnected::needUpdateTensorParalelConfig() { } } } + + + void FullyConnected::createPrimitive() { needUpdateTensorParalelConfig(); - memory[ARG_SRC] = getSrcMemoryAtPort(DATA_ID); - memory[ARG_WEI] = getSrcMemoryAtPort(WEIGHTS_ID); - memory[ARG_BIAS] = attrs.withBias ? getSrcMemoryAtPort(BIAS_ID) : MemoryDescUtils::makeEmptyMemory(context); + memory[ARG_SRC] = getSrcMemoryAtPort(DATA); + memory[ARG_WEI] = getSrcMemoryAtPort(WEIGHTS); + memory[ARG_BIAS] = getSrcMemoryAtPort(BIAS); memory[ARG_DST] = getDstMemoryAtPort(0); + if (argToInput.count(ARG_DST | ARG_ATTR_SCALES)) { + memory[ARG_DST | ARG_ATTR_SCALES] = getSrcMemoryAtPort(argToInput[ARG_DST | ARG_ATTR_SCALES]); + } + + if (argToInput.count(ARG_WEI | ARG_ATTR_SCALES)) { + memory[ARG_WEI | ARG_ATTR_SCALES] = getSrcMemoryAtPort(argToInput[ARG_WEI | ARG_ATTR_SCALES]); + } + + if (argToInput.count(ARG_WEI | ARG_ATTR_ZERO_POINTS)) { + memory[ARG_WEI | ARG_ATTR_ZERO_POINTS] = getSrcMemoryAtPort(argToInput[ARG_WEI | ARG_ATTR_ZERO_POINTS]); + } + needSplitMemoryForTensorParallel(); + // @todo should we preconfigure only for dynamic shapes? // Since for static shapes primitive is created in scope of compile_model() anyway factory->preconfigure(memory); @@ -511,49 +574,6 @@ ov::element::Type FullyConnected::getRuntimePrecision() const { return getMaxPrecision(srcTypes); } -void FullyConnected::needUpdateScaleForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_scale) { - attrs.decompressionMultiplyPtr = tp_cfg.cached_scale; - } -} - -void FullyConnected::needSplitScaleForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_scale) { - auto scale_mem = std::const_pointer_cast(memory); - tp_cfg.cached_scale = attrs.weightsNonTransposed ? 
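// ---- Editorial note: illustration only, not part of the patch ----
// Tensor-parallel splitting of the decompression constants follows the
// weight layout: with weightsNonTransposed the output-channel axis is the
// outer one, so scales and zero points are split vertically; for already
// transposed weights they are split horizontally. A single-element zero
// point is a broadcast scalar and is cached whole on every rank:
//
//   element_num == 1  ->  keep as-is (same scalar on all ranks)
//   element_num  > 1  ->  split along the output-channel axis, one shard per rank
// -------------------------------------------------------------------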
split_vertical(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), scale_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); - } -} - -void FullyConnected::needUpdateZeroPointForTensorParallel() { - if (tp_cfg.enable_tensor_parallel && tp_cfg.cached_zeropoint) { - attrs.decompressionSubtractPtr = tp_cfg.cached_zeropoint; - } -} - -void FullyConnected::needSplitZeroPointForTensorParallel(const MemoryCPtr& memory) { - if (tp_cfg.enable_tensor_parallel && !tp_cfg.cached_zeropoint) { - auto zeropoint_mem = std::const_pointer_cast(memory); - auto element_num = memory->getSize() / memory->getPrecision().size(); - if (element_num == 1) { - tp_cfg.cached_zeropoint = zeropoint_mem; - } else { - tp_cfg.cached_zeropoint = attrs.weightsNonTransposed ? split_vertical(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size) - : split_horizontal(context->getEngine(), zeropoint_mem, 0, tp_cfg.w_rank, tp_cfg.w_size); - } - } -} - -void FullyConnected::fuseDecompressionMultiply(const MemoryCPtr& memory) { - attrs.decompressionMultiplyPtr = memory; - needSplitScaleForTensorParallel(memory); -} - -void FullyConnected::fuseDecompressionSubtract(const MemoryCPtr& memory) { - attrs.decompressionSubtractPtr = memory; - needSplitZeroPointForTensorParallel(memory); -} - } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h index be29342b851988..414e15ed488c6a 100644 --- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h +++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h @@ -6,9 +6,11 @@ #include +#include #include #include #include +#include #include #include "cpu_memory.h" @@ -81,9 +83,19 @@ class FullyConnected : public Node { void toNumaNodeImpl(int numaID) override; private: - static const size_t DATA_ID = 0; - static const size_t WEIGHTS_ID = 1; - static const size_t BIAS_ID = 2; + enum InputId : size_t { + DATA = 0, + WEIGHTS = 1, + BIAS = 2, + WEIGHT_SCALES = 3, + WEIGHT_ZERO_POINTS = 4, + INPUT_SCALES = 5, + INPUT_ZERO_POINTS = 6, + OUTPUT_SCALES = 7, + OUTPUT_ZERO_POINTS = 8, + }; + + std::unordered_map argToInput; ExecutorPtr createExecutor(); void fuseDecompressionConstant(const MemoryCPtr& memory, MemoryCPtr& decompressionValuesPtr); @@ -94,11 +106,6 @@ class FullyConnected : public Node { void initTensorParallelSync(); void execTensorParallelSync(); void needSplitMemoryForTensorParallel(); - void needSplitScaleForTensorParallel(const MemoryCPtr& memory); - void needUpdateScaleForTensorParallel(); - void needSplitZeroPointForTensorParallel(const MemoryCPtr& memory); - void needUpdateZeroPointForTensorParallel(); - void needUpdateDQScaleForTensorParallel(std::vector& dequantizationScales); FCAttrs attrs; PostOps postOps; diff --git a/src/plugins/intel_cpu/src/nodes/input.cpp b/src/plugins/intel_cpu/src/nodes/input.cpp index ea659ec1e31b84..78a65363e5a4da 100644 --- a/src/plugins/intel_cpu/src/nodes/input.cpp +++ b/src/plugins/intel_cpu/src/nodes/input.cpp @@ -7,6 +7,8 @@ #include "cpu/x64/jit_generator.hpp" #include "openvino/core/parallel.hpp" #include "shape_inference/shape_inference_pass_through.hpp" +#include "ov_ops/placeholder.hpp" +#include "memory_desc/cpu_memory_desc_utils.h" using namespace dnnl; using namespace dnnl::impl::cpu::x64; @@ -222,14 +224,18 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte op::v0::Constant::get_type_info_static(), op::v0::Result::get_type_info_static(), 
op::v3::ReadValue::get_type_info_static(), - op::v6::ReadValue::get_type_info_static())) + op::v6::ReadValue::get_type_info_static(), + op::internal::Placeholder::get_type_info_static())) OPENVINO_THROW_NOT_IMPLEMENTED("CPU Input node doesn't support ngraph operation ", op->get_type_name(), " with name ", op->get_friendly_name()); - constOp = ov::as_type_ptr(op); - if (constOp) { + if (auto placeHolder = ov::as_type_ptr(op)) { + memoryPtr = MemoryDescUtils::makeEmptyMemory(context); constant = ConstantType::Const; + } else if (auto constOp = ov::as_type_ptr(op)) { + constant = ConstantType::Const; + m_constOp = constOp; cloneBlobIfRequired(); } else { constant = ConstantType::StrictNoConst; @@ -237,8 +243,8 @@ Input::Input(const std::shared_ptr& op, const GraphContext::CPtr conte } void Input::cloneBlobIfRequired() { - Shape shape(constOp->get_shape().empty() ? ov::Shape(1, 1) : constOp->get_shape()); - const auto prec = constOp->get_element_type(); + Shape shape(m_constOp->get_shape().empty() ? ov::Shape(1, 1) : m_constOp->get_shape()); + const auto prec = m_constOp->get_element_type(); const size_t size = shape.getElementsCount(); CpuBlockedMemoryDesc memDesc(prec, shape); @@ -257,21 +263,21 @@ void Input::cloneBlobIfRequired() { // oneDNN always allocate 1byte for element type with bitWidth < 8 (u4,u1...) // but ngraph Constant uses actual bitWidth for data storage allocation // in that case we make a copy to avoid overflow - if (constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { - if (constOp->get_element_type() == element::string) { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + if (m_constOp->get_byte_size() >= memDesc.getCurrentMemSize()) { + if (m_constOp->get_element_type() == element::string) { + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } else { - memory = std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()); + memory = std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()); } } else { - if (constOp->get_element_type() == element::string) { + if (m_constOp->get_element_type() == element::string) { memory = std::make_shared(getEngine(), memDesc); - auto src = constOp->get_data_ptr(); + auto src = m_constOp->get_data_ptr(); auto dst = memory->getDataAs(); std::copy(src, src + size, dst); } else { memory = std::make_shared(getEngine(), memDesc); - memcpy(memory->getData(), constOp->get_data_ptr(), constOp->get_byte_size()); + memcpy(memory->getData(), m_constOp->get_data_ptr(), m_constOp->get_byte_size()); } } @@ -287,7 +293,7 @@ void Input::cloneBlobIfRequired() { }; auto isBlobAligned = [&, this] () { - const void *ptr = constOp->get_data_ptr(); + const void *ptr = m_constOp->get_data_ptr(); bool blobAlignedOnSSE = true; #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) // Majority of arithmetic and data processing instructions in legacy SSE isa requires @@ -302,7 +308,7 @@ void Input::cloneBlobIfRequired() { // The presence of subnormals is better to determined at IR read time. 
auto hasSubnormals = [&, this] () { if (prec == ov::element::f32) { - uint32_t const *u32data = constOp->get_data_ptr(); + uint32_t const *u32data = m_constOp->get_data_ptr(); if (!size) return false; @@ -345,7 +351,7 @@ void Input::cloneBlobIfRequired() { auto blobKey = [&, this] () { char ptr[32]; - snprintf(ptr, sizeof ptr, "%p", constOp->get_data_ptr()); + snprintf(ptr, sizeof ptr, "%p", m_constOp->get_data_ptr()); return getName() + "_" + std::to_string(size * prec.size()) + "_" + ptr; @@ -362,7 +368,7 @@ void Input::cloneBlobIfRequired() { // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where original weights are stored. (!weightCache || context->getNumNumaNodes() == 1 || context->getCPUStreamExecutor()->get_streams_num() == 1); - memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, constOp->get_data_ptr()) + memoryPtr = clone_is_not_needed ? std::make_shared(getEngine(), memDesc, m_constOp->get_data_ptr()) : std::const_pointer_cast( weightCache ? *weightCache->findOrCreate(blobKey(), cloneBlob) : cloneBlob()); } diff --git a/src/plugins/intel_cpu/src/nodes/input.h b/src/plugins/intel_cpu/src/nodes/input.h index 9b304e5a75a891..de9324f0b13628 100644 --- a/src/plugins/intel_cpu/src/nodes/input.h +++ b/src/plugins/intel_cpu/src/nodes/input.h @@ -44,7 +44,7 @@ class Input : public Node { void initSupportedPdFromMemDesc(); private: - std::shared_ptr constOp; + std::shared_ptr m_constOp; MemoryCPtr memoryPtr; MemoryDescPtr extMemDesc = nullptr; bool isMeanImage = false; diff --git a/src/plugins/intel_cpu/src/nodes/reference.cpp b/src/plugins/intel_cpu/src/nodes/reference.cpp index 43b8f041184a70..d972914e9434b6 100644 --- a/src/plugins/intel_cpu/src/nodes/reference.cpp +++ b/src/plugins/intel_cpu/src/nodes/reference.cpp @@ -14,7 +14,7 @@ Reference::Reference(const std::shared_ptr& op, const GraphContext::CP Node(op, context, NgraphShapeInferFactory(op, FULL_PORT_MASK)), ovCoreNode(op), additionalErrorMessage(errorMessage) { if (!op->has_evaluate()) { OPENVINO_THROW_NOT_IMPLEMENTED( - "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented"); + "Cannot fallback on ngraph reference implementation (Ngraph::Node::evaluate() is not implemented for op: ", *op); } setType(Type::Reference); diff --git a/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp b/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp index 5aef73df1949bd..e23f9c3776abf6 100644 --- a/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/custom/fullyconnected.cpp @@ -15,7 +15,8 @@ Result FCShapeInfer::infer( const VectorDims& activationShape = input_shapes[0].get(); const VectorDims& weightShape = input_shapes[1].get(); size_t activationRank = activationShape.size(); - size_t channelRank = weightShape.size() - 1; + // size_t channelRank = weightShape.size() - 1; + size_t channelRank = 1; // activation weight output_shape // NCHW CoCHW NCo @@ -23,7 +24,7 @@ Result FCShapeInfer::infer( // NC CoC NCo VectorDims outputShape(out_rank, 1); // set Co - outputShape.back() = weightShape[0]; + outputShape.back() = std::accumulate(weightShape.begin(), weightShape.end() - 1, 1, std::multiplies()); // set batch dims size_t batchRank = activationRank - channelRank; size_t startIdx = out_rank - batchRank - 1; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp 
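// ---- Editorial note: illustration only, not part of the patch ----
// The FCShapeInfer change above pins channelRank to 1 and derives the output
// channel count by folding every weight dim except the last (K), so rank-2
// weights [Co, K] and rank-3 weights [B, Co, K] go through one code path.
// Roughly:
//
//   activation [N, C] x weights [Co, C]     -> output [N, Co]
//   activation [N, C] x weights [B, Co, C]  -> output [N, B * Co]
//
// i.e. outputShape.back() = prod(weightShape[0 .. rank-2]).
// -------------------------------------------------------------------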
b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp index f2861843a81110..77cb336227db28 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp @@ -2,7 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +// #include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "convert_matmul_to_fc.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/convert.hpp" @@ -10,6 +11,7 @@ #include "openvino/op/reshape.hpp" #include "openvino/core/rt_info.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" +#include "ov_ops/placeholder.hpp" #include "transformations/utils/utils.hpp" #include "itt.hpp" @@ -138,18 +140,18 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() { // Transferring from MatMul representation: [B, I, K] * [B, K, O] = [B, I, O] // to FullyConnected representation: [I, K] * [K, O] = [I, O] - if (rank_b != 2) { - ov::Dimension K = *(shape_b_aligned.rbegin() + 1); - OPENVINO_ASSERT(K.is_static()); - auto k_len = K.get_length(); - auto reshape_shape_values = matmul->get_transpose_b() ? std::vector{-1, k_len} : std::vector{k_len, -1}; - auto reshape_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, reshape_shape_values); - fc_input_b = ov::op::util::make_try_fold(fc_input_b, reshape_shape, false); - if (!std::dynamic_pointer_cast(fc_input_b.get_node_shared_ptr())) { - new_ops.push_back(reshape_shape); - } - new_ops.push_back(fc_input_b.get_node_shared_ptr()); - } + // if (rank_b != 2) { + // ov::Dimension K = *(shape_b_aligned.rbegin() + 1); + // OPENVINO_ASSERT(K.is_static()); + // auto k_len = K.get_length(); + // auto reshape_shape_values = matmul->get_transpose_b() ? 
std::vector{-1, k_len} : std::vector{k_len, -1}; + // auto reshape_shape = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{ 2 }, reshape_shape_values); + // fc_input_b = ov::op::util::make_try_fold(fc_input_b, reshape_shape, false); + // if (!std::dynamic_pointer_cast(fc_input_b.get_node_shared_ptr())) { + // new_ops.push_back(reshape_shape); + // } + // new_ops.push_back(fc_input_b.get_node_shared_ptr()); + // } // Weights normalization if (!matmul->get_transpose_b()) { @@ -169,10 +171,15 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() { fc_input_b = convert; } - // Create FullyConnected - auto output_rank = matmul->get_output_partial_shape(0).rank(); - auto fc = std::make_shared(fc_input_a, fc_input_b, output_rank, - matmul->get_output_element_type(0)); + auto bias_ph = std::make_shared(); + new_ops.push_back(bias_ph); + + auto fc = std::make_shared(fc_input_a, + fc_input_b, + bias_ph, + matmul->get_output_element_type(0)); + // auto fc = std::make_shared(fc_input_a, fc_input_b, matmul->get_output_element_type(0)); + fc->set_friendly_name(matmul->get_friendly_name()); ///todo: CVS-130863 Remove after fp16_compression is copyable if (ov::fp16_compression_is_disabled(matmul)) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp index 8079286d1e3ad7..c44a4bc0952afe 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_to_power_static.cpp @@ -12,7 +12,8 @@ #include "openvino/pass/pattern/op/or.hpp" #include "transformations/rt_info/dequantization_node.hpp" #include "transformations/cpu_opset/common/op/power_static.hpp" -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" +// #include "transformations/cpu_opset/common/op/fully_connected.hpp" #include "utils/general_utils.h" #include "itt.hpp" @@ -47,16 +48,16 @@ bool isConvertableToPowerStatic(const std::shared_ptr &node) { return ov::shape_size(const_shape) == 1 && input_rank.get_length() >= static_cast(const_shape.size()) && !ov::intel_cpu::one_of(node->get_input_node_shared_ptr(nonConstPort)->get_type_info(), - ov::opset1::NormalizeL2::get_type_info_static(), - ov::opset4::Interpolate::get_type_info_static(), - ov::opset1::Convolution::get_type_info_static(), - ov::opset1::GroupConvolution::get_type_info_static(), - ov::opset1::ConvolutionBackpropData::get_type_info_static(), - ov::opset1::GroupConvolutionBackpropData::get_type_info_static(), - ov::opset1::MatMul::get_type_info_static(), - ov::intel_cpu::FullyConnectedNode::get_type_info_static(), - ov::op::v0::MVN::get_type_info_static(), - ov::opset6::MVN::get_type_info_static()); + ov::opset1::NormalizeL2::get_type_info_static(), + ov::opset4::Interpolate::get_type_info_static(), + ov::opset1::Convolution::get_type_info_static(), + ov::opset1::GroupConvolution::get_type_info_static(), + ov::opset1::ConvolutionBackpropData::get_type_info_static(), + ov::opset1::GroupConvolutionBackpropData::get_type_info_static(), + ov::opset1::MatMul::get_type_info_static(), + ov::op::internal::FullyConnected::get_type_info_static(), + ov::op::v0::MVN::get_type_info_static(), + ov::opset6::MVN::get_type_info_static()); } template <> diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp 
b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp
new file mode 100644
index 00000000000000..c07d45ed200d37
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.cpp
@@ -0,0 +1,114 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "fc_bias_fusion.hpp"
+#include
+#include
+
+#include "openvino/core/type.hpp"
+#include "openvino/op/reshape.hpp"
+#include "openvino/pass/pattern/op/or.hpp"
+#include "ov_ops/fully_connected.hpp"
+#include "ov_ops/fully_connected_quantized.hpp"
+#include "ov_ops/placeholder.hpp"
+#include "openvino/core/rt_info.hpp"
+#include "openvino/op/add.hpp"
+#include "openvino/pass/pattern/op/wrap_type.hpp"
+
+#include "transformations/utils/utils.hpp"
+
+#include "itt.hpp"
+
+ov::intel_cpu::FullyConnectedBiasFusion::FullyConnectedBiasFusion() {
+    MATCHER_SCOPE(FullyConnectedBiasFusion);
+    auto any = ov::pass::pattern::any_input();
+    auto input = any;
+    auto weights = ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape());
+    auto ph = ov::pass::pattern::wrap_type<ov::op::internal::Placeholder>();
+
+    auto has_single_consumer = [](ov::Output<ov::Node> output) {
+        return ov::pass::pattern::consumers_count(1)(output);
+    };
+
+    auto m_fc =
+        ov::pass::pattern::wrap_type<ov::op::internal::FullyConnected>({input, weights, ph}, has_single_consumer);
+
+    auto m_fc_q = ov::pass::pattern::wrap_type<ov::op::internal::FullyConnectedQuantized>(
+        {
+            input,
+            weights,
+            ph,
+            ov::pass::pattern::any_input(),
+            ov::pass::pattern::any_input(),
+            ov::pass::pattern::any_input(),
+            ov::pass::pattern::any_input(),
+            ov::pass::pattern::any_input(),
+            ov::pass::pattern::any_input()
+        },
+        has_single_consumer);
+
+    auto m_fc_or = std::make_shared<ov::pass::pattern::op::Or>(
+        OutputVector{
+            m_fc,
+            m_fc_q,
+        });
+
+    auto m_bias = ov::pass::pattern::any_input(ov::pass::pattern::has_static_shape());
+    auto m_add = ov::pass::pattern::wrap_type<ov::op::v1::Add>({m_fc_or, m_bias});
+
+    ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher &m) {
+        auto& pattern_to_output = m.get_pattern_value_map();
+
+        auto add = pattern_to_output[m_add].get_node_shared_ptr();
+        auto bias = pattern_to_output[m_bias].get_node_shared_ptr();
+        auto fc = pattern_to_output.count(m_fc) ? pattern_to_output[m_fc].get_node_shared_ptr()
+                                                : pattern_to_output[m_fc_q].get_node_shared_ptr();
+
+        if (transformation_callback(fc)) {
+            return false;
+        }
+
+        if (!std::dynamic_pointer_cast<ov::op::v0::Constant>(bias)) {
+            return false;
+        }
+
+        ov::Shape bias_shape(bias->get_shape());
+        ov::PartialShape output_shape(fc->get_output_partial_shape(0));
+        size_t bias_size = ov::shape_size(bias_shape);
+        auto rank = output_shape.rank().get_length();
+        if (rank == 0 || output_shape[rank - 1].is_dynamic()) {
+            return false;
+        }
+
+        if (bias_shape.empty() || static_cast<int64_t>(bias_shape.back()) != output_shape[rank - 1].get_length() ||
+            bias_shape.back() != bias_size) {
+            return false;
+        }
+
+        ov::NodeVector new_ops;
+
+        std::shared_ptr<ov::Node> final_bias = bias;
+        if (bias_shape.size() >= 2) {
+            auto reshape_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{ 1 }, { -1 });
+            final_bias = ov::op::util::make_try_fold<ov::op::v1::Reshape>(final_bias, reshape_const, true);
+            new_ops.push_back(final_bias);
+        }
+
+        std::shared_ptr<ov::Node> fc_with_bias;
+
+        // @todo can be replaced by some virtual function, i.e.
clone_with_new_bias() + // so we don't need to down cast here + auto fc_node = std::dynamic_pointer_cast(fc); + fc_with_bias = fc_node->fuse_bias(final_bias); + + new_ops.push_back(fc_with_bias); + + fc_with_bias->set_friendly_name(add->get_friendly_name()); + ov::copy_runtime_info({fc, add}, new_ops); + ov::replace_node(add, fc_with_bias); + return true; + }; + + auto m = std::make_shared(m_add, matcher_name); + this->register_matcher(m, callback); +} diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp new file mode 100644 index 00000000000000..e10af028544b61 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/fc_bias_fusion.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_cpu { + +class FullyConnectedBiasFusion : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("FullyConnectedBiasFusion", "0"); + FullyConnectedBiasFusion(); +}; + +class FullyConnectedBiasFusions : public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("FullyConnectedBiasFusion", "0"); + FullyConnectedBiasFusions() { + add_matcher(); + } +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp index e681cd48ce8087..8ea914240f00d5 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/move_fc_reshape_to_weights.cpp @@ -2,7 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "transformations/cpu_opset/common/op/fully_connected.hpp" +// #include "transformations/cpu_opset/common/op/fully_connected.hpp" +#include "ov_ops/fully_connected.hpp" #include "move_fc_reshape_to_weights.hpp" #include #include @@ -48,7 +49,8 @@ ov::intel_cpu::MoveFCReshapeToWeights::MoveFCReshapeToWeights() { auto weights_input_m = std::make_shared(ov::OutputVector{reshape_m, transpose_m}); auto data_m = any_input(); - auto fully_connected_m = wrap_type({data_m, weights_input_m}); + auto bias_m = any_input(); + auto fully_connected_m = wrap_type({data_m, weights_input_m, bias_m}); ov::matcher_pass_callback callback = [&](ov::pass::pattern::Matcher& m) { const auto fully_connected = m.get_match_root(); diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp index 20502f67d3645e..12f6395820eb86 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/convert_to_cpu_specific_opset.hpp @@ -6,6 +6,7 @@ #include "openvino/op/fake_quantize.hpp" #include "openvino/pass/manager.hpp" #include "common/pass/align_matmul_input_ranks.hpp" +#include "transformations/common_optimizations/nop_elimination.hpp" #include "transformations/common_optimizations/reshape_prelu.hpp" #include "common/pass/convert_broadcast_to_tiles.hpp" #include "common/pass/convert_tile_to_seq_tiles.hpp" @@ -14,8 +15,11 @@ #include "common/pass/convert_to_leaky_relu.hpp" #include "common/pass/convert_to_swish_cpu.hpp" 
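// ---- Editorial note: illustration only, not part of the patch ----
// The bias fusion above only accepts a bias whose element count equals the
// last output dim, then flattens it to 1D with a foldable Reshape(-1) so the
// FullyConnected bias input always arrives in a canonical shape:
//
//   bias [1, 1, 512] (512 elements, output [.., 512])  -> reshaped to [512]
//   bias [2, 512]    (1024 elements)                   -> rejected, back != size
//
// The Reshape is created with special_zero = true and is constant-folded
// where possible via ov::op::util::make_try_fold.
// -------------------------------------------------------------------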
#include "common/pass/move_fc_reshape_to_weights.hpp" +#include "common/pass/fc_bias_fusion.hpp" #include "common/pass/split_fc.hpp" #include "transformations/convert_precision.hpp" +#include "transformations/op_conversions/convert_fc_to_compressed.hpp" +#include "transformations/op_conversions/convert_fc_to_quantized.hpp" #include "transformations/utils/utils.hpp" #include "common/pass/rnn_sequences_optimization.hpp" #include "transformations/common_optimizations/reshape_sequence_fusion.hpp" @@ -31,7 +35,31 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr &model) { ov::pass::Manager manager("CPU:ConvertToCPUSpecificOpset"); manager.set_per_pass_validation(false); + + // CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); CPU_REGISTER_PASS_COMMON(manager, ConvertMatMulToFC); + if (std::getenv("EXTRA_DUMP")) { + manager.run_passes(model); + ov::pass::Serialize("after_fc.xml", "/dev/null").run_on_model(model); + CPU_DISABLE_PASS_COMMON(manager, ConvertMatMulToFC); + } + CPU_REGISTER_PASS_X64(manager, pass::ConvertFullyConnectedToFullyConnectedCompressed); + // CPU_SET_CALLBACK_COMMON(manager, + // [](const std::shared_ptr& node) -> bool { + // const auto& weights = node->input_value(1); + // const auto& weights_shape = weights.get_shape(); + // const auto OC = *(weights_shape.rbegin() + 1); + // return OC == 1; + // }, + // pass::ConvertFullyConnectedToFullyConnectedCompressed); + + CPU_REGISTER_PASS_X64(manager, pass::ConvertFullyConnectedToFullyConnectedQuantized); + if (std::getenv("EXTRA_DUMP")) { + manager.run_passes(model); + ov::pass::Serialize("after_fc_quantized.xml", "/dev/null").run_on_model(model); + CPU_DISABLE_PASS_COMMON(manager, ConvertMatMulToFC); + } + CPU_REGISTER_PASS_COMMON(manager, FullyConnectedBiasFusion); CPU_REGISTER_PASS_X64(manager, MoveFCReshapeToWeights); CPU_REGISTER_PASS_X64(manager, ov::pass::Validate); CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks); diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp index 69b3da9be00227..5aae9ad3ea8c59 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.cpp @@ -2,6 +2,7 @@ // Copyright (C) 2018-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include "openvino/core/type/element_type.hpp" #ifdef CPU_DEBUG_CAPS #include "cpu_memory.h" @@ -316,7 +317,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) { void * data = pmem->getData(); auto shape = pmem->getDesc().getShape().getDims(); - if (shape_size(shape) <= 8) { + if (shape_size(shape) <= 8 && pmem->getDesc().getPrecision() != ov::element::undefined) { auto type = pmem->getDesc().getPrecision(); auto tensor = ov::Tensor(type, shape, data); auto constop = std::make_shared(tensor); @@ -669,7 +670,7 @@ std::ostream& operator<<(std::ostream& os, const IMemory& mem) { } return os; } -// @todo remove + void print_dnnl_memory(const dnnl::memory& memory, const size_t size, const int id, const char* message) { const size_t s = memory.get_desc().get_size() / sizeof(float); std::cout << message << " " << id << " size: " << s << ", values: "; diff --git a/src/plugins/intel_cpu/src/utils/debug_capabilities.h b/src/plugins/intel_cpu/src/utils/debug_capabilities.h index cea96c6cfdbd72..4bee8361ecbc8f 100644 --- a/src/plugins/intel_cpu/src/utils/debug_capabilities.h +++ b/src/plugins/intel_cpu/src/utils/debug_capabilities.h @@ -3,6 +3,7 @@ // #pragma once +#include "cpu_types.h" #include 
"openvino/util/env_util.hpp" #ifdef CPU_DEBUG_CAPS @@ -94,6 +95,12 @@ class PrintableTimer { } }; +template +std::ostream & operator<<(std::ostream & os, const std::vector vec) { + for (const auto& element : vec) + os << element << "x"; + return os; +} std::ostream & operator<<(std::ostream & os, const PortConfig& desc); std::ostream & operator<<(std::ostream & os, const NodeConfig& desc); std::ostream & operator<<(std::ostream & os, const NodeDesc& desc); diff --git a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake index 057869a864d87b..6e52f1928a60db 100644 --- a/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake +++ b/src/plugins/intel_cpu/tests/functional/cmake/target_per_test.cmake @@ -96,7 +96,9 @@ endif() endfunction() if(ENABLE_CPU_SPECIFIC_TARGET_PER_TEST) - create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src ov_cpu_func_subgraph) + create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/x64 ov_cpu_func_subgraph_x64) + create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/subgraph_tests/src/common ov_cpu_func_subgraph_common) create_target_per_test_for_directory(${CMAKE_CURRENT_SOURCE_DIR}/custom/single_layer_tests ov_cpu_func_slt) endif() diff --git a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp index cb085920d97dc5..2fa3554f60e17a 100644 --- a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "ov_ops/fully_connected.hpp" #include #include #include @@ -19,6 +19,7 @@ #include #include "common_test_utils/ov_test_utils.hpp" +#include "ov_ops/placeholder.hpp" #include "transformations/rt_info/decompression.hpp" using namespace testing; @@ -38,11 +39,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest1) { auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 }); auto transpose1 = std::make_shared(input1, transpose_constant1); - auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 2 }, { 1 }); - auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 }); + auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2 }, { 1 }); + auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, {0, 2, 1 }); auto transpose2 = std::make_shared(input2, transpose_constant2); - auto matmul = std::make_shared(transpose1, transpose2, ov::Rank(3)); + auto matmul = std::make_shared(transpose1, + transpose2, + std::make_shared()); model_ref = std::make_shared(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 }); } @@ -78,7 +81,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest3) { { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1}); - auto matmul = std::make_shared(input1, input2, ov::Rank(3)); + auto matmul = std::make_shared(input1, input2, std::make_shared()); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); } @@ -96,7 +99,7 @@ 
@@ -96,7 +99,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest4) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 2});
         auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1});
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(3));
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1, input2, std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{input1});
     }
@@ -132,7 +135,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest7) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{3, 2, 2});
         auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1});
-        auto fc = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(2));
+        auto fc = std::make_shared<ov::op::internal::FullyConnected>(input1, input2, std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{input1});
     }
@@ -151,7 +154,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest8) {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 2});
         auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{3, 2}, {1});

-        auto fc = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(2));
+        auto fc = std::make_shared<ov::op::internal::FullyConnected>(input1, input2, std::make_shared<ov::op::internal::Placeholder>());

         auto a_shape = std::make_shared<ov::opset3::ShapeOf>(input1);
         auto I = ov::op::util::node_to_get_shape_value_of_indices_from_shape_node(a_shape, {0, 1});
@@ -174,7 +177,7 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest9) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{3, 2, 2});
         auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 2}, {1});
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(3));
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1, input2, std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{input1});
     }
@@ -218,8 +221,8 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest13) {
     }
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 1});
-        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{80, 1}, {1});
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(3));
+        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 80, 1}, {1});
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1, input2, std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{input1});
     }
@@ -242,8 +245,12 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest14) {
     }
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::u8, ov::PartialShape{-1, -1, 1});
-        auto input2 = ov::opset1::Constant::create(ov::element::i8, ov::Shape{80, 1}, {1});
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(3), ov::element::f32);
+        auto input2 = ov::opset1::Constant::create(ov::element::i8, ov::Shape{1, 80, 1}, {1});
+
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                         input2,
+                                                                         std::make_shared<ov::op::internal::Placeholder>(),
+                                                                         ov::element::f32);

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{input1});
     }
@@ -261,7 +268,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_1) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{2, 3, 4, 5});
         auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 6, 5 }, { 1 });
-        auto fc = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(4), ov::element::f32);
+
+        auto fc = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                     input2,
+                                                                     std::make_shared<ov::op::internal::Placeholder>(),
+                                                                     ov::element::f32);

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{input1});
     }
@@ -278,8 +289,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_2) {
     }
     {
        auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 1, 5});
-        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{10, 5}, {1});
-        auto fc = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(4));
+        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 10, 5}, {1});
+        auto fc = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                     input2,
+                                                                     std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{input1});
     }
@@ -296,8 +309,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_3) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{2, 4});
-        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 });
-        auto fc = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(4), ov::element::f32);
+        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 });
+        auto fc = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                     input2,
+                                                                     std::make_shared<ov::op::internal::Placeholder>(),
+                                                                     ov::element::f32);

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{input1});
     }
@@ -314,8 +330,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_4) {
     }
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{3, 2, 4});
-        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 });
-        auto fc = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(4), ov::element::f32);
+        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 });
+        auto fc = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                     input2,
+                                                                     std::make_shared<ov::op::internal::Placeholder>(),
+                                                                     ov::element::f32);

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{input1});
     }
@@ -332,8 +351,11 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_4d_5) {
     }
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{2, 3, 2, 4});
-        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{5, 4}, { 1 });
-        auto fc = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(4), ov::element::f32);
+        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 5, 4}, { 1 });
+        auto fc = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                     input2,
+                                                                     std::make_shared<ov::op::internal::Placeholder>(),
+                                                                     ov::element::f32);

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{input1});
     }
@@ -350,8 +372,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_1) {
     }
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{5, 2, 3});
-        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{2, 3}, {1});
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, input2, ov::Rank(2));
+        auto input2 = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 3}, {1});
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                         input2,
+                                                                         std::make_shared<ov::op::internal::Placeholder>());
         model_ref = std::make_shared<ov::Model>(ov::NodeVector{matmul}, ov::ParameterVector{input1});
     }
 }
@@ -368,7 +392,9 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_2) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{ 2, 3 });
         auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 });
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, weights, ov::Rank(2));
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                         weights,
+                                                                         std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 });
     }
@@ -386,8 +412,10 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_second_input_rank_adj_3) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{ 5, 2, 3 });
-        auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 2, 3 }, { 1 });
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, weights, ov::Rank(3));
+        auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{ 1, 2, 3 }, { 1 });
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                         weights,
+                                                                         std::make_shared<ov::op::internal::Placeholder>());
         model_ref = std::make_shared<ov::Model>(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 });
     }
 }
@@ -406,12 +434,14 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_0) {
     {
         auto input1 = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{ 3, 2, 2 });
-        auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 2, 2 }, { 1 });
-        auto transpose_constant = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
+        auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 1, 2, 2 }, { 1 });
+        auto transpose_constant = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 });
         auto transpose = std::make_shared<ov::opset1::Transpose>(input2, transpose_constant);
         auto convert = std::make_shared<ov::opset1::Convert>(transpose, ov::element::f32);

-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(input1, convert, ov::Rank(3));
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(input1,
+                                                                         convert,
+                                                                         std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 });
     }
@@ -433,12 +463,14 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_1) {
         auto transpose_constant1 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 });
         auto transpose1 = std::make_shared<ov::opset1::Transpose>(input1, transpose_constant1);

-        auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 2, 2 }, { 1 });
-        auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 2 }, { 1, 0 });
+        auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{ 1, 2, 2 }, { 1 });
+        auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{ 3 }, { 0, 2, 1 });
         auto transpose2 = std::make_shared<ov::opset1::Transpose>(input2, transpose_constant2);
         auto convert = std::make_shared<ov::opset1::Convert>(transpose2, ov::element::f32);

-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(transpose1, convert, ov::Rank(3));
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(transpose1,
+                                                                         convert,
+                                                                         std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{ matmul }, ov::ParameterVector{ input1 });
     }
@@ -467,11 +499,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_compressed_u8_weights) {
         auto mul_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 1, 2}, {1});
         auto mul = std::make_shared<ov::opset1::Multiply>(sub, mul_const);

-        auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {2}, {2, -1});
-        auto reshape = std::make_shared<ov::opset1::Reshape>(mul, reshape_const, false);
-        auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {2}, {1, 0});
-        auto transpose = std::make_shared<ov::opset1::Transpose>(reshape, transpose_const);
-        auto matmul = std::make_shared<ov::intel_cpu::FullyConnectedNode>(data, transpose, ov::Rank(3));
+        // auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {2}, {2, -1});
+        // auto reshape = std::make_shared<ov::opset1::Reshape>(mul, reshape_const, false);
+        auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {3}, {0, 2, 1});
+        auto transpose = std::make_shared<ov::opset1::Transpose>(mul, transpose_const);
+        auto matmul = std::make_shared<ov::op::internal::FullyConnected>(data,
+                                                                         transpose,
+                                                                         std::make_shared<ov::op::internal::Placeholder>());

         model_ref = std::make_shared<ov::Model>(ov::NodeVector{ matmul }, ov::ParameterVector{ data });
     }
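All of the convert_matmul_test.cpp updates above follow one pattern: the CPU-local FullyConnectedNode, which carried the desired output rank as an ov::Rank argument, is replaced by the common ov::op::internal::FullyConnected, which instead takes an explicit bias input (an ov::op::internal::Placeholder when there is no bias), and reference weight constants keep the rank the matcher now produces, for example {1, N, K} instead of {N, K}. A minimal sketch of building such a reference model, assuming the headers introduced earlier in this patch; the helper name is ours and purely illustrative:

#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/opsets/opset1.hpp"
#include "ov_ops/fully_connected.hpp"
#include "ov_ops/placeholder.hpp"

std::shared_ptr<ov::Model> make_fc_reference() {
    // Activations with a dynamic batch; the innermost dim is the reduction axis K = 2.
    auto data = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 2});
    // Weights stay rank-3 ({1, N, K}) rather than being squeezed to {N, K}.
    auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2}, {1});
    // No bias: a Placeholder marks the unused third input instead of an ov::Rank hint.
    auto fc = std::make_shared<ov::op::internal::FullyConnected>(
        data,
        weights,
        std::make_shared<ov::op::internal::Placeholder>());
    return std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{data});
}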
diff --git a/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp b/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp
index 68241c9169bce7..6b8268729d3457 100644
--- a/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp
+++ b/src/plugins/intel_cpu/tests/unit/transformations/move_fc_reshape_to_weights.cpp
@@ -11,11 +11,12 @@
 #include
 #include
-#include
+#include "ov_ops/fully_connected.hpp"
 #include
 #include

 #include "common_test_utils/ov_test_utils.hpp"
+#include "ov_ops/placeholder.hpp"

 using namespace testing;
 using namespace ov::intel_cpu;
@@ -115,7 +116,8 @@ class MoveFCReshapeToWeightsTests : public TransformationTestsF, public WithPara
             auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {2}, {1, 0});
             weights_path = std::make_shared<ov::opset1::Transpose>(weights_path, transpose_const);
         }
-        auto fully_connected = std::make_shared<ov::intel_cpu::FullyConnectedNode>(data, weights_path, ov::Rank(3));
+
+        auto fully_connected = std::make_shared<ov::op::internal::FullyConnected>(data, weights_path, std::make_shared<ov::op::internal::Placeholder>());
         return std::make_shared<ov::Model>(ov::NodeVector{fully_connected}, ov::ParameterVector{data});
     }
diff --git a/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp b/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp
index f5453b3c536480..fd400d84bfec17 100644
--- a/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp
+++ b/src/tests/test_utils/common_test_utils/src/ov_test_utils.cpp
@@ -72,6 +72,7 @@ void TransformationTestsF::TearDown() {
     manager.register_pass<ov::pass::CheckUniqueNames>(m_unh, m_soft_names_comparison, m_result_friendly_names_check);
     manager.run_passes(model);

+    // TODO: clarify why the rt_info check for fused names is still needed here
     if (!m_disable_rt_info_check) {
         OV_ASSERT_NO_THROW(check_rt_info(model));
     }
@@ -85,6 +86,7 @@ void TransformationTestsF::TearDown() {
         ASSERT_TRUE(res.valid) << res.message;
         comparator.disable(FunctionsComparator::CmpValues::ACCURACY);
     }
+
     auto res = comparator.compare(model, model_ref);
     ASSERT_TRUE(res.valid) << res.message;
 }
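Since the TearDown() epilogue above does the comparison work for every TransformationTestsF-based test, it helps to keep the fixture's contract in mind: the first scope fills in `model` and registers the pass under test on `manager`, the second scope fills in `model_ref`, and TearDown() then runs the passes, optionally checks rt_info, and asserts the comparator result. A hypothetical example, assuming the same includes as the tests above plus openvino/pass/constant_folding.hpp; the pass is a stand-in, not what these files actually test:

TEST_F(TransformationTestsF, TearDownFlowExample) {
    {
        auto data = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{3, 2, 2});
        auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2}, {1});
        auto fc = std::make_shared<ov::op::internal::FullyConnected>(
            data, weights, std::make_shared<ov::op::internal::Placeholder>());
        model = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{data});
        manager.register_pass<ov::pass::ConstantFolding>();  // stand-in for the pass under test
    }
    {
        // Expected graph; identical here because ConstantFolding leaves this one untouched.
        auto data = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{3, 2, 2});
        auto weights = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1, 2, 2}, {1});
        auto fc = std::make_shared<ov::op::internal::FullyConnected>(
            data, weights, std::make_shared<ov::op::internal::Placeholder>());
        model_ref = std::make_shared<ov::Model>(ov::NodeVector{fc}, ov::ParameterVector{data});
    }
    // TearDown() (patched above) then runs `manager` on `model`, optionally checks
    // rt_info, and asserts comparator.compare(model, model_ref).valid.
}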