lstm_weights optimization is part of post_optimize_weights
michal-miotk committed Oct 20, 2024
1 parent b539a3f commit dc8ac73
Showing 7 changed files with 79 additions and 182 deletions.
31 changes: 15 additions & 16 deletions src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
@@ -35,22 +35,21 @@ class ICompilationContext;
struct program {
using ptr = std::shared_ptr<program>;
using cptr = std::shared_ptr<const program>;
-    friend class calculate_prior_boxes;                 // to be removed when possible
-    friend class graph_initializations;                 // to be removed when possible
-    friend class prepare_padding;                       // to be removed when possible
-    friend class propagate_constants;                   // to be removed when possible
-    friend class pre_replace_deconv;                    // to be removed when possible
-    friend class prepare_primitive_fusing;              // to be removed when possible
-    friend class prepare_quantization;                  // to be removed when possible
-    friend class reorder_inputs;                        // to be removed when possible
-    friend class remove_redundant_reorders;             // to be removed when possible
-    friend class post_optimize_weights;                 // to be removed when possible
-    friend class post_optimize_lstm_weights_and_output; // to be removed when possible
-    friend class prepare_primitive_fusing_through;      // to be removed when possible
-    friend class reorder_transfer;                      // to be removed when possible
-    friend class fuse_constant_transposes;              // to be removed when possible
-    friend class program_wrapper;                       // this class is intended to extend the interface of program for
-                                                        // the usage within tests_core_internal project only
+    friend class calculate_prior_boxes;            // to be removed when possible
+    friend class graph_initializations;            // to be removed when possible
+    friend class prepare_padding;                  // to be removed when possible
+    friend class propagate_constants;              // to be removed when possible
+    friend class pre_replace_deconv;               // to be removed when possible
+    friend class prepare_primitive_fusing;         // to be removed when possible
+    friend class prepare_quantization;             // to be removed when possible
+    friend class reorder_inputs;                   // to be removed when possible
+    friend class remove_redundant_reorders;        // to be removed when possible
+    friend class post_optimize_weights;            // to be removed when possible
+    friend class prepare_primitive_fusing_through; // to be removed when possible
+    friend class reorder_transfer;                 // to be removed when possible
+    friend class fuse_constant_transposes;         // to be removed when possible
+    friend class program_wrapper;                  // this class is intended to extend the interface of program for
+                                                   // the usage within tests_core_internal project only
public:
struct nodes_ordering {
public:
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp
@@ -203,8 +203,10 @@ struct lstm_seq : public primitive_base<lstm_seq> {
using vec_activation = std::vector<activation_func>;
using vec_activation_param = std::vector<activation_additional_params>;
lstm_seq(const RNNParams& p): primitive_base(p.id, p.get_inputs(), p.num_outputs, \
-        {optional_data_type()}, {p.output_padding}), params(p) {}
+        {optional_data_type()}, {p.output_padding}), params(p), weights(params.W), bias(params.B) {}
RNNParams params;
+    input_info weights;
+    input_info bias;
size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, params.hash());
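Note: the change above caches the W and B inputs as dedicated `weights` and `bias` members, so weight-optimization passes can address them the same way they address `weights`/`bias` on other primitives instead of reaching through `params`. A minimal sketch of the pattern, using simplified stand-in types (the real `input_info`/`RNNParams` live in the cldnn headers and carry more fields):

    #include <string>

    struct input_info { std::string pid; };     // stand-in for cldnn::input_info
    struct RNNParams  { input_info W, R, B; };  // stand-in; the real struct has more fields

    // Mirrors the diff: the ctor copies params.W / params.B into named members.
    struct lstm_seq_sketch {
        RNNParams params;
        input_info weights;  // = params.W
        input_info bias;     // = params.B
        explicit lstm_seq_sketch(const RNNParams& p)
            : params(p), weights(p.W), bias(p.B) {}
    };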

This file was deleted.

src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -9,7 +9,9 @@
#include "convolution_inst.h"
#include "deconvolution_inst.h"
#include "fully_connected_inst.h"
#include "lstm_seq_inst.h"
#include "intel_gpu/runtime/format.hpp"
#include "intel_gpu/primitives/mutable_data.hpp"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "graph/impls/onednn/utils.hpp"
#endif // ENABLE_ONEDNN_FOR_GPU
@@ -19,6 +21,9 @@ post_optimize_weights::post_optimize_weights(reorder_factory& rf_ref)
: base_pass("post_optimize_weights"), _rf(rf_ref) {}

template<typename T> post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) {
+    if (node.type() == lstm_seq::type_id()) {
+        return weights_bias_offset(3, 3);
+    }
return weights_bias_offset(node.get_primitive()->input.size(), program_helpers::wrap_if_single(node.get_primitive()->weights).size());
}
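Note: `weights_bias_offset(3, 3)` encodes, per the ctor kept in pass_manager.h, `weights_offset = 3` and `bias_offset = 3 + 3 = 6`. Assuming the pass walks dependencies in `[weights_offset, bias_offset)` (the loop itself is outside this hunk), that selects indices 3, 4, 5, which the `optimize_weights` change below treats as W, R (`i != 5`) and B (`i == 5`). A runnable sketch of the arithmetic:

    #include <cstddef>
    #include <iostream>

    // Same shape as the ctor in pass_manager.h: bias_offset = weights_offset + b_offset.
    struct weights_bias_offset {
        std::size_t weights_offset;
        std::size_t bias_offset;
        weights_bias_offset(std::size_t w_offset, std::size_t b_offset)
            : weights_offset(w_offset), bias_offset(weights_offset + b_offset) {}
    };

    int main() {
        weights_bias_offset off(3, 3);  // the lstm_seq case from this hunk
        for (std::size_t i = off.weights_offset; i < off.bias_offset; ++i)
            std::cout << "optimize dependency " << i << '\n';  // prints 3, 4, 5
    }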

@@ -109,15 +114,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
set_implementation(weights_reorder_node);
}
} else {
-        auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params);
-        // insert new weights reorder node to topology
-        p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second);
-        // set weights reorder's node output layout and implementation
-        auto& weights_reorder_node = node.get_dependency(i);
-        weights_reorder_node.get_output_layout(false);
-
-        if (!weights_reorder.second) {
-            set_implementation(weights_reorder_node);
+        if (node.type() == lstm_seq::type_id()) {
+            program_node& prev_node = node.get_dependency(i);
+            if (i != 5) {
+                _rf.get_weights_split(prev_node.id(), weights_reorder_params, p, prev_node, node, i);
+            } else {
+                _rf.get_bias_split(prev_node.id(), weights_reorder_params, p, prev_node, node);
+            }
+            auto& weights_reorder_node = node.get_dependency(i);
+            weights_reorder_node.get_output_layout(false);
+        } else {
+            auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params);
+            // insert new weights reorder node to topology
+            p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second);
+            // set weights reorder's node output layout and implementation
+            auto& weights_reorder_node = node.get_dependency(i);
+            weights_reorder_node.get_output_layout(false);
+
+            if (!weights_reorder.second) {
+                set_implementation(weights_reorder_node);
+            }
+        }
}
}
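Note: the new branch routes each lstm_seq weight dependency to a dedicated splitter instead of the generic weights reorder: indices other than 5 (W at 3, R at 4 under the offset scheme above) go through `get_weights_split`, and index 5 (B) through `get_bias_split`; the non-LSTM path is untouched. A compressed, stand-in view of that dispatch (not the cldnn API):

    #include <cstddef>

    // Stand-in sketch of the dispatch added above; `i` is the dependency index.
    enum class lstm_dep_action { split_weights, split_bias, generic_reorder };

    lstm_dep_action classify(bool is_lstm_seq, std::size_t i) {
        if (!is_lstm_seq)
            return lstm_dep_action::generic_reorder;  // non-LSTM path is unchanged
        return i != 5 ? lstm_dep_action::split_weights  // W (i == 3) and R (i == 4)
                      : lstm_dep_action::split_bias;    // B (i == 5)
    }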
@@ -134,8 +150,33 @@ void post_optimize_weights::run(program& p) {
optimize_weights(node->as<deconvolution>(), p);
} else if (node->is_type<fully_connected>()) {
optimize_weights(node->as<fully_connected>(), p);
+        } else if (node->is_type<lstm_seq>()) {
+            optimize_weights(node->as<lstm_seq>(), p);
}
}
+    p.get_processing_order().calc_processing_order(p);
+    int i = 0;
+    for (auto node : p.get_processing_order()) {
+        if (node->is_type<cldnn::mutable_data>()) {
+            continue;
+        }
+        for (auto prev_node : node->get_dependencies()) {
+            if (prev_node.first->is_type<lstm_seq>()) {
+                auto impl = prev_node.first->get_selected_impl();
+                if (!impl)
+                    continue;
+                auto weights_reorder_params = impl->get_weights_reorder_params();
+                if (weights_reorder_params == nullptr) {
+                    continue;
+                }
+                prev_node.first->recalc_output_layouts(false);
+                _rf.get_out_reorder(p, prev_node.first, node, i);
+                node->recalc_output_layouts(false);
+                i++;
+            }
+        }
+    }
+    p.get_processing_order().calc_processing_order(p);
}

} // namespace cldnn
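Note: the second loop added to `run()` rewires outputs rather than inputs: after the weights pass and a processing-order refresh, every consumer of an lstm_seq node whose selected implementation carries weight-reorder params gets a reorder spliced onto the producing edge via `get_out_reorder`, with layouts recalculated on both ends. A minimal sketch of that edge-splicing idea on a toy graph (plain structs standing in for cldnn's program/program_node; `splice_out_reorder` is a hypothetical helper mirroring the effect of `reorder_factory::get_out_reorder`):

    #include <memory>
    #include <string>
    #include <vector>

    struct Node {
        std::string id;
        bool is_lstm_seq = false;
        std::vector<Node*> deps;  // producers this node reads from
    };

    // Insert a reorder node on the producer -> user edge.
    Node* splice_out_reorder(std::vector<std::unique_ptr<Node>>& graph,
                             Node* producer, Node* user, int index) {
        auto reorder = std::make_unique<Node>();
        reorder->id = producer->id + "_out_reorder_" + std::to_string(index);
        reorder->deps = {producer};
        for (auto& d : user->deps)
            if (d == producer) d = reorder.get();  // user now reads the reorder
        graph.push_back(std::move(reorder));
        return graph.back().get();
    }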
24 changes: 0 additions & 24 deletions src/plugins/intel_gpu/src/graph/include/pass_manager.h
@@ -199,30 +199,6 @@ class post_optimize_weights : public base_pass {
reorder_factory& _rf;
};

-class post_optimize_lstm_weights_and_output : public base_pass {
-public:
-    explicit post_optimize_lstm_weights_and_output(reorder_factory& rf_ref);
-
-private:
-    struct weights_bias_offset {
-        size_t weights_offset;
-        size_t bias_offset;
-
-        // When using this ctor weights offset is added to the bias_offset
-        weights_bias_offset(const size_t w_offset, const size_t b_offset)
-            : weights_offset(w_offset)
-            , bias_offset(weights_offset + b_offset)
-        {}
-    };
-
-    void run(program& p) override;
-    template<typename T>
-    weights_bias_offset get_weights_bias_offset(const T& node);
-    template<typename T>
-    void optimize_lstm_weights(T& node, program& p);
-    reorder_factory& _rf;
-};
-
class propagate_constants : public base_pass {
public:
propagate_constants() : base_pass("propagate_constants") {}
33 changes: 11 additions & 22 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -199,19 +199,16 @@ void reorder_factory::get_weights_split(primitive_id input_id,

std::string permute_id = input_id + "_perx";
std::vector<uint16_t> ord{2, 4, 3, 0, 1};
-    //std::vector<uint16_t> ord{0, 1, 2, 3, 4};
auto permute = std::make_shared<cldnn::permute>(permute_id, input_info{concat_id}, ord);
auto& permute_node = p.get_or_create(permute);
p.add_intermediate(permute_node, node, con_node, true);
permute_node.get_output_layout(false);
select_implementation(p, con_node);
select_implementation(p, reorder_node);
-
select_implementation(p, crop0_node);
select_implementation(p, crop1_node);
select_implementation(p, crop2_node);
select_implementation(p, crop3_node);
-
select_implementation(p, permute_node);
p.mark_if_constant(crop0_node);
p.mark_if_constant(crop1_node);
@@ -235,20 +232,17 @@ void reorder_factory::get_bias_split(primitive_id input_id,
cache_key ckey{ input_id, reorder_params->get_output_layout(), false };
auto hiddenSize = reorder_params->get_input_layout().get_shape()[1] / 4;
auto cropSizeR = cldnn::tensor{1, static_cast<int>(hiddenSize), 1, 1};
std::string crop_id = input_id + "_crop";
auto crop0_id = primitive_id(crop_id + "0");
auto crop1_id = primitive_id(crop_id + "1");
auto crop2_id = primitive_id(crop_id + "2");
auto crop3_id = primitive_id(crop_id + "3");
auto crop0 = std::make_shared<cldnn::crop>(crop0_id, input_id, cropSizeR, cldnn::tensor{0, 0, 0, 0});
auto crop1 = std::make_shared<cldnn::crop>(crop1_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(1*hiddenSize), 0, 0});
auto crop2 = std::make_shared<cldnn::crop>(crop2_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(2*hiddenSize), 0, 0});
auto crop3 = std::make_shared<cldnn::crop>(crop3_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(3*hiddenSize), 0, 0});
auto& crop0_node = p.get_or_create(crop0);
auto& crop1_node = p.get_or_create(crop1);
auto& crop2_node = p.get_or_create(crop2);
auto& crop3_node = p.get_or_create(crop3);
std::vector<input_info> con_input{input_info(crop1_id), input_info(crop0_id), input_info(crop2_id), input_info(crop3_id)};
std::string crop_id_b = input_id + "_c";
auto get_crop_node = [&](int cropNum) -> cldnn::program_node& {
auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum));
auto crop_prim = std::make_shared<cldnn::crop>(crop_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(cropNum*hiddenSize), 0, 0});
return p.get_or_create(crop_prim);
};
auto& crop0_node = get_crop_node(0);
auto& crop1_node = get_crop_node(1);
auto& crop2_node = get_crop_node(2);
auto& crop3_node = get_crop_node(3);
std::vector<input_info> con_input{input_info(crop1_node.id()), input_info(crop0_node.id()), input_info(crop2_node.id()), input_info(crop3_node.id())};
cldnn::primitive_id concat_id{input_id + "concat"};
auto con = std::make_shared<cldnn::concatenation>(concat_id, con_input, 2);
auto& con_node = p.get_or_create(con);
@@ -273,16 +267,11 @@ void reorder_factory::get_bias_split(primitive_id input_id,
p.add_intermediate(permute_node, node, con_node, true);
permute_node.get_output_layout(false);
select_implementation(p, crop0_node);
-
select_implementation(p, crop1_node);
-
select_implementation(p, crop2_node);
select_implementation(p, crop3_node);
-
select_implementation(p, permute_node);
-
select_implementation(p, con_node);
-
p.mark_if_constant(crop0_node);
p.mark_if_constant(crop1_node);
p.mark_if_constant(crop2_node);
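Note: both split helpers implement the same gate shuffle: the 4*hidden_size axis of the weight/bias tensor is cut into four hidden_size-wide gate blocks, re-concatenated in the order {1, 0, 2, 3} (swapping the first two gates), and then permuted. The swap is consistent with translating between gate layouts that differ only in their first two gates (e.g., f,i,c,o vs i,f,c,o); which convention the target kernel expects is not shown in this diff, so treat those labels as an assumption. A runnable sketch of the crop/concat arithmetic on a flat bias:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Mirrors get_bias_split's arithmetic: split a [4*H] vector into four
    // H-wide gate blocks and re-concatenate them as {1, 0, 2, 3}.
    std::vector<float> reorder_gates(const std::vector<float>& bias, std::size_t H) {
        const std::size_t order[4] = {1, 0, 2, 3};  // concat input order from the diff
        std::vector<float> out;
        out.reserve(4 * H);
        for (std::size_t g : order)
            out.insert(out.end(), bias.begin() + g * H, bias.begin() + (g + 1) * H);
        return out;
    }

    int main() {
        const std::size_t H = 2;
        std::vector<float> bias{0, 0, 1, 1, 2, 2, 3, 3};  // gates 0..3, H values each
        for (float v : reorder_gates(bias, H)) std::cout << v << ' ';  // 1 1 0 0 2 2 3 3
        std::cout << '\n';
    }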
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/program.cpp
@@ -620,7 +620,6 @@ void program::post_optimize_graph(bool is_internal) {

if (!is_internal) {
apply_opt_pass<post_optimize_weights>(rf);
-        apply_opt_pass<post_optimize_lstm_weights_and_output>(rf);
}

apply_opt_pass<remove_redundant_reorders>(false, true); // TODO: do we need it at this place also?
