lstm_weights optimization is part of post_optimize_weights
michal-miotk committed Oct 20, 2024
1 parent b539a3f commit dc8ac73
Showing 7 changed files with 79 additions and 182 deletions.
31 changes: 15 additions & 16 deletions src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp
@@ -35,22 +35,21 @@ class ICompilationContext;
struct program {
using ptr = std::shared_ptr<program>;
using cptr = std::shared_ptr<const program>;
-    friend class calculate_prior_boxes;                 // to be removed when possible
-    friend class graph_initializations;                 // to be removed when possible
-    friend class prepare_padding;                       // to be removed when possible
-    friend class propagate_constants;                   // to be removed when possible
-    friend class pre_replace_deconv;                    // to be removed when possible
-    friend class prepare_primitive_fusing;              // to be removed when possible
-    friend class prepare_quantization;                  // to be removed when possible
-    friend class reorder_inputs;                        // to be removed when possible
-    friend class remove_redundant_reorders;             // to be removed when possible
-    friend class post_optimize_weights;                 // to be removed when possible
-    friend class post_optimize_lstm_weights_and_output; // to be removed when possible
-    friend class prepare_primitive_fusing_through;      // to be removed when possible
-    friend class reorder_transfer;                      // to be removed when possible
-    friend class fuse_constant_transposes;              // to be removed when possible
-    friend class program_wrapper;                       // this class is intended to extend the interface of program for
-                                                        // the usage within tests_core_internal project only
+    friend class calculate_prior_boxes;            // to be removed when possible
+    friend class graph_initializations;            // to be removed when possible
+    friend class prepare_padding;                  // to be removed when possible
+    friend class propagate_constants;              // to be removed when possible
+    friend class pre_replace_deconv;               // to be removed when possible
+    friend class prepare_primitive_fusing;         // to be removed when possible
+    friend class prepare_quantization;             // to be removed when possible
+    friend class reorder_inputs;                   // to be removed when possible
+    friend class remove_redundant_reorders;        // to be removed when possible
+    friend class post_optimize_weights;            // to be removed when possible
+    friend class prepare_primitive_fusing_through; // to be removed when possible
+    friend class reorder_transfer;                 // to be removed when possible
+    friend class fuse_constant_transposes;         // to be removed when possible
+    friend class program_wrapper;                  // this class is intended to extend the interface of program for
+                                                   // the usage within tests_core_internal project only
public:
struct nodes_ordering {
public:
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp
@@ -203,8 +203,10 @@ struct lstm_seq : public primitive_base<lstm_seq> {
using vec_activation = std::vector<activation_func>;
using vec_activation_param = std::vector<activation_additional_params>;
lstm_seq(const RNNParams& p): primitive_base(p.id, p.get_inputs(), p.num_outputs, \
-        {optional_data_type()}, {p.output_padding}), params(p) {}
+        {optional_data_type()}, {p.output_padding}), params(p), weights(params.W), bias(params.B) {}
RNNParams params;
+    input_info weights;
+    input_info bias;
size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, params.hash());
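Note: the change above caches the W and B inputs as dedicated `weights` and `bias` members, so weight-optimization passes can address them the same way they address `weights`/`bias` on other primitives instead of reaching through `params`. A minimal sketch of the pattern, using simplified stand-in types (the real `input_info`/`RNNParams` live in the cldnn headers and carry more fields):

    #include <string>

    struct input_info { std::string pid; };     // stand-in for cldnn::input_info
    struct RNNParams  { input_info W, R, B; };  // stand-in; the real struct has more fields

    // Mirrors the diff: the ctor copies params.W / params.B into named members.
    struct lstm_seq_sketch {
        RNNParams params;
        input_info weights;  // = params.W
        input_info bias;     // = params.B
        explicit lstm_seq_sketch(const RNNParams& p)
            : params(p), weights(p.W), bias(p.B) {}
    };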

This file was deleted.

src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -9,7 +9,9 @@
#include "convolution_inst.h"
#include "deconvolution_inst.h"
#include "fully_connected_inst.h"
#include "lstm_seq_inst.h"
#include "intel_gpu/runtime/format.hpp"
#include "intel_gpu/primitives/mutable_data.hpp"
#ifdef ENABLE_ONEDNN_FOR_GPU
#include "graph/impls/onednn/utils.hpp"
#endif // ENABLE_ONEDNN_FOR_GPU
@@ -19,6 +21,9 @@ post_optimize_weights::post_optimize_weights(reorder_factory& rf_ref)
: base_pass("post_optimize_weights"), _rf(rf_ref) {}

template<typename T> post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) {
+    if (node.type() == lstm_seq::type_id()) {
+        return weights_bias_offset(3, 3);
+    }
return weights_bias_offset(node.get_primitive()->input.size(), program_helpers::wrap_if_single(node.get_primitive()->weights).size());
}
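Note: `weights_bias_offset(3, 3)` encodes, per the ctor kept in pass_manager.h, `weights_offset = 3` and `bias_offset = 3 + 3 = 6`. Assuming the pass walks dependencies in `[weights_offset, bias_offset)` (the loop itself is outside this hunk), that selects indices 3, 4, 5, which the `optimize_weights` change below treats as W, R (`i != 5`) and B (`i == 5`). A runnable sketch of the arithmetic:

    #include <cstddef>
    #include <iostream>

    // Same shape as the ctor in pass_manager.h: bias_offset = weights_offset + b_offset.
    struct weights_bias_offset {
        std::size_t weights_offset;
        std::size_t bias_offset;
        weights_bias_offset(std::size_t w_offset, std::size_t b_offset)
            : weights_offset(w_offset), bias_offset(weights_offset + b_offset) {}
    };

    int main() {
        weights_bias_offset off(3, 3);  // the lstm_seq case from this hunk
        for (std::size_t i = off.weights_offset; i < off.bias_offset; ++i)
            std::cout << "optimize dependency " << i << '\n';  // prints 3, 4, 5
    }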

@@ -109,15 +114,26 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
set_implementation(weights_reorder_node);
}
} else {
-        auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params);
-        // insert new weights reorder node to topology
-        p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second);
-        // set weights reorder's node output layout and implementation
-        auto& weights_reorder_node = node.get_dependency(i);
-        weights_reorder_node.get_output_layout(false);
-
-        if (!weights_reorder.second) {
-            set_implementation(weights_reorder_node);
+        if (node.type() == lstm_seq::type_id()) {
+            program_node& prev_node = node.get_dependency(i);
+            if (i != 5) {
+                _rf.get_weights_split(prev_node.id(), weights_reorder_params, p, prev_node, node, i);
+            } else {
+                _rf.get_bias_split(prev_node.id(), weights_reorder_params, p, prev_node, node);
+            }
+            auto& weights_reorder_node = node.get_dependency(i);
+            weights_reorder_node.get_output_layout(false);
+        } else {
+            auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params);
+            // insert new weights reorder node to topology
+            p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second);
+            // set weights reorder's node output layout and implementation
+            auto& weights_reorder_node = node.get_dependency(i);
+            weights_reorder_node.get_output_layout(false);
+
+            if (!weights_reorder.second) {
+                set_implementation(weights_reorder_node);
+            }
+        }
}
}
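Note: the new branch routes each lstm_seq weight dependency to a dedicated splitter instead of the generic weights reorder: indices other than 5 (W at 3, R at 4 under the offset scheme above) go through `get_weights_split`, and index 5 (B) through `get_bias_split`; the non-LSTM path is untouched. A compressed, stand-in view of that dispatch (not the cldnn API):

    #include <cstddef>

    // Stand-in sketch of the dispatch added above; `i` is the dependency index.
    enum class lstm_dep_action { split_weights, split_bias, generic_reorder };

    lstm_dep_action classify(bool is_lstm_seq, std::size_t i) {
        if (!is_lstm_seq)
            return lstm_dep_action::generic_reorder;  // non-LSTM path is unchanged
        return i != 5 ? lstm_dep_action::split_weights  // W (i == 3) and R (i == 4)
                      : lstm_dep_action::split_bias;    // B (i == 5)
    }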
@@ -134,8 +150,33 @@ void post_optimize_weights::run(program& p) {
optimize_weights(node->as<deconvolution>(), p);
} else if (node->is_type<fully_connected>()) {
optimize_weights(node->as<fully_connected>(), p);
+        } else if (node->is_type<lstm_seq>()) {
+            optimize_weights(node->as<lstm_seq>(), p);
}
}
+    p.get_processing_order().calc_processing_order(p);
+    int i = 0;
+    for (auto node : p.get_processing_order()) {
+        if (node->is_type<cldnn::mutable_data>()) {
+            continue;
+        }
+        for (auto prev_node : node->get_dependencies()) {
+            if (prev_node.first->is_type<lstm_seq>()) {
+                auto impl = prev_node.first->get_selected_impl();
+                if (!impl)
+                    continue;
+                auto weights_reorder_params = impl->get_weights_reorder_params();
+                if (weights_reorder_params == nullptr) {
+                    continue;
+                }
+                prev_node.first->recalc_output_layouts(false);
+                _rf.get_out_reorder(p, prev_node.first, node, i);
+                node->recalc_output_layouts(false);
+                i++;
+            }
+        }
+    }
+    p.get_processing_order().calc_processing_order(p);
}

} // namespace cldnn
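Note: the second loop added to `run()` rewires outputs rather than inputs: after the weights pass and a processing-order refresh, every consumer of an lstm_seq node whose selected implementation carries weight-reorder params gets a reorder spliced onto the producing edge via `get_out_reorder`, with layouts recalculated on both ends. A minimal sketch of that edge-splicing idea on a toy graph (plain structs standing in for cldnn's program/program_node; `splice_out_reorder` is a hypothetical helper mirroring the effect of `reorder_factory::get_out_reorder`):

    #include <memory>
    #include <string>
    #include <vector>

    struct Node {
        std::string id;
        bool is_lstm_seq = false;
        std::vector<Node*> deps;  // producers this node reads from
    };

    // Insert a reorder node on the producer -> user edge.
    Node* splice_out_reorder(std::vector<std::unique_ptr<Node>>& graph,
                             Node* producer, Node* user, int index) {
        auto reorder = std::make_unique<Node>();
        reorder->id = producer->id + "_out_reorder_" + std::to_string(index);
        reorder->deps = {producer};
        for (auto& d : user->deps)
            if (d == producer) d = reorder.get();  // user now reads the reorder
        graph.push_back(std::move(reorder));
        return graph.back().get();
    }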
24 changes: 0 additions & 24 deletions src/plugins/intel_gpu/src/graph/include/pass_manager.h
@@ -199,30 +199,6 @@ class post_optimize_weights : public base_pass {
reorder_factory& _rf;
};

-class post_optimize_lstm_weights_and_output : public base_pass {
-public:
-    explicit post_optimize_lstm_weights_and_output(reorder_factory& rf_ref);
-
-private:
-    struct weights_bias_offset {
-        size_t weights_offset;
-        size_t bias_offset;
-
-        // When using this ctor weights offset is added to the bias_offset
-        weights_bias_offset(const size_t w_offset, const size_t b_offset)
-            : weights_offset(w_offset)
-            , bias_offset(weights_offset + b_offset)
-        {}
-    };
-
-    void run(program& p) override;
-    template<typename T>
-    weights_bias_offset get_weights_bias_offset(const T& node);
-    template<typename T>
-    void optimize_lstm_weights(T& node, program& p);
-    reorder_factory& _rf;
-};
-
class propagate_constants : public base_pass {
public:
propagate_constants() : base_pass("propagate_constants") {}
33 changes: 11 additions & 22 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -199,19 +199,16 @@ void reorder_factory::get_weights_split(primitive_id input_id,

std::string permute_id = input_id + "_perx";
std::vector<uint16_t> ord{2, 4, 3, 0, 1};
-    //std::vector<uint16_t> ord{0, 1, 2, 3, 4};
auto permute = std::make_shared<cldnn::permute>(permute_id, input_info{concat_id}, ord);
auto& permute_node = p.get_or_create(permute);
p.add_intermediate(permute_node, node, con_node, true);
permute_node.get_output_layout(false);
select_implementation(p, con_node);
select_implementation(p, reorder_node);
-
select_implementation(p, crop0_node);
select_implementation(p, crop1_node);
select_implementation(p, crop2_node);
select_implementation(p, crop3_node);
-
select_implementation(p, permute_node);
p.mark_if_constant(crop0_node);
p.mark_if_constant(crop1_node);
@@ -235,20 +232,17 @@ void reorder_factory::get_bias_split(primitive_id input_id,
cache_key ckey{ input_id, reorder_params->get_output_layout(), false };
auto hiddenSize = reorder_params->get_input_layout().get_shape()[1] / 4;
auto cropSizeR = cldnn::tensor{1, static_cast<int>(hiddenSize), 1, 1};
std::string crop_id = input_id + "_crop";
auto crop0_id = primitive_id(crop_id + "0");
auto crop1_id = primitive_id(crop_id + "1");
auto crop2_id = primitive_id(crop_id + "2");
auto crop3_id = primitive_id(crop_id + "3");
auto crop0 = std::make_shared<cldnn::crop>(crop0_id, input_id, cropSizeR, cldnn::tensor{0, 0, 0, 0});
auto crop1 = std::make_shared<cldnn::crop>(crop1_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(1*hiddenSize), 0, 0});
auto crop2 = std::make_shared<cldnn::crop>(crop2_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(2*hiddenSize), 0, 0});
auto crop3 = std::make_shared<cldnn::crop>(crop3_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(3*hiddenSize), 0, 0});
auto& crop0_node = p.get_or_create(crop0);
auto& crop1_node = p.get_or_create(crop1);
auto& crop2_node = p.get_or_create(crop2);
auto& crop3_node = p.get_or_create(crop3);
std::vector<input_info> con_input{input_info(crop1_id), input_info(crop0_id), input_info(crop2_id), input_info(crop3_id)};
std::string crop_id_b = input_id + "_c";
auto get_crop_node = [&](int cropNum) -> cldnn::program_node& {
auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum));
auto crop_prim = std::make_shared<cldnn::crop>(crop_id, input_id, cropSizeR, cldnn::tensor{0, static_cast<int>(cropNum*hiddenSize), 0, 0});
return p.get_or_create(crop_prim);
};
auto& crop0_node = get_crop_node(0);
auto& crop1_node = get_crop_node(1);
auto& crop2_node = get_crop_node(2);
auto& crop3_node = get_crop_node(3);
std::vector<input_info> con_input{input_info(crop1_node.id()), input_info(crop0_node.id()), input_info(crop2_node.id()), input_info(crop3_node.id())};
cldnn::primitive_id concat_id{input_id + "concat"};
auto con = std::make_shared<cldnn::concatenation>(concat_id, con_input, 2);
auto& con_node = p.get_or_create(con);
@@ -273,16 +267,11 @@ void reorder_factory::get_bias_split(primitive_id input_id,
p.add_intermediate(permute_node, node, con_node, true);
permute_node.get_output_layout(false);
select_implementation(p, crop0_node);
-
select_implementation(p, crop1_node);
-
select_implementation(p, crop2_node);
select_implementation(p, crop3_node);
-
select_implementation(p, permute_node);
-
select_implementation(p, con_node);
-
p.mark_if_constant(crop0_node);
p.mark_if_constant(crop1_node);
p.mark_if_constant(crop2_node);
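Note: both split helpers implement the same gate shuffle: the 4*hidden_size axis of the weight/bias tensor is cut into four hidden_size-wide gate blocks, re-concatenated in the order {1, 0, 2, 3} (swapping the first two gates), and then permuted. The swap is consistent with translating between gate layouts that differ only in their first two gates (e.g., f,i,c,o vs i,f,c,o); which convention the target kernel expects is not shown in this diff, so treat those labels as an assumption. A runnable sketch of the crop/concat arithmetic on a flat bias:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Mirrors get_bias_split's arithmetic: split a [4*H] vector into four
    // H-wide gate blocks and re-concatenate them as {1, 0, 2, 3}.
    std::vector<float> reorder_gates(const std::vector<float>& bias, std::size_t H) {
        const std::size_t order[4] = {1, 0, 2, 3};  // concat input order from the diff
        std::vector<float> out;
        out.reserve(4 * H);
        for (std::size_t g : order)
            out.insert(out.end(), bias.begin() + g * H, bias.begin() + (g + 1) * H);
        return out;
    }

    int main() {
        const std::size_t H = 2;
        std::vector<float> bias{0, 0, 1, 1, 2, 2, 3, 3};  // gates 0..3, H values each
        for (float v : reorder_gates(bias, H)) std::cout << v << ' ';  // 1 1 0 0 2 2 3 3
        std::cout << '\n';
    }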
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/program.cpp
@@ -620,7 +620,6 @@ void program::post_optimize_graph(bool is_internal) {

if (!is_internal) {
apply_opt_pass<post_optimize_weights>(rf);
-        apply_opt_pass<post_optimize_lstm_weights_and_output>(rf);
}

apply_opt_pass<remove_redundant_reorders>(false, true); // TODO: do we need it at this place also?
