[Hetero] Fix passing target core code generation
YWHyuk committed Aug 28, 2024
1 parent b88ae4d commit 24445a4
Showing 44 changed files with 158 additions and 150 deletions.
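
The change is mechanical across the touched files: each Operation subclass constructor gains a trailing target_core argument (defaulted to 0 in the headers), forwards it to the Operation base class, and callers such as LanguageModel now pass their _target_core explicitly instead of falling back to core 0. A minimal, self-contained sketch of that pattern, using simplified stand-in types rather than the repository's actual class definitions:

#include <cstdint>
#include <map>
#include <memory>
#include <string>

// Simplified stand-ins; the real SimulationConfig/Model/Operation classes
// in the repository carry far more state than this.
struct SimulationConfig {};
struct Model {};

class Operation {
 public:
  Operation(SimulationConfig config, Model* model, std::string name,
            std::map<std::string, std::string>& attributes,
            uint32_t target_core = 0)
      : _name(std::move(name)), target_core(target_core) {}
  virtual ~Operation() = default;

 protected:
  std::string _name;
  uint32_t target_core;  // core whose tiles/instructions this op generates
};

class GemmWS : public Operation {
 public:
  // Each derived constructor grows the same trailing parameter and forwards it.
  GemmWS(SimulationConfig config, Model* model, std::string name,
         std::map<std::string, std::string>& attributes,
         uint32_t target_core = 0)
      : Operation(config, model, std::move(name), attributes, target_core) {}
};

int main() {
  SimulationConfig config;
  std::map<std::string, std::string> qkv_attr;
  uint32_t _target_core = 1;  // a member of LanguageModel in the real code

  // Before the fix the last argument was omitted, so ops always targeted core 0.
  auto qkv_op = std::make_unique<GemmWS>(config, nullptr, "qkv_gen", qkv_attr,
                                         _target_core);
  (void)qkv_op;
  return 0;
}
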
30 changes: 15 additions & 15 deletions src/models/LanguageModel.cc
@@ -141,12 +141,12 @@ void LanguageModel::initialize_weight(std::vector<std::unique_ptr<Tensor>>& weig
}

weight_table.push_back(std::move(create_weight(name_gen(OperationType::LmHead, ParameterType::Weight), {_hidden_size, _model_config["vocab_size"]})));

_wgt_size = 0;
for (auto& wgt : weight_table) {
if(_run_single_layer && wgt->get_name() != name_gen(OperationType::LmHead, ParameterType::Weight)) {
_wgt_size += ((uint64_t)wgt->get_size()) * _num_layers;
}
}
else {
_wgt_size += wgt->get_size();
}
@@ -166,7 +166,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
}
std::vector<uint32_t> act_dim = {num_tokens, _hidden_size};
std::map<std::string, std::string> qkv_attr = {
{"has_bias", "1"},
{"has_bias", "1"},
{"input_shape", dims_to_string(act_dim)},
{"weight_shape", dims_to_string({_hidden_size,_qkv_out_dim})},
{"output_shape", dims_to_string({num_tokens, _qkv_out_dim})}};
@@ -213,14 +213,14 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
uint32_t id = tensor->get_id();
_tensor_map[id] = std::move(tensor);
}

std::map<std::string, std::string> empty_attr;
for(int l = 0; l < _num_sim_layers; l++) {
//QKV Projection
std::string qkv_name = name_gen(LAYER(l), BlockType::Attention, OperationType::QKVGen);
uint32_t qkv_weight_id = _wgt_map[name_gen(qkv_name, ParameterType::Weight)];
uint32_t qkv_bias_id = _wgt_map[name_gen(qkv_name, ParameterType::Bias)];
- auto qkv_op = std::make_unique<GemmWS>(_config, (Model*) this, qkv_name, qkv_attr);
+ auto qkv_op = std::make_unique<GemmWS>(_config, (Model*) this, qkv_name, qkv_attr, _target_core);
qkv_op->add_input(input_id);
qkv_op->add_input(qkv_weight_id);
qkv_op->add_input(qkv_bias_id);
@@ -229,7 +229,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
register_operation(std::move(qkv_op));
//KV Cache
auto kv_cache_op = std::make_unique<KVCacheConcat>(
- _config, (Model*) this, name_gen(LAYER(l), BlockType::Attention, OperationType::KVCacheConcat), kv_concat_attr);
+ _config, (Model*) this, name_gen(LAYER(l), BlockType::Attention, OperationType::KVCacheConcat), kv_concat_attr, _target_core);
kv_cache_op->add_input(qkv_output_id);
for(int b = 0; b < _num_batch; b++) {
uint32_t key_cache_id = load_key_cache(l, b);
@@ -252,7 +252,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
for(int b = 0; b < _num_batch; b++) {
std::string attn_name = name_gen(LAYER(l), BlockType::Attention, OperationType::Attention, std::to_string(b));
attention_attr["num_tokens"] = std::to_string(input_lengthes[b]);
- auto attn_op = std::make_unique<Attention>(_config, (Model*) this, attn_name, attention_attr);
+ auto attn_op = std::make_unique<Attention>(_config, (Model*) this, attn_name, attention_attr, _target_core);
attn_op->add_input(queries[b]);
attn_op->add_input(keys[b]);
attn_op->add_input(values[b]);
@@ -263,7 +263,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
}
//Concatenate attention outputs
std::string attn_concat_name = name_gen(LAYER(l), BlockType::Attention, OperationType::AttentionConcat);
- auto attn_concat_op = std::make_unique<Concat>(_config, (Model*) this, attn_concat_name, concat_attr);
+ auto attn_concat_op = std::make_unique<Concat>(_config, (Model*) this, attn_concat_name, concat_attr, _target_core);
for(int b = 0; b < _num_batch; b++) {
attn_concat_op->add_input(attention_outs[b]);
}
@@ -274,7 +274,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
std::string proj_name = name_gen(LAYER(l), BlockType::Attention, OperationType::Projection);
uint32_t proj_weight_id = _wgt_map[name_gen(proj_name, ParameterType::Weight)];
uint32_t proj_bias_id = _wgt_map[name_gen(proj_name, ParameterType::Bias)];
- auto proj_op = std::make_unique<GemmWS>(_config, (Model*) this, proj_name, proj_attr);
+ auto proj_op = std::make_unique<GemmWS>(_config, (Model*) this, proj_name, proj_attr, _target_core);
proj_op->add_input(attn_concat_output_id);
proj_op->add_input(proj_weight_id);
proj_op->add_input(proj_bias_id);
@@ -285,7 +285,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
std::string ln_name = name_gen(LAYER(l), BlockType::Attention, OperationType::LayerNorm);
uint32_t ln_weight_id = _wgt_map[name_gen(ln_name, ParameterType::Weight)];
uint32_t ln_bias_id = _wgt_map[name_gen(ln_name, ParameterType::Bias)];
- auto ln_op = std::make_unique<SkipLayerNorm>(_config, (Model*) this, ln_name, empty_attr);
+ auto ln_op = std::make_unique<SkipLayerNorm>(_config, (Model*) this, ln_name, empty_attr, _target_core);
ln_op->add_input(input_id);
ln_op->add_input(proj_output_id);
ln_op->add_input(ln_weight_id);
@@ -298,15 +298,15 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
uint32_t ffn1_weight_id = _wgt_map[name_gen(ffn_name, OperationType::FullyConnected1, ParameterType::Weight)];
uint32_t ffn1_bias_id = _wgt_map[name_gen(ffn_name, OperationType::FullyConnected1, ParameterType::Bias)];
auto ffn1_op = std::make_unique<GemmWS>(
- _config, (Model*) this, name_gen(ffn_name, OperationType::FullyConnected1), ffn1_attr);
+ _config, (Model*) this, name_gen(ffn_name, OperationType::FullyConnected1), ffn1_attr, _target_core);
ffn1_op->add_input(ln_output_id);
ffn1_op->add_input(ffn1_weight_id);
ffn1_op->initialize_tiles(_mapping_table);
uint32_t ffn1_output_id = ffn1_op->get_output(0)->get_id();
register_operation(std::move(ffn1_op));
//Gelu
std::string act_name = name_gen(LAYER(l), BlockType::FeedForward, OperationType::Act);
- auto act_op = std::make_unique<BiasAct>(_config, (Model*) this, act_name, bias_act_attr);
+ auto act_op = std::make_unique<BiasAct>(_config, (Model*) this, act_name, bias_act_attr, _target_core);
act_op->add_input(ffn1_output_id);
act_op->add_input(ffn1_bias_id);
act_op->initialize_tiles(_mapping_table);
@@ -316,7 +316,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
uint32_t ffn2_weight_id = _wgt_map[name_gen(ffn_name, OperationType::FullyConnected2, ParameterType::Weight)];
uint32_t ffn2_bias_id = _wgt_map[name_gen(ffn_name, OperationType::FullyConnected2, ParameterType::Bias)];
auto ffn2_op = std::make_unique<GemmWS>(
- _config, (Model*) this, name_gen(ffn_name, OperationType::FullyConnected2), ffn2_attr);
+ _config, (Model*) this, name_gen(ffn_name, OperationType::FullyConnected2), ffn2_attr, _target_core);
ffn2_op->add_input(act_output_id);
ffn2_op->add_input(ffn2_weight_id);
ffn2_op->add_input(ffn2_bias_id);
@@ -327,7 +327,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
std::string ff_ln_name = name_gen(LAYER(l), BlockType::FeedForward, OperationType::LayerNorm);
uint32_t ff_ln_weight_id = _wgt_map[name_gen(ff_ln_name, ParameterType::Weight)];
uint32_t ff_ln_bias_id = _wgt_map[name_gen(ff_ln_name, ParameterType::Bias)];
- auto ff_ln_op = std::make_unique<SkipLayerNorm>(_config, (Model*) this, ff_ln_name, empty_attr);
+ auto ff_ln_op = std::make_unique<SkipLayerNorm>(_config, (Model*) this, ff_ln_name, empty_attr, _target_core);
ff_ln_op->add_input(ln_output_id);
ff_ln_op->add_input(ffn2_output_id);
ff_ln_op->add_input(ff_ln_weight_id);
@@ -342,7 +342,7 @@ void LanguageModel::initialize_model(std::vector<std::unique_ptr<Tensor>>& weigh
if(val->check_executable()) {
spdlog::debug("runnable op, {}", val->get_optype());
_executable_layer.push_back(val.get());
}
}
}
/* Model initialization time measurement */
auto end = std::chrono::high_resolution_clock::now();
4 changes: 2 additions & 2 deletions src/operations/AdaptiveAvgPool.cc
@@ -4,8 +4,8 @@
#include "../Tensor.h"

AdaptiveAvgPool::AdaptiveAvgPool(SimulationConfig config, Model* model,
- onnx::NodeProto& node_proto)
- : Operation(config, model, node_proto) {
+ onnx::NodeProto& node_proto, uint32_t target_core)
+ : Operation(config, model, node_proto, target_core) {
int kernel_dim = 0;
for (auto attribute : node_proto.attribute()) {
if (attribute.name() == "kernel_shape") {
2 changes: 1 addition & 1 deletion src/operations/AdaptiveAvgPool.h
@@ -5,7 +5,7 @@
class AdaptiveAvgPool : public Operation {
public:
AdaptiveAvgPool(SimulationConfig config, Model* model,
- onnx::NodeProto& node_proto);
+ onnx::NodeProto& node_proto, uint32_t target_core=0);
AdaptiveAvgPool(const AdaptiveAvgPool& src);

virtual void initialize_tiles(MappingTable& mapping_table) override;
20 changes: 10 additions & 10 deletions src/operations/Attention.cc
@@ -5,8 +5,8 @@
#include "Softmax.h"

Attention::Attention(SimulationConfig config, Model* model,
- onnx::NodeProto& node_proto)
- : Operation(config, model, node_proto) {
+ onnx::NodeProto& node_proto, uint32_t target_core)
+ : Operation(config, model, node_proto, target_core) {
onnx = true;
for (auto attribute : node_proto.attribute()) {
if (attribute.name() == "num_heads") {
@@ -62,8 +62,8 @@ Attention::Attention(SimulationConfig config, Model* model,
}

Attention::Attention(SimulationConfig config, Model* model,
- std::string name, std::map<std::string, std::string>& attributes)
- :Operation(config, model, name, attributes) {
+ std::string name, std::map<std::string, std::string>& attributes, uint32_t target_core)
+ :Operation(config, model, name, attributes, target_core) {
_batch_size = 1;
_q_len = std::stoi(get_attribute("num_tokens"));
_nh = std::stoi(get_attribute("num_heads"));
@@ -138,17 +138,17 @@ void Attention::initialize_tiles(MappingTable& mapping_table) {
float kv_mem = _seq * _dk * _nkvh * 2 * _config.precision / (float) 1e9; //GB
float q_mem = _q_len * _dk * _nh * 2 * _config.precision / (float) 1e9; //GB
float total_mem = kv_mem + q_mem;
- float compute_time = (qk_flops + kv_flops) / _config.max_systolic_flops(0) * 1e3;
+ float compute_time = (qk_flops + kv_flops) / _config.max_systolic_flops(target_core) * 1e3;
compute_time += softmax_flops / _config.max_vector_flops(target_core) * 1e3;
float mem_time = total_mem / _config.max_dram_bandwidth() * 1e3;
float total_time = std::max(compute_time, mem_time);
spdlog::info("[Attention] total {} GFLOPs, {} GB", tot_flops, total_mem);
spdlog::info("[Attention] Theoretical time(ms): {} Compute time: {} Memory time: {}",
total_time, compute_time, mem_time);
spdlog::info("[Attention] QK compute {:.4f}ms Softmax compute {:.4f}ms SV compute {:.4f}ms",
- qk_flops / _config.max_systolic_flops(0) * 1e3,
+ qk_flops / _config.max_systolic_flops(target_core) * 1e3,
softmax_flops / _config.max_vector_flops(target_core) * 1e3,
- kv_flops / _config.max_systolic_flops(0) * 1e3);
+ kv_flops / _config.max_systolic_flops(target_core) * 1e3);
}
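
For context, the numbers logged in the hunk above come from a simple roofline estimate: compute time from the per-core systolic and vector peak FLOPS versus memory time from DRAM bandwidth, with the larger of the two reported as the theoretical latency. Before this commit the systolic term always queried core 0; now it queries target_core. A standalone illustration with made-up rates (the real values come from SimulationConfig via max_systolic_flops(target_core), max_vector_flops(target_core), and max_dram_bandwidth()):

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical workload sizes (GFLOPs of work, GB of traffic) -- illustration only.
  double qk_gflops = 4.0, sv_gflops = 4.0, softmax_gflops = 0.1;
  double total_mem_gb = 0.5;

  // Hypothetical per-core peaks and DRAM bandwidth.
  double systolic_gflops_per_s = 256000.0;  // GFLOP/s
  double vector_gflops_per_s = 8000.0;      // GFLOP/s
  double dram_gb_per_s = 1024.0;            // GB/s

  // Same structure as the hunk above, with times in milliseconds.
  double compute_ms = (qk_gflops + sv_gflops) / systolic_gflops_per_s * 1e3
                    + softmax_gflops / vector_gflops_per_s * 1e3;
  double mem_ms = total_mem_gb / dram_gb_per_s * 1e3;
  double total_ms = std::max(compute_ms, mem_ms);

  std::printf("compute %.4f ms, memory %.4f ms, theoretical %.4f ms\n",
              compute_ms, mem_ms, total_ms);
  return 0;
}
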

void Attention::initialize_onnx_tiles(MappingTable& mapping_table) {
@@ -161,7 +161,7 @@ void Attention::initialize_onnx_tiles(MappingTable& mapping_table) {

/* Create linear node and tensors */
uint32_t fused_op_id = 0;
- _projection_node = new GemmWS(_config, mapping_table, _input_shape, _weight_shape, _liner_output_shape);
+ _projection_node = new GemmWS(_config, mapping_table, _input_shape, _weight_shape, _liner_output_shape, target_core);
std::unique_ptr<Tensor> key_projection = std::make_unique<Tensor>(
_id, "", _projection_output_shape, _config.precision, false);
std::unique_ptr<Tensor> query_projection = std::make_unique<Tensor>(
@@ -591,7 +591,7 @@ void Attention::initialize_non_fused_tiles(MappingTable& mapping_table) {
for (int req_idx = 0; req_idx < _batch_size; req_idx++) {
for (int head_off=0; head_off<_nh; head_off++) {
/* Key query matmul */
- GemmWS key_query = GemmWS(_config, mapping_table, single_head_query_shape, single_head_key_shape, query_key_shape);
+ GemmWS key_query = GemmWS(_config, mapping_table, single_head_query_shape, single_head_key_shape, query_key_shape, target_core);
/* Todo. dram addr */
key_query.has_bias = false;
key_query.initialize_tiles(mapping_table);
@@ -624,7 +624,7 @@ void Attention::initialize_non_fused_tiles(MappingTable& mapping_table) {
_tiles.push_back(std::make_unique<Tile>(Tile{.status = Tile::Status::BAR, .layer_id = _id}));

/* attention x value */
- GemmWS attention = GemmWS(_config, mapping_table, query_key_shape, single_head_value_shape, single_output_shape);
+ GemmWS attention = GemmWS(_config, mapping_table, query_key_shape, single_head_value_shape, single_output_shape, target_core);
/* Todo. dram addr */
attention.has_bias = false;
attention.initialize_tiles(mapping_table);
4 changes: 2 additions & 2 deletions src/operations/Attention.h
@@ -5,8 +5,8 @@

class Attention : public Operation {
public:
- Attention(SimulationConfig config, Model* model, onnx::NodeProto& node_proto);
- Attention(SimulationConfig config, Model* model, std::string name, std::map<std::string, std::string>& attributes);
+ Attention(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0);
+ Attention(SimulationConfig config, Model* model, std::string name, std::map<std::string, std::string>& attributes, uint32_t target_core=0);
//std::vector<Ptr<BTensor>> get_outputs(std::vector<Ptr<BTensor>> inputs) override;

uint32_t _batch_size;
8 changes: 4 additions & 4 deletions src/operations/BiasAct.cc
@@ -9,8 +9,8 @@ static const std::map<std::string, Opcode> activation_map = {
};

BiasAct::BiasAct(SimulationConfig config, Model* model,
- onnx::NodeProto& node_proto)
- : Operation(config, model, node_proto) {
+ onnx::NodeProto& node_proto, uint32_t target_core)
+ : Operation(config, model, node_proto, target_core) {

/* Load weight info from node */
_input_shape = get_input(0)->get_dims();
@@ -34,8 +34,8 @@ BiasAct::BiasAct(SimulationConfig config, Model* model,
}

BiasAct::BiasAct(SimulationConfig config, Model* model,
- std::string name, std::map<std::string, std::string> &attributes)
- : Operation(config, model, name, attributes) {
+ std::string name, std::map<std::string, std::string> &attributes, uint32_t target_core)
+ : Operation(config, model, name, attributes, target_core) {
_activation = activation_map.at(get_attribute("activation"));
_use_bias = std::stoi(get_attribute("has_bias"));
_llama_mlp = std::stoi(get_attribute("llama_mlp"));
4 changes: 2 additions & 2 deletions src/operations/BiasAct.h
@@ -3,9 +3,9 @@

class BiasAct : public Operation {
public:
- BiasAct(SimulationConfig config, Model* model, onnx::NodeProto& node_proto);
+ BiasAct(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0);
BiasAct(SimulationConfig config, Model* model, std::string name,
- std::map<std::string, std::string>& attributes);
+ std::map<std::string, std::string>& attributes, uint32_t target_core=0);

void initialize_tiles(MappingTable& mapping_table) override;

8 changes: 4 additions & 4 deletions src/operations/BiasGelu.cc
@@ -2,8 +2,8 @@
#include "../Model.h"

BiasGelu::BiasGelu(SimulationConfig config, Model* model,
- onnx::NodeProto& node_proto)
- : Operation(config, model, node_proto) {
+ onnx::NodeProto& node_proto, uint32_t target_core)
+ : Operation(config, model, node_proto, target_core) {

/* Load weight info from node */
_input_shape = get_input(0)->get_dims();
@@ -28,8 +28,8 @@ BiasGelu::BiasGelu(SimulationConfig config, Model* model,
}

BiasGelu::BiasGelu(SimulationConfig config, Model* model,
- std::string name, std::map<std::string, std::string> &attributes)
- : Operation(config, model, name, attributes) {
+ std::string name, std::map<std::string, std::string> &attributes, uint32_t target_core)
+ : Operation(config, model, name, attributes, target_core) {
//TODO:implement this
}

4 changes: 2 additions & 2 deletions src/operations/BiasGelu.h
@@ -3,8 +3,8 @@

class BiasGelu : public Operation {
public:
- BiasGelu(SimulationConfig config, Model* model, onnx::NodeProto& node_proto);
- BiasGelu(SimulationConfig config, Model* model, std::string name, std::map<std::string, std::string>& attributes);
+ BiasGelu(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0);
+ BiasGelu(SimulationConfig config, Model* model, std::string name, std::map<std::string, std::string>& attributes, uint32_t target_core=0);

std::vector<uint32_t> _bias_shape;

8 changes: 4 additions & 4 deletions src/operations/Concat.cc
@@ -5,8 +5,8 @@
#include "../Tensor.h"

Concat::Concat(SimulationConfig config, Model* model,
- onnx::NodeProto& node_proto)
- : Operation(config, model, node_proto) {
+ onnx::NodeProto& node_proto, uint32_t target_core)
+ : Operation(config, model, node_proto, target_core) {
for (auto attribute : node_proto.attribute()) {
if (attribute.name() == "axis") {
spdlog::trace("concat axis {}", attribute.ints(0));
@@ -45,8 +45,8 @@ Concat::Concat(const Concat& src) : Operation(src) {
}

Concat::Concat(SimulationConfig config, Model* model,
- std::string name, std::map<std::string, std::string> &attributes)
- : Operation(config, model, name, attributes) {
+ std::string name, std::map<std::string, std::string> &attributes, uint32_t target_core)
+ : Operation(config, model, name, attributes, target_core) {
//TODO:implement this
_axis = std::stoi(get_attribute("axis"));
}
4 changes: 2 additions & 2 deletions src/operations/Concat.h
@@ -5,10 +5,10 @@

class Concat : public Operation {
public:
- Concat(SimulationConfig config, Model* model, onnx::NodeProto& node_proto);
+ Concat(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0);
Concat(const Concat& src);
Concat(SimulationConfig config, Model* model, std::string name,
- std::map<std::string, std::string>& attributes);
+ std::map<std::string, std::string>& attributes, uint32_t target_core=0);
virtual void initialize_tiles(MappingTable& mapping_table) override;
virtual void initialize_instructions(Tile* tile, Mapping mapping) override;
protected:
8 changes: 4 additions & 4 deletions src/operations/Conv.cc
@@ -5,8 +5,8 @@
#include "../Model.h"
#include "../Tensor.h"

- Conv::Conv(SimulationConfig config, Model* model, onnx::NodeProto& node_proto)
- : Operation(config, model, node_proto) {
+ Conv::Conv(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core)
+ : Operation(config, model, node_proto, target_core) {
int kernel_dim = 0;
_activation_fused = false;
_pool_fused = false;
@@ -210,8 +210,8 @@ void Conv::im2col_nhwc() {
}

Conv::Conv(SimulationConfig config, MappingTable& mapping_table,
- convInfo info)
- : Operation(config, mapping_table){
+ convInfo info, uint32_t target_core)
+ : Operation(config, mapping_table, target_core){
_kernel_shape = info.kernel_shape;
_strides = info.strides;
_dilations = info.dilations;
6 changes: 3 additions & 3 deletions src/operations/Conv.h
@@ -25,13 +25,13 @@ struct convInfo{

class Conv : public Operation {
public:
- Conv(SimulationConfig config, Model* model, onnx::NodeProto& node_proto);
+ Conv(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0);
Conv(const Conv& src);
- Conv(SimulationConfig config, MappingTable& mapping_table, convInfo info);
+ Conv(SimulationConfig config, MappingTable& mapping_table, convInfo info, uint32_t target_core=0);
// virtual void initialize_tiles(MappingTable& mapping_table) override;
protected:
virtual void im2col_nhwc();
- // void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto);
+ // void init(SimulationConfig config, Model* model, onnx::NodeProto& node_proto, uint32_t target_core=0);

protected:
std::vector<uint32_t> _kernel_shape;
