Offloading for falcon & gpt2
LLukas22 committed Aug 7, 2023
1 parent 6f05d63 commit c7cb4e0
Showing 2 changed files with 97 additions and 39 deletions.
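Both files follow the same pattern: each weight tensor is loaded and then moved to a per-layer backend with `.transfer_to(backend)`, where the backend comes from `params.backend(i)`, and the evaluation loop turns offloading on per layer with `ctx0.set_offloading(self.params.should_offload(il))` and resets it with `set_offloading(false)` before the output head. The stand-alone sketch below only illustrates that shape: `Backend`, `Tensor`, `Params`, and the "offload the first N layers" policy are hypothetical stand-ins, not the crate's real types or logic; only `transfer_to`, `backend(i)`, and `should_offload(il)` mirror names that appear in the diff.

#[derive(Clone, Copy, Debug)]
enum Backend {
    Cpu,
    Gpu,
}

struct Tensor {
    name: String,
    backend: Backend,
}

impl Tensor {
    // Mirrors the chained `.transfer_to(backend)` calls in the diff:
    // move the tensor to the chosen backend and hand it back.
    fn transfer_to(mut self, backend: Backend) -> Self {
        self.backend = backend;
        self
    }
}

struct Params {
    gpu_layers: usize,
}

impl Params {
    // Stand-in for `params.backend(i)`; the "first N layers go to the GPU"
    // policy is an assumption, not the crate's actual rule.
    fn backend(&self, layer: usize) -> Backend {
        if layer < self.gpu_layers {
            Backend::Gpu
        } else {
            Backend::Cpu
        }
    }

    // Stand-in for `params.should_offload(il)` used in the evaluation loop.
    fn should_offload(&self, layer: usize) -> bool {
        layer < self.gpu_layers
    }
}

fn main() {
    let params = Params { gpu_layers: 2 };
    for i in 0..4 {
        let backend = params.backend(i);
        let wo = Tensor {
            name: format!("transformer.h.{i}.self_attention.dense.weight"),
            backend: Backend::Cpu,
        }
        .transfer_to(backend);
        println!(
            "{} -> {:?}, offload at eval: {}",
            wo.name,
            wo.backend,
            params.should_offload(i)
        );
    }
}
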
72 changes: 51 additions & 21 deletions crates/models/falcon/src/lib.rs
@@ -58,14 +58,19 @@ impl KnownModel for Falcon {

// model-global weights
let tok_embeddings = tl.load("transformer.word_embeddings.weight")?;
let output_norm = tl.load("transformer.ln_f.weight")?;
let output_norm_b = tl.load("transformer.ln_f.bias")?;
let lm_head = tl.load("lm_head.weight")?;

let backend = params.backend(0);

let output_norm = tl.load("transformer.ln_f.weight")?.transfer_to(backend);
let output_norm_b = tl.load("transformer.ln_f.bias")?.transfer_to(backend);
let lm_head = tl.load("lm_head.weight")?.transfer_to(backend);

let mut layers = Vec::new();
// utilizing n_head_kv to determine the model version (parameters)
let Hyperparameters { n_head_kv, .. } = hyperparameters;
for i in 0..hyperparameters.n_layer {
let backend = params.backend(i);

let (input_layernorm_name, attention_norm_name) = if n_head_kv == 1 {
// falcon 7b
(format!("transformer.h.{i}.input_layernorm"), None)
@@ -76,24 +81,47 @@ impl KnownModel for Falcon {
Some(format!("transformer.h.{i}.ln_attn")),
)
};

let (attention_norm_weight, attention_norm_bias) =
if let Some(norm_name) = attention_norm_name {
(
Some(
tl.load(&format!("{}.weight", norm_name))?
.transfer_to(backend),
),
Some(
tl.load(&format!("{}.bias", norm_name))?
.transfer_to(backend),
),
)
} else {
(None, None)
};

let layer = Layer {
input_layernorm: tl.load(&format!("{}.weight", input_layernorm_name))?,
input_layernorm_b: tl.load(&format!("{}.bias", input_layernorm_name))?,
attention_norm: attention_norm_name
.as_ref()
.map(|path| tl.load(&format!("{}.weight", path)))
.transpose()?,
attention_norm_b: attention_norm_name
.map(|path| tl.load(&format!("{}.bias", path)))
.transpose()?,

query_key_value: tl.load(&format!(
"transformer.h.{i}.self_attention.query_key_value.weight"
))?,
wo: tl.load(&format!("transformer.h.{i}.self_attention.dense.weight"))?,

ffn_up: tl.load(&format!("transformer.h.{i}.mlp.dense_h_to_4h.weight"))?,
ffn_down: tl.load(&format!("transformer.h.{i}.mlp.dense_4h_to_h.weight"))?,
input_layernorm: tl
.load(&format!("{}.weight", input_layernorm_name))?
.transfer_to(backend),
input_layernorm_b: tl
.load(&format!("{}.bias", input_layernorm_name))?
.transfer_to(backend),
attention_norm: attention_norm_weight,
attention_norm_b: attention_norm_bias,
query_key_value: tl
.load(&format!(
"transformer.h.{i}.self_attention.query_key_value.weight"
))?
.transfer_to(backend),
wo: tl
.load(&format!("transformer.h.{i}.self_attention.dense.weight"))?
.transfer_to(backend),

ffn_up: tl
.load(&format!("transformer.h.{i}.mlp.dense_h_to_4h.weight"))?
.transfer_to(backend),
ffn_down: tl
.load(&format!("transformer.h.{i}.mlp.dense_4h_to_h.weight"))?
.transfer_to(backend),
};

layers.push(layer);
@@ -147,7 +175,7 @@ impl KnownModel for Falcon {
let n = input_len;

let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
let ctx0 = builder.ctx0.borrow();
let mut ctx0 = builder.ctx0.borrow_mut();
let embd = builder.embd;
let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd);

@@ -167,6 +195,7 @@ impl KnownModel for Falcon {
for il in 0..n_layer {
// attention uses first scratch buffer
ctx0.use_scratch(builder.get_scratch(0));
ctx0.set_offloading(self.params.should_offload(il));

// self-attention
layernorm_output = ctx0.op_norm(&input_layer);
Expand Down Expand Up @@ -321,6 +350,7 @@ impl KnownModel for Falcon {

let embeddings_tensor: ggml::Tensor = input_layer.share();

ctx0.set_offloading(false);
ctx0.use_scratch(None);

// lm_head
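A note on the evaluation-path changes, which recur in both files: the builder context is now borrowed mutably (`borrow_mut`), presumably because `set_offloading` needs mutable access; offloading is switched on per layer at the top of the layer loop via `ctx0.set_offloading(self.params.should_offload(il))`; and it is switched off again with `ctx0.set_offloading(false)` before the final norm and `lm_head` projection, presumably so the output head is not offloaded. The "presumably" parts are inferences from the diff, not statements from the commit.
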
64 changes: 46 additions & 18 deletions crates/models/gpt2/src/lib.rs
@@ -56,30 +56,57 @@ impl KnownModel for Gpt2 {
let mut tl = tensor_loader;

// model-global weights
let ln_f_g = tl.load("model/ln_f/g")?;
let ln_f_b = tl.load("model/ln_f/b")?;
let wte = tl.load("model/wte")?;
let wpe = tl.load("model/wpe")?;

let backend = params.backend(0);

let wte = tl.load("model/wte")?.transfer_to(backend);

let ln_f_g = tl.load("model/ln_f/g")?.transfer_to(backend);
let ln_f_b = tl.load("model/ln_f/b")?.transfer_to(backend);

// GPT-2's language model head is optional; if it is not present,
// the `wte` tensor is used instead.
let lm_head = tl.load("model/lm_head").ok();
let lm_head = {
if let Ok(tensor) = tl.load("model/lm_head") {
Some(tensor.transfer_to(backend))
} else {
None
}
};

let mut layers = Vec::new();
for i in 0..hyperparameters.n_layer {
let backend = params.backend(i);
let layer = Layer {
ln_1_g: tl.load(&format!("model/h{i}/ln_1/g"))?,
ln_1_b: tl.load(&format!("model/h{i}/ln_1/b"))?,
ln_2_g: tl.load(&format!("model/h{i}/ln_2/g"))?,
ln_2_b: tl.load(&format!("model/h{i}/ln_2/b"))?,
c_attn_attn_w: tl.load(&format!("model/h{i}/attn/c_attn/w"))?,
c_attn_attn_b: tl.load(&format!("model/h{i}/attn/c_attn/b"))?,
c_attn_proj_w: tl.load(&format!("model/h{i}/attn/c_proj/w"))?,
c_attn_proj_b: tl.load(&format!("model/h{i}/attn/c_proj/b"))?,
c_mlp_fc_w: tl.load(&format!("model/h{i}/mlp/c_fc/w"))?,
c_mlp_fc_b: tl.load(&format!("model/h{i}/mlp/c_fc/b"))?,
c_mlp_proj_w: tl.load(&format!("model/h{i}/mlp/c_proj/w"))?,
c_mlp_proj_b: tl.load(&format!("model/h{i}/mlp/c_proj/b"))?,
ln_1_g: tl.load(&format!("model/h{i}/ln_1/g"))?.transfer_to(backend),
ln_1_b: tl.load(&format!("model/h{i}/ln_1/b"))?.transfer_to(backend),
ln_2_g: tl.load(&format!("model/h{i}/ln_2/g"))?.transfer_to(backend),
ln_2_b: tl.load(&format!("model/h{i}/ln_2/b"))?.transfer_to(backend),
c_attn_attn_w: tl
.load(&format!("model/h{i}/attn/c_attn/w"))?
.transfer_to(backend),
c_attn_attn_b: tl
.load(&format!("model/h{i}/attn/c_attn/b"))?
.transfer_to(backend),
c_attn_proj_w: tl
.load(&format!("model/h{i}/attn/c_proj/w"))?
.transfer_to(backend),
c_attn_proj_b: tl
.load(&format!("model/h{i}/attn/c_proj/b"))?
.transfer_to(backend),
c_mlp_fc_w: tl
.load(&format!("model/h{i}/mlp/c_fc/w"))?
.transfer_to(backend),
c_mlp_fc_b: tl
.load(&format!("model/h{i}/mlp/c_fc/b"))?
.transfer_to(backend),
c_mlp_proj_w: tl
.load(&format!("model/h{i}/mlp/c_proj/w"))?
.transfer_to(backend),
c_mlp_proj_b: tl
.load(&format!("model/h{i}/mlp/c_proj/b"))?
.transfer_to(backend),
};

layers.push(layer);
@@ -130,7 +157,7 @@ impl KnownModel for Gpt2 {
} = self.hyperparameters;

let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
let ctx0 = builder.ctx0.borrow();
let mut ctx0 = builder.ctx0.borrow_mut();
let (memory_k_size, memory_v_size) = (
builder.memory_k.element_size(),
builder.memory_v.element_size(),
@@ -149,8 +176,8 @@ impl KnownModel for Gpt2 {

let mut gf = ctx0.create_compute_graph();
for il in 0..n_layer {
ctx0.set_offloading(self.params.should_offload(il));
ctx0.use_scratch(builder.get_scratch(0));

// norm
let mut current = ctx0.op_norm(&input_layer);
current = ctx0.op_add(
@@ -288,6 +315,7 @@ impl KnownModel for Gpt2 {
input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b);

ctx0.use_scratch(None);
ctx0.set_offloading(false);

let embeddings_tensor: ggml::Tensor = input_layer.share();

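The GPT-2 loader's comment notes that the language-model head is optional and that `wte` is reused when it is absent; the new code only wraps the optional tensor's `transfer_to` call. Below is a minimal, self-contained sketch of that fallback, using plain `f32` slices as a hypothetical stand-in for the tensor type; the real call site is not part of this diff.

fn output_weight<'a>(lm_head: Option<&'a [f32]>, wte: &'a [f32]) -> &'a [f32] {
    // Reuse the token-embedding matrix when no separate head tensor was loaded.
    lm_head.unwrap_or(wte)
}

fn main() {
    let wte = vec![0.1_f32; 4];
    let lm_head: Option<Vec<f32>> = None; // pretend "model/lm_head" was missing
    let head = output_weight(lm_head.as_deref(), &wte);
    assert!(std::ptr::eq(head.as_ptr(), wte.as_ptr()));
    println!("fell back to wte: {}", lm_head.is_none());
}
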
