9 changes: 5 additions & 4 deletions src/llama-hparams.h
@@ -151,10 +151,11 @@ struct llama_hparams {
     float f_attn_out_scale = 0.0f;
     uint32_t attn_temp_length = 0;
 
-    bool causal_attn   = true;
-    bool use_alibi     = false;
-    bool attn_soft_cap = false;
-    bool use_kq_norm   = false;
+    bool causal_attn         = true;
+    bool use_alibi           = false;
+    bool attn_soft_cap       = false;
+    bool use_kq_norm         = false;
+    bool offload_input_layer = false;
 
     // for Classifiers
     uint32_t n_cls_out = 1;
18 changes: 17 additions & 1 deletion src/llama-model.cpp
@@ -2081,6 +2081,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                 }
                 hparams.n_layer_dense_lead = hparams.n_layer;
+                hparams.offload_input_layer = true;
                 switch (hparams.n_ff()) {
                     case 4608: type = LLM_TYPE_350M; break;
                     case 6912: type = LLM_TYPE_700M; break;
@@ -2101,6 +2102,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                 }
 
+                hparams.offload_input_layer = true;
                 type = LLM_TYPE_8B_A1B;
             } break;
         case LLM_ARCH_SMALLTHINKER:
@@ -2266,8 +2268,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     };
 
     // assign the input layer
-    // there is very little benefit to offloading the input layer, so always keep it on the CPU
+    // there is very little benefit to offloading the input layer, so keep it on the CPU, with the exception of some specific models
     pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
+    if (hparams.offload_input_layer && act_gpu_layers > 0) {
+        int anchor_layer = 0;
+        if (n_layer == 0) {
+            anchor_layer = 0;
+        } else if (i_gpu_start < n_layer) {
+            anchor_layer = i_gpu_start;
+        } else {
+            anchor_layer = (int) n_layer - 1;
+        }
+        auto candidate = get_layer_buft_list(anchor_layer);
+        if (candidate.dev != cpu_dev) {
+            pimpl->dev_input = candidate;
+        }
+    }
 
     // assign the repeating layers to the devices according to the splits
     pimpl->dev_layer.resize(n_layer);
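For reference, here is a condensed, standalone sketch of the anchor-layer selection added in the `load_tensors` hunk above, reduced to a free function. The parameter names mirror the variables in the hunk; the function name, the `main` driver, and the sample values are illustrative only and not part of the PR.

```cpp
// Standalone sketch (not part of the diff): pick the layer whose buffer-type
// list decides where the input layer is placed. This mirrors the branch
// structure of the new code in load_tensors.
#include <cstdint>
#include <cstdio>

// First GPU-resident layer when one exists, otherwise the last layer,
// and 0 for the degenerate case of a model with no layers.
static int input_anchor_layer(uint32_t n_layer, int i_gpu_start) {
    if (n_layer == 0) {
        return 0;
    }
    if (i_gpu_start < (int) n_layer) {
        return i_gpu_start;
    }
    return (int) n_layer - 1;
}

int main() {
    printf("%d\n", input_anchor_layer(32, 8));   // 8  -> first offloaded layer
    printf("%d\n", input_anchor_layer(32, 40));  // 31 -> clamped to last layer
    printf("%d\n", input_anchor_layer(0, 0));    // 0  -> no layers
    return 0;
}
```

Note that in the actual change the result only overrides `pimpl->dev_input` when the anchor layer's device is not `cpu_dev`, so models whose layers all stay on the CPU keep the default CPU placement for the input layer.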