
Commit b43556e

Merge branch 'ggml-org:master' into testing_fork
2 parents: 0e87250 + a812838

File tree: 11 files changed, +136 −43 lines

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 5 additions & 5 deletions
@@ -333,17 +333,17 @@ static void print_params(struct my_llama_hparams * params) {
 }

 static void print_tensor_info(const struct ggml_context * ctx) {
-    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+    for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
         LOG_INF("%s: Allocating ", __func__);
         int64_t total = 1;
         int i = 0;
         for (; i < ggml_n_dims(t); ++i) {
-            if (i > 0) LOG("x ");
-            LOG("[%" PRId64 "] ", t->ne[i]);
+            if (i > 0) { LOG_INF("x "); }
+            LOG_INF("[%" PRId64 "] ", t->ne[i]);
             total *= t->ne[i];
         }
-        if (i > 1) LOG("= [%" PRId64 "] ", total);
-        LOG("float space for %s\n", ggml_get_name(t));
+        if (i > 1) { LOG_INF("= [%" PRId64 "] ", total); }
+        LOG_INF("float space for %s\n", ggml_get_name(t));
     }
 }

ggml/src/gguf.cpp

Lines changed: 102 additions & 27 deletions
@@ -1166,50 +1166,51 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
     ctx->info[tensor_id].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
 }

-struct gguf_writer {
-    std::vector<int8_t> & buf;
+struct gguf_writer_base {
+    size_t written_bytes {0u};
+
+    ~gguf_writer_base(void) {}

-    gguf_writer(std::vector<int8_t> & buf) : buf(buf) {}
+    // we bet on devirtualization
+    virtual void write(int8_t val) = 0;
+    virtual void write(const std::vector<int8_t> & val) = 0;
+    virtual void write_tensor_data(const struct gguf_tensor_info & info, size_t offset_data, size_t alignment) = 0;

     template <typename T>
-    void write(const T & val) const {
+    void write(const T & val) {
         for (size_t i = 0; i < sizeof(val); ++i) {
-            buf.push_back(reinterpret_cast<const int8_t *>(&val)[i]);
+            write(reinterpret_cast<const int8_t *>(&val)[i]);
         }
     }

-    void write(const std::vector<int8_t> & val) const {
-        buf.insert(buf.end(), val.begin(), val.end());
-    }
-
-    void write(const bool & val) const {
+    void write(const bool & val) {
         const int8_t val8 = val ? 1 : 0;
         write(val8);
     }

-    void write(const std::string & val) const {
+    void write(const std::string & val) {
         {
             const uint64_t n = val.length();
             write(n);
         }
         for (size_t i = 0; i < val.length(); ++i) {
-            buf.push_back(reinterpret_cast<const int8_t *>(val.data())[i]);
+            write((val.data())[i]);
         }
     }

-    void write(const char * val) const {
+    void write(const char * val) {
         write(std::string(val));
     }

-    void write(const enum ggml_type & val) const {
+    void write(const enum ggml_type & val) {
         write(int32_t(val));
     }

-    void write(const enum gguf_type & val) const {
+    void write(const enum gguf_type & val) {
         write(int32_t(val));
     }

-    void write(const struct gguf_kv & kv) const {
+    void write(const struct gguf_kv & kv) {
         const uint64_t ne = kv.get_ne();

         write(kv.get_key());
@@ -1250,7 +1251,7 @@ struct gguf_writer {
         }
     }

-    void write_tensor_meta(const struct gguf_tensor_info & info) const {
+    void write_tensor_meta(const struct gguf_tensor_info & info) {
         write(info.t.name);

         const uint32_t n_dims = ggml_n_dims(&info.t);
@@ -1263,14 +1264,33 @@ struct gguf_writer {
         write(info.offset);
     }

-    void pad(const size_t alignment) const {
-        while (buf.size() % alignment != 0) {
+    void pad(const size_t alignment) {
+        while (written_bytes % alignment != 0) {
             const int8_t zero = 0;
             write(zero);
         }
     }
+};
+
+// vector buffer based writer
+struct gguf_writer_buf final : public gguf_writer_base {
+    std::vector<int8_t> & buf;
+
+    gguf_writer_buf(std::vector<int8_t> & buf) : buf(buf) {}
+
+    using gguf_writer_base::write;
+
+    void write(const int8_t val) override {
+        buf.push_back(val);
+        written_bytes++;
+    }

-    void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) const {
+    void write(const std::vector<int8_t> & val) override {
+        buf.insert(buf.end(), val.begin(), val.end());
+        written_bytes += val.size();
+    }
+
+    void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) override {
         GGML_ASSERT(buf.size() - offset_data == info.offset);

         GGML_ASSERT(ggml_is_contiguous(&info.t));
@@ -1284,14 +1304,58 @@ struct gguf_writer {
             GGML_ASSERT(info.t.data);
             memcpy(buf.data() + offset, info.t.data, nbytes);
         }
+        written_bytes += nbytes;

         pad(alignment);
     }
 };

-void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
-    const struct gguf_writer gw(buf);
+// file based writer
+struct gguf_writer_file final : public gguf_writer_base {
+    FILE * file;
+
+    gguf_writer_file(FILE* file) : file(file) {}
+
+    using gguf_writer_base::write;
+
+    void write(const int8_t val) override {
+        const auto real_val = static_cast<uint8_t>(val);
+        const auto ret = fputc(real_val, file);
+        written_bytes++;
+        if (ret != real_val) {
+            throw std::runtime_error("unexpected fputc result '" + std::to_string(ret) + "' instead of '" + std::to_string((int)real_val) + "'");
+        }
+    }
+
+    void write(const std::vector<int8_t> & val) override {
+        const auto ret = fwrite(val.data(), 1, val.size(), file);
+        written_bytes += val.size();
+        if (ret != val.size()) {
+            throw std::runtime_error("unexpected fwrite number of bytes written, '" + std::to_string(ret) + "' instead of '" + std::to_string(val.size()) + "'");
+        }
+    }
+
+    void write_tensor_data(const struct gguf_tensor_info & info, const size_t offset_data, const size_t alignment) override {
+        GGML_ASSERT(written_bytes - offset_data == info.offset);
+
+        GGML_ASSERT(ggml_is_contiguous(&info.t));
+        const size_t nbytes = ggml_nbytes(&info.t);

+        std::vector<int8_t> buf(nbytes);
+        if (info.t.buffer) {
+            ggml_backend_tensor_get(&info.t, buf.data(), 0, nbytes);
+        } else {
+            GGML_ASSERT(info.t.data);
+            memcpy(buf.data(), info.t.data, nbytes);
+        }
+        write(buf);
+
+        pad(alignment);
+    }
+};
+
+template <typename writer_t>
+static void gguf_write_out(const struct gguf_context * ctx, writer_t & gw, bool only_meta) {
     const int64_t n_kv = gguf_get_n_kv(ctx);
     const int64_t n_tensors = gguf_get_n_tensors(ctx);

@@ -1321,14 +1385,19 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & bu
         return;
     }

-    const size_t offset_data = gw.buf.size();
+    const size_t offset_data = gw.written_bytes;

     // write tensor data
     for (int64_t i = 0; i < n_tensors; ++i) {
         gw.write_tensor_data(ctx->info[i], offset_data, ctx->alignment);
     }
 }

+void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta) {
+    gguf_writer_buf gw(buf);
+    gguf_write_out(ctx, gw, only_meta);
+}
+
 bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
     FILE * file = ggml_fopen(fname, "wb");

@@ -1337,11 +1406,17 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
         return false;
     }

-    std::vector<int8_t> buf;
-    gguf_write_to_buf(ctx, buf, only_meta);
-    const bool ok = fwrite(buf.data(), 1, buf.size(), file) == buf.size();
+    try {
+        gguf_writer_file gw(file);
+        gguf_write_out(ctx, gw, only_meta);
+    } catch (const std::runtime_error& ex) {
+        GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what());
+        fclose(file);
+        return false;
+    }
+
     fclose(file);
-    return ok;
+    return true;
 }

 size_t gguf_get_meta_size(const struct gguf_context * ctx) {
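
Note: the hunks above replace the single buffer-backed gguf_writer with an abstract gguf_writer_base and two concrete writers, so gguf_write_to_file can stream bytes straight to disk instead of materializing the whole file in a std::vector first. Below is a minimal standalone sketch of that writer pattern, not the ggml sources: the class names are shortened, the GGUF/tensor-specific methods are omitted, and the bodies are simplified for illustration.

// Illustrative sketch only: an abstract byte sink that tracks how many bytes it
// has emitted, plus a buffer-backed and a file-backed implementation, mirroring
// the shape of gguf_writer_base / gguf_writer_buf / gguf_writer_file above.
#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

struct writer_base {
    size_t written_bytes = 0;

    virtual ~writer_base() = default;

    // byte-level primitives supplied by the concrete writers
    virtual void write(int8_t val) = 0;
    virtual void write(const std::vector<int8_t> & val) = 0;

    // fixed-size values are serialized byte by byte through the virtual overload
    template <typename T>
    void write(const T & val) {
        for (size_t i = 0; i < sizeof(val); ++i) {
            write(reinterpret_cast<const int8_t *>(&val)[i]);
        }
    }

    // pad with zeros until the running byte count is a multiple of the alignment
    void pad(size_t alignment) {
        while (written_bytes % alignment != 0) {
            write(int8_t(0));
        }
    }
};

// appends to a caller-owned byte buffer (the old gguf_write_to_buf path)
struct writer_buf final : writer_base {
    std::vector<int8_t> & buf;

    explicit writer_buf(std::vector<int8_t> & buf) : buf(buf) {}

    using writer_base::write;

    void write(int8_t val) override {
        buf.push_back(val);
        written_bytes++;
    }

    void write(const std::vector<int8_t> & val) override {
        buf.insert(buf.end(), val.begin(), val.end());
        written_bytes += val.size();
    }
};

// streams directly to a FILE*, throwing on short writes (the new file path)
struct writer_file final : writer_base {
    FILE * file;

    explicit writer_file(FILE * file) : file(file) {}

    using writer_base::write;

    void write(int8_t val) override {
        if (fputc(static_cast<unsigned char>(val), file) == EOF) {
            throw std::runtime_error("fputc failed");
        }
        written_bytes++;
    }

    void write(const std::vector<int8_t> & val) override {
        if (fwrite(val.data(), 1, val.size(), file) != val.size()) {
            throw std::runtime_error("fwrite wrote fewer bytes than requested");
        }
        written_bytes += val.size();
    }
};

int main() {
    std::vector<int8_t> buf;
    writer_buf gw(buf);
    gw.write(uint32_t(0x46554747)); // 4 bytes; "GGUF" on a little-endian host
    gw.write(std::vector<int8_t>{1, 2, 3});
    gw.pad(32);                     // zero-pad up to the next 32-byte boundary
    std::printf("wrote %zu bytes (buffer holds %zu)\n", gw.written_bytes, buf.size());
    return 0;
}

With this shape, the templated gguf_write_out(ctx, gw, only_meta) in the diff can serialize through either writer, and the only per-writer state it relies on is written_bytes, which is what the padding and the offset_data bookkeeping use instead of buf.size().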

src/llama-graph.cpp

Lines changed: 7 additions & 3 deletions
@@ -297,6 +297,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {

     float * data = (float *) kq_mask->data;

+    // [TAG_NO_CACHE_ISWA]
+    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+
     for (int h = 0; h < 1; ++h) {
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];
@@ -315,9 +318,10 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                     continue; // skip future tokens for causal attention
                 }

-                if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
-                    continue; // skip masked tokens for SWA
-                }
+                // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+                //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+                //    continue; // skip masked tokens for SWA
+                //}

                 // TODO: reimplement this like in llama_kv_cache_unified
                 if (hparams.use_alibi) {

src/llama-hparams.cpp

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ uint32_t llama_hparams::n_layer_kv() const {
     return res;
 }

-bool llama_hparams::is_masked_swa(llama_pos p0, llama_pos p1) const {
+bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
     assert(p0 >= 0 && p1 >= 0);

     switch (swa_type) {

src/llama-hparams.h

Lines changed: 4 additions & 1 deletion
@@ -229,7 +229,10 @@ struct llama_hparams {
     // number of layers for which has_kv() returns true
     uint32_t n_layer_kv() const;

-    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+    // note that this function uses different SWA parameters from those in the hparams
+    // TODO: think of a better place for this function
+    // TODO: pack the SWA params in a struct?
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
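
Note: after this change the SWA mask predicate is a static helper driven entirely by the (n_swa, swa_type) pair it is given, which is what lets each KV cache in the hunks below pass its own parameters instead of the model-wide hparams. The following is a hypothetical, self-contained sketch of such a check; the enum values and the standard-window rule are assumptions made for the illustration, not code taken from llama.cpp.

// Hypothetical illustration of a (n_swa, swa_type)-parameterized mask check.
// The enum and the masking rule below are assumptions for this sketch.
#include <cassert>
#include <cstdint>
#include <cstdio>

using llama_pos = int32_t; // local alias for the sketch

enum swa_type_t {
    SWA_NONE,     // no sliding window: the window masks nothing
    SWA_STANDARD, // keys older than the window size are masked
};

// returns true if the key at position p0 must be masked for the query at p1
static bool is_masked_swa(uint32_t n_swa, swa_type_t swa_type, llama_pos p0, llama_pos p1) {
    assert(p0 >= 0 && p1 >= 0);

    switch (swa_type) {
        case SWA_NONE:
            return false;
        case SWA_STANDARD:
            // assumed rule: keep only the last n_swa positions before the query
            return p1 - p0 >= (llama_pos) n_swa;
    }
    return false;
}

int main() {
    // a full-attention cache and a windowed cache can use different parameters,
    // as llama_kv_cache_iswa does with its base and SWA sub-caches
    const uint32_t n_swa = 4;
    for (llama_pos p0 = 0; p0 <= 8; ++p0) {
        std::printf("p0=%d  base(masked=%d)  swa(masked=%d)\n",
                    p0,
                    (int) is_masked_swa(0,     SWA_NONE,     p0, 8),
                    (int) is_masked_swa(n_swa, SWA_STANDARD, p0, 8));
    }
    return 0;
}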

src/llama-kv-cache-iswa.cpp

Lines changed: 2 additions & 2 deletions
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
     kv_base = std::make_unique<llama_kv_cache>(
             model, type_k, type_v,
             v_trans, offload, unified, size_base, n_seq_max, n_pad,
-            0, filter_base, reuse);
+            0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);

     LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

     kv_swa = std::make_unique<llama_kv_cache>(
             model, type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
-            hparams.n_swa, filter_swa, reuse);
+            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }

 void llama_kv_cache_iswa::clear(bool data) {

src/llama-kv-cache.cpp

Lines changed: 3 additions & 2 deletions
@@ -27,10 +27,11 @@ llama_kv_cache::llama_kv_cache(
         uint32_t n_seq_max,
         uint32_t n_pad,
         uint32_t n_swa,
+        llama_swa_type swa_type,
         const layer_filter_cb & filter,
         const layer_reuse_cb & reuse) :
     model(model), hparams(model.hparams), v_trans(v_trans),
-    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa) {
+    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {

     GGML_ASSERT(kv_size % n_pad == 0);

@@ -1392,7 +1393,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
 }

 bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
-    return hparams.is_masked_swa(p0, p1);
+    return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
 }

 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {

src/llama-kv-cache.h

Lines changed: 4 additions & 0 deletions
@@ -89,6 +89,7 @@ class llama_kv_cache : public llama_memory_i {
             uint32_t n_seq_max,
             uint32_t n_pad,
             uint32_t n_swa,
+            llama_swa_type swa_type,
             const layer_filter_cb & filter,
             const layer_reuse_cb & reuse);

@@ -211,6 +212,9 @@ class llama_kv_cache : public llama_memory_i {
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;

+    // this is the SWA type of the cache - not to be confused with the model SWA type
+    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

src/llama-memory-hybrid.cpp

Lines changed: 2 additions & 0 deletions
@@ -17,6 +17,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         uint32_t kv_size,
         uint32_t n_pad,
         uint32_t n_swa,
+        llama_swa_type swa_type,
         /* recurrent */
         ggml_type type_r,
         ggml_type type_s,
@@ -40,6 +41,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_seq_max,
         n_pad,
         n_swa,
+        swa_type,
         filter_attn == nullptr ?
             [&](int32_t il) { return !hparams.is_recurrent(il); }
             : filter_attn,

src/llama-memory-hybrid.h

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ class llama_memory_hybrid : public llama_memory_i {
             uint32_t kv_size,
             uint32_t n_pad,
             uint32_t n_swa,
+            llama_swa_type swa_type,
             /* recurrent */
             ggml_type type_r,
             ggml_type type_s,
