diff --git a/ggml-sys/ggml/CREDITS.txt b/ggml-sys/ggml/CREDITS.txt index 876de74b..036b95bc 100644 --- a/ggml-sys/ggml/CREDITS.txt +++ b/ggml-sys/ggml/CREDITS.txt @@ -1,4 +1,4 @@ -Vendored version: https://github.com/ggerganov/llama.cpp/commit/437e77855a54e69c86fe03bc501f63d9a3fddb0e +Vendored version: https://github.com/ggerganov/llama.cpp/commit/cc9cee8e9e7598bd280295f6264f36d3a9224006 The ggml.c and ggml.h files are distributed under the terms of the MIT license. Credit goes to the original authors: Copyright (c) 2023 Georgi Gerganov diff --git a/ggml-sys/ggml/ggml.c b/ggml-sys/ggml/ggml.c index 59e84ab4..8a60bc38 100644 --- a/ggml-sys/ggml/ggml.c +++ b/ggml-sys/ggml/ggml.c @@ -3219,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.pad =*/ { 0 }, }; - ggml_assert_aligned(result->data); + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads + //ggml_assert_aligned(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -3620,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + + result->nb[0] = src->nb[0]; + result->nb[1] = src->nb[1]; + result->nb[2] = src->nb[2]; + result->nb[3] = src->nb[3]; + + return result; } //////////////////////////////////////////////////////////////////////////////// @@ -4510,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d( return result; } +// ggml_view_3d + +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, + size_t nb2, + size_t offset) { + if (a->grad) { + GGML_ASSERT(false); // gradient propagation is not supported + } + + const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = result->nb[2]*ne2; + + result->op = GGML_OP_VIEW; + result->grad = NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store the offset here? 
+ + return result; +} + // ggml_permute struct ggml_tensor * ggml_permute( @@ -4845,7 +4884,6 @@ static void ggml_compute_forward_dup_f16( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4862,85 +4900,96 @@ static void ggml_compute_forward_dup_f16( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(ggml_fp16_t)) { - if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - const size_t rs = ne00*nb00; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; - } - } - } - } else if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; - } - } + if (src0->type == dst->type && + src0->ne[0] == dst->ne[0] && + src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); } } - } else { - GGML_ASSERT(false); // TODO: implement } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); + return; + } - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; + if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + + if (++i10 == ne00) { + i10 = 0; + 
if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } } } } } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } + } else { + GGML_ASSERT(false); // TODO: implement } } @@ -4949,7 +4998,6 @@ static void ggml_compute_forward_dup_f32( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4966,85 +5014,76 @@ static void ggml_compute_forward_dup_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00*nb00; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; - memcpy(dst_ptr, src0_ptr, rs); - - id++; - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; + if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, 
src0_ptr, sizeof(float)); + + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } + } else { + GGML_ASSERT(false); // TODO: implement } } @@ -7199,7 +7238,6 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7226,11 +7264,28 @@ static void ggml_compute_forward_rope_f32( assert(nb0 == sizeof(float)); - // TODO: optimize + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? 
n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -7256,7 +7311,6 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7283,10 +7337,28 @@ static void ggml_compute_forward_rope_f16( assert(nb0 == sizeof(ggml_fp16_t)); + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -9385,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ROPE: { - node->n_tasks = 1; + node->n_tasks = n_threads; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: diff --git a/ggml-sys/ggml/ggml.h b/ggml-sys/ggml/ggml.h index ad962b10..3c94efc3 100644 --- a/ggml-sys/ggml/ggml.h +++ b/ggml-sys/ggml/ggml.h @@ -558,6 +558,16 @@ struct ggml_tensor * ggml_view_2d( size_t nb1, // row stride in bytes size_t offset); +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + struct ggml_tensor * ggml_permute( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml-sys/src/lib.rs b/ggml-sys/src/lib.rs index 3640a6af..e7bbe03c 100644 --- a/ggml-sys/src/lib.rs +++ b/ggml-sys/src/lib.rs @@ -795,6 +795,18 @@ extern "C" { offset: usize, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_view_3d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + ne0: i64, + ne1: i64, + ne2: i64, + nb1: usize, + nb2: usize, + offset: usize, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_permute( ctx: *mut ggml_context, diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs index 22c9eee8..76a7e4ab 100644 --- a/ggml/src/lib.rs +++ b/ggml/src/lib.rs @@ -147,6 +147,12 @@ impl Context { self.new_tensor_raw(raw) } + /// Unknown, aside from the obvious. It's transposing something! + pub fn op_transpose(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { ggml_sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + /// Unknown. pub fn op_get_rows(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = @@ -235,6 +241,55 @@ impl Context { self.new_tensor_raw(tensor) } + /// Creates a 2D view over `a`. + pub fn op_view_2d( + &self, + a: &Tensor, + ne0: usize, + ne1: usize, + nb1: usize, + offset: usize, + ) -> Tensor { + let tensor = unsafe { + ggml_sys::ggml_view_2d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + nb1, + offset, + ) + }; + self.new_tensor_raw(tensor) + } + + /// Creates a 3d view over `a`. 
+ #[allow(clippy::too_many_arguments)] + pub fn op_view_3d( + &self, + a: &Tensor, + ne0: usize, + ne1: usize, + ne2: usize, + nb1: usize, + nb2: usize, + offset: usize, + ) -> Tensor { + let tensor = unsafe { + ggml_sys::ggml_view_3d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + usize_to_i64(ne2), + nb1, + nb2, + offset, + ) + }; + self.new_tensor_raw(tensor) + } + /// Copies `a` to `b` and returns `b`. pub fn op_cpy(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = @@ -264,6 +319,26 @@ impl Context { self.new_tensor_raw(tensor) } + /// In-place; reshapes `a` in accordance with the dimensions of `b` + pub fn op_reshape(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = + unsafe { ggml_sys::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// In-place; reshapes `a` in accordance with the specified dimensions. + pub fn op_reshape_2d(&self, a: &Tensor, ne0: usize, ne1: usize) -> Tensor { + let tensor = unsafe { + ggml_sys::ggml_reshape_2d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + ) + }; + self.new_tensor_raw(tensor) + } + /// In-place; reshapes `a` in accordance with the specified dimensions. pub fn op_reshape_3d(&self, a: &Tensor, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let tensor = unsafe { diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs index 86820ef3..e4b54d35 100644 --- a/llama-cli/src/cli_args.rs +++ b/llama-cli/src/cli_args.rs @@ -232,7 +232,6 @@ impl Generate { } }), play_back_previous_tokens: session_loaded, - ..Default::default() } } } diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs index d5ef2a23..14553379 100644 --- a/llama-rs/src/lib.rs +++ b/llama-rs/src/lib.rs @@ -233,11 +233,6 @@ pub struct InferenceParameters { pub bias_tokens: TokenBias, /// Whether or not previous tokens should be played back in [InferenceSession::inference_with_prompt]. pub play_back_previous_tokens: bool, - /// If set, the inference process will behave more deterministically at the potential cost of performance. - /// - /// Note that this does not guarantee full determinism. When run on the same machine with the same parameters, - /// seed, and this set, inference should be identical, but this is not guaranteed to hold across machines. - pub increased_determinism: bool, } impl Default for InferenceParameters { @@ -251,7 +246,6 @@ impl Default for InferenceParameters { temperature: 0.80, bias_tokens: TokenBias::default(), play_back_previous_tokens: false, - increased_determinism: true, } } } @@ -1135,7 +1129,9 @@ impl Model { let n = input_tokens.len(); let n_past = session.n_past; let n_threads = params.n_threads; - let increased_determinism = params.increased_determinism; + + let memk_elsize = session.memory_k.element_size(); + let memv_elsize = session.memory_v.element_size(); let Hyperparameters { n_vocab, @@ -1164,27 +1160,6 @@ impl Model { let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); - // Defined here to avoid repetition and creating a binding inside nested loops. - // See the call site below for more context. 
- let vtrans_fun = |il: usize| -> ggml::Tensor { - ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_v, - (n_past + n) * n_embd, - il * n_ctx * session.memory_v.element_size() * n_embd, - ), - n_embd / n_head, - n_head, - n_past + n, - ), - 1, - 2, - 0, - 3, - ) - }; - for il in 0..n_layer { let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; @@ -1202,61 +1177,70 @@ impl Model { // self-attention { - let q_current = ctx0.op_mul_mat(&self.layers[il].wq, ¤t); - let k_current = ctx0.op_mul_mat(&self.layers[il].wk, ¤t); - let v_current = ctx0.op_mul_mat(&self.layers[il].wv, ¤t); + // compute Q and K and RoPE them + let q_current = ctx0.op_rope( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wq, ¤t), + n_embd / n_head, + n_head, + n, + ), + n_past, + n_rot, + 0, + ); + let k_current = ctx0.op_rope( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), + n_embd / n_head, + n_head, + n, + ), + n_past, + n_rot, + 0, + ); // store key and value to memory - if n >= 1 { + { + // compute the transposed [N, n_embd] V matrix + let v_current = ctx0.op_transpose(&ctx0.op_reshape_2d( + &ctx0.op_mul_mat(&self.layers[il].wv, ¤t), + n_embd, + n, + )); + let k = ctx0.op_view_1d( &session.memory_k, n * n_embd, - (session.memory_k.element_size() * n_embd) * (il * n_ctx + n_past), + (memk_elsize * n_embd) * (il * n_ctx + n_past), ); - let v = ctx0.op_view_1d( + let v = ctx0.op_view_2d( &session.memory_v, - n * n_embd, - (session.memory_v.element_size() * n_embd) * (il * n_ctx + n_past), + n, + n_embd, + n_ctx * memv_elsize, + (il * n_ctx) * memv_elsize * n_embd + n_past * memv_elsize, ); + // important: storing RoPE-ed version of K in the KV cache! gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); } - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - let q = ctx0.op_permute( - &ctx0.op_rope( - &ctx0.op_cpy( - &q_current, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), - ), - n_past, - n_rot, - 0, - ), - 0, - 2, - 1, - 3, - ); + let q = ctx0.op_permute(&q_current, 0, 2, 1, 3); - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) let k = ctx0.op_permute( - &ctx0.op_rope( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_k, - (n_past + n) * n_embd, - il * n_ctx * session.memory_k.element_size() * n_embd, - ), - n_embd / n_head, - n_head, - n_past + n, + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &session.memory_k, + (n_past + n) * n_embd, + il * n_ctx * memk_elsize * n_embd, ), - n_past, - n_rot, - 1, + n_embd / n_head, + n_head, + n_past + n, ), 0, 2, @@ -1279,25 +1263,18 @@ impl Model { // KQ = soft_max(KQ_masked) let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - let v_transposed = { - if !increased_determinism { - vtrans_fun(il) - } else { - ctx0.op_cpy( - &vtrans_fun(il), - &ctx0.new_tensor_3d( - ggml::Type::F32, - n_past + n, - n_embd / n_head, - n_head, - ), - ) - } - }; + // split cached V into n_head heads + let v = ctx0.op_view_3d( + &session.memory_v, + n_past + n, + n_embd / n_head, + n_head, + n_ctx * memv_elsize, + n_ctx * memv_elsize * n_embd / n_head, + il * n_ctx * memv_elsize * n_embd, + ); - // KQV = transpose(V) * KQ_soft_max - let k_q_v = ctx0.op_mul_mat(&v_transposed, &k_q_soft_max); + let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) let 
k_q_v_merged = ctx0.op_permute(&k_q_v, 0, 2, 1, 3);
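
Note on the new V-cache layout: memory_v is now stored transposed, so each layer owns an [n_embd, n_ctx] slab and attention reads it back through ggml_view_3d instead of the old permute-then-copy path. The following is a minimal Rust sketch, not part of the patch, that spells out how the op_view_3d arguments used in the attention block map onto that layout; the helper name v_cache_view_args is hypothetical and exists only for illustration.

// Hypothetical helper (illustration only): reproduces the argument arithmetic
// passed to ctx0.op_view_3d(&session.memory_v, ...) in the self-attention block.
fn v_cache_view_args(
    il: usize,          // layer index
    n_past: usize,      // tokens already in the cache
    n: usize,           // new tokens in this evaluation
    n_embd: usize,
    n_head: usize,
    n_ctx: usize,
    memv_elsize: usize, // element size of memory_v in bytes (e.g. 4 for F32)
) -> (usize, usize, usize, usize, usize, usize) {
    let ne0 = n_past + n;      // cached positions visible to this step
    let ne1 = n_embd / n_head; // embedding dimensions per head
    let ne2 = n_head;          // number of heads
    // memory_v rows have length n_ctx, one row per embedding dimension,
    // so consecutive dimensions are n_ctx elements apart...
    let nb1 = n_ctx * memv_elsize;
    // ...and consecutive heads are (n_embd / n_head) such rows apart.
    let nb2 = n_ctx * memv_elsize * n_embd / n_head;
    // each layer owns an n_embd * n_ctx slab of the cache.
    let offset = il * n_ctx * memv_elsize * n_embd;
    (ne0, ne1, ne2, nb1, nb2, offset)
}

Because op_mul_mat can consume this strided view of the cached V directly, the previous vtrans_fun permute/copy workaround (and the increased_determinism flag that toggled it) is no longer needed, which is why both are removed above.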
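
The ROPE op is also switched from a single task to n_tasks = n_threads; inside ggml_compute_forward_rope_f32/f16 each thread now skips rows outside its half-open range. A small standalone sketch of that row split, again for illustration only (the helper name rope_row_range is not in the patch):

// Illustration only: the per-thread row range computed inside the rope kernels.
fn rope_row_range(nr: usize, nth: usize, ith: usize) -> (usize, usize) {
    let dr = (nr + nth - 1) / nth; // rows per thread, rounded up
    let ir0 = dr * ith;            // first row owned by thread `ith`
    let ir1 = (ir0 + dr).min(nr);  // one past the last row
    (ir0, ir1)                     // this thread processes rows ir0..ir1
}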