diff --git a/ggml-sys/ggml/CREDITS.txt b/ggml-sys/ggml/CREDITS.txt index 876de74b..036b95bc 100644 --- a/ggml-sys/ggml/CREDITS.txt +++ b/ggml-sys/ggml/CREDITS.txt @@ -1,4 +1,4 @@ -Vendored version: https://github.com/ggerganov/llama.cpp/commit/437e77855a54e69c86fe03bc501f63d9a3fddb0e +Vendored version: https://github.com/ggerganov/llama.cpp/commit/cc9cee8e9e7598bd280295f6264f36d3a9224006 The ggml.c and ggml.h files are distributed under the terms of the MIT license. Credit goes to the original authors: Copyright (c) 2023 Georgi Gerganov diff --git a/ggml-sys/ggml/ggml.c b/ggml-sys/ggml/ggml.c index 59e84ab4..8a60bc38 100644 --- a/ggml-sys/ggml/ggml.c +++ b/ggml-sys/ggml/ggml.c @@ -3219,7 +3219,8 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.pad =*/ { 0 }, }; - ggml_assert_aligned(result->data); + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads + //ggml_assert_aligned(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -3620,7 +3621,14 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + + result->nb[0] = src->nb[0]; + result->nb[1] = src->nb[1]; + result->nb[2] = src->nb[2]; + result->nb[3] = src->nb[3]; + + return result; } //////////////////////////////////////////////////////////////////////////////// @@ -4510,6 +4518,37 @@ struct ggml_tensor * ggml_view_2d( return result; } +// ggml_view_3d + +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, + size_t nb2, + size_t offset) { + if (a->grad) { + GGML_ASSERT(false); // gradient propagation is not supported + } + + const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; + + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + + result->nb[1] = nb1; + result->nb[2] = nb2; + result->nb[3] = result->nb[2]*ne2; + + result->op = GGML_OP_VIEW; + result->grad = NULL; + result->src0 = a; + result->src1 = NULL; // TODO: maybe store the offset here? 
+ + return result; +} + // ggml_permute struct ggml_tensor * ggml_permute( @@ -4845,7 +4884,6 @@ static void ggml_compute_forward_dup_f16( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4862,85 +4900,96 @@ static void ggml_compute_forward_dup_f16( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(ggml_fp16_t)) { - if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - const size_t rs = ne00*nb00; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; - - memcpy(dst_ptr, src0_ptr, rs); - - id++; - } - } - } - } else if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; - } - } + if (src0->type == dst->type && + src0->ne[0] == dst->ne[0] && + src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); } } - } else { - GGML_ASSERT(false); // TODO: implement } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); + return; + } - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; + if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); + + if (++i10 == ne00) { + i10 = 0; + 
if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } } } } } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } + } else { + GGML_ASSERT(false); // TODO: implement } } @@ -4949,7 +4998,6 @@ static void ggml_compute_forward_dup_f32( const struct ggml_tensor * src0, struct ggml_tensor * dst) { GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -4966,85 +5014,76 @@ static void ggml_compute_forward_dup_f32( const size_t nb02 = src0->nb[2]; const size_t nb03 = src0->nb[3]; - if (ggml_is_contiguous(src0) && src0->type == dst->type) { + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); return; } - if (src0->nb[0] == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00*nb00; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - char * dst_ptr = (char *) dst->data + id*rs; + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; - memcpy(dst_ptr, src0_ptr, rs); - - id++; - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; + if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, 
src0_ptr, sizeof(float)); + + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + + if (++i10 == dst->ne[0]) { + i10 = 0; + if (++i11 == dst->ne[1]) { + i11 = 0; + if (++i12 == dst->ne[2]) { + i12 = 0; + if (++i13 == dst->ne[3]) { + i13 = 0; + } + } + } } } } } - } else { - GGML_ASSERT(false); // TODO: implement } + } else { + GGML_ASSERT(false); // TODO: implement } } @@ -7199,7 +7238,6 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7226,11 +7264,28 @@ static void ggml_compute_forward_rope_f32( assert(nb0 == sizeof(float)); - // TODO: optimize + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? 
n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -7256,7 +7311,6 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(params->ith == 0); assert(src1->type == GGML_TYPE_I32); assert(ggml_nelements(src1) == 3); @@ -7283,10 +7337,28 @@ static void ggml_compute_forward_rope_f16( assert(nb0 == sizeof(ggml_fp16_t)); + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + // row index used to determine which thread to use + int ir = 0; + for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) { const int p = (mode == 0 ? n_past + i2 : i2); for (int64_t i1 = 0; i1 < ne1; i1++) { + if (ir++ < ir0) continue; + if (ir > ir1) break; + for (int i0 = 0; i0 < n_dims; i0 += 2) { const float theta = powf(10000.0, ((float)-i0)/n_dims); @@ -9385,7 +9457,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ROPE: { - node->n_tasks = 1; + node->n_tasks = n_threads; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: diff --git a/ggml-sys/ggml/ggml.h b/ggml-sys/ggml/ggml.h index ad962b10..3c94efc3 100644 --- a/ggml-sys/ggml/ggml.h +++ b/ggml-sys/ggml/ggml.h @@ -558,6 +558,16 @@ struct ggml_tensor * ggml_view_2d( size_t nb1, // row stride in bytes size_t offset); +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + struct ggml_tensor * ggml_permute( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml-sys/src/lib.rs b/ggml-sys/src/lib.rs index 3640a6af..e7bbe03c 100644 --- a/ggml-sys/src/lib.rs +++ b/ggml-sys/src/lib.rs @@ -795,6 +795,18 @@ extern "C" { offset: usize, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_view_3d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + ne0: i64, + ne1: i64, + ne2: i64, + nb1: usize, + nb2: usize, + offset: usize, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_permute( ctx: *mut ggml_context, diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs index 22c9eee8..76a7e4ab 100644 --- a/ggml/src/lib.rs +++ b/ggml/src/lib.rs @@ -147,6 +147,12 @@ impl Context { self.new_tensor_raw(raw) } + /// Unknown, aside from the obvious. It's transposing something! + pub fn op_transpose(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { ggml_sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + /// Unknown. pub fn op_get_rows(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = @@ -235,6 +241,55 @@ impl Context { self.new_tensor_raw(tensor) } + /// Creates a 2D view over `a`. + pub fn op_view_2d( + &self, + a: &Tensor, + ne0: usize, + ne1: usize, + nb1: usize, + offset: usize, + ) -> Tensor { + let tensor = unsafe { + ggml_sys::ggml_view_2d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + nb1, + offset, + ) + }; + self.new_tensor_raw(tensor) + } + + /// Creates a 3d view over `a`. 
+ #[allow(clippy::too_many_arguments)] + pub fn op_view_3d( + &self, + a: &Tensor, + ne0: usize, + ne1: usize, + ne2: usize, + nb1: usize, + nb2: usize, + offset: usize, + ) -> Tensor { + let tensor = unsafe { + ggml_sys::ggml_view_3d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + usize_to_i64(ne2), + nb1, + nb2, + offset, + ) + }; + self.new_tensor_raw(tensor) + } + /// Copies `a` to `b` and returns `b`. pub fn op_cpy(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = @@ -264,6 +319,26 @@ impl Context { self.new_tensor_raw(tensor) } + /// In-place; reshapes `a` in accordance with the dimensions of `b` + pub fn op_reshape(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = + unsafe { ggml_sys::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// In-place; reshapes `a` in accordance with the specified dimensions. + pub fn op_reshape_2d(&self, a: &Tensor, ne0: usize, ne1: usize) -> Tensor { + let tensor = unsafe { + ggml_sys::ggml_reshape_2d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + ) + }; + self.new_tensor_raw(tensor) + } + /// In-place; reshapes `a` in accordance with the specified dimensions. pub fn op_reshape_3d(&self, a: &Tensor, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let tensor = unsafe { diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs index 86820ef3..e4b54d35 100644 --- a/llama-cli/src/cli_args.rs +++ b/llama-cli/src/cli_args.rs @@ -232,7 +232,6 @@ impl Generate { } }), play_back_previous_tokens: session_loaded, - ..Default::default() } } } diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs index d5ef2a23..14553379 100644 --- a/llama-rs/src/lib.rs +++ b/llama-rs/src/lib.rs @@ -233,11 +233,6 @@ pub struct InferenceParameters { pub bias_tokens: TokenBias, /// Whether or not previous tokens should be played back in [InferenceSession::inference_with_prompt]. pub play_back_previous_tokens: bool, - /// If set, the inference process will behave more deterministically at the potential cost of performance. - /// - /// Note that this does not guarantee full determinism. When run on the same machine with the same parameters, - /// seed, and this set, inference should be identical, but this is not guaranteed to hold across machines. - pub increased_determinism: bool, } impl Default for InferenceParameters { @@ -251,7 +246,6 @@ impl Default for InferenceParameters { temperature: 0.80, bias_tokens: TokenBias::default(), play_back_previous_tokens: false, - increased_determinism: true, } } } @@ -1135,7 +1129,9 @@ impl Model { let n = input_tokens.len(); let n_past = session.n_past; let n_threads = params.n_threads; - let increased_determinism = params.increased_determinism; + + let memk_elsize = session.memory_k.element_size(); + let memv_elsize = session.memory_v.element_size(); let Hyperparameters { n_vocab, @@ -1164,27 +1160,6 @@ impl Model { let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); - // Defined here to avoid repetition and creating a binding inside nested loops. - // See the call site below for more context. 
- let vtrans_fun = |il: usize| -> ggml::Tensor { - ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_v, - (n_past + n) * n_embd, - il * n_ctx * session.memory_v.element_size() * n_embd, - ), - n_embd / n_head, - n_head, - n_past + n, - ), - 1, - 2, - 0, - 3, - ) - }; - for il in 0..n_layer { let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; @@ -1202,61 +1177,70 @@ impl Model { // self-attention { - let q_current = ctx0.op_mul_mat(&self.layers[il].wq, ¤t); - let k_current = ctx0.op_mul_mat(&self.layers[il].wk, ¤t); - let v_current = ctx0.op_mul_mat(&self.layers[il].wv, ¤t); + // compute Q and K and RoPE them + let q_current = ctx0.op_rope( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wq, ¤t), + n_embd / n_head, + n_head, + n, + ), + n_past, + n_rot, + 0, + ); + let k_current = ctx0.op_rope( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), + n_embd / n_head, + n_head, + n, + ), + n_past, + n_rot, + 0, + ); // store key and value to memory - if n >= 1 { + { + // compute the transposed [N, n_embd] V matrix + let v_current = ctx0.op_transpose(&ctx0.op_reshape_2d( + &ctx0.op_mul_mat(&self.layers[il].wv, ¤t), + n_embd, + n, + )); + let k = ctx0.op_view_1d( &session.memory_k, n * n_embd, - (session.memory_k.element_size() * n_embd) * (il * n_ctx + n_past), + (memk_elsize * n_embd) * (il * n_ctx + n_past), ); - let v = ctx0.op_view_1d( + let v = ctx0.op_view_2d( &session.memory_v, - n * n_embd, - (session.memory_v.element_size() * n_embd) * (il * n_ctx + n_past), + n, + n_embd, + n_ctx * memv_elsize, + (il * n_ctx) * memv_elsize * n_embd + n_past * memv_elsize, ); + // important: storing RoPE-ed version of K in the KV cache! gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); } - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - let q = ctx0.op_permute( - &ctx0.op_rope( - &ctx0.op_cpy( - &q_current, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), - ), - n_past, - n_rot, - 0, - ), - 0, - 2, - 1, - 3, - ); + let q = ctx0.op_permute(&q_current, 0, 2, 1, 3); - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) let k = ctx0.op_permute( - &ctx0.op_rope( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_k, - (n_past + n) * n_embd, - il * n_ctx * session.memory_k.element_size() * n_embd, - ), - n_embd / n_head, - n_head, - n_past + n, + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &session.memory_k, + (n_past + n) * n_embd, + il * n_ctx * memk_elsize * n_embd, ), - n_past, - n_rot, - 1, + n_embd / n_head, + n_head, + n_past + n, ), 0, 2, @@ -1279,25 +1263,18 @@ impl Model { // KQ = soft_max(KQ_masked) let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - let v_transposed = { - if !increased_determinism { - vtrans_fun(il) - } else { - ctx0.op_cpy( - &vtrans_fun(il), - &ctx0.new_tensor_3d( - ggml::Type::F32, - n_past + n, - n_embd / n_head, - n_head, - ), - ) - } - }; + // split cached V into n_head heads + let v = ctx0.op_view_3d( + &session.memory_v, + n_past + n, + n_embd / n_head, + n_head, + n_ctx * memv_elsize, + n_ctx * memv_elsize * n_embd / n_head, + il * n_ctx * memv_elsize * n_embd, + ); - // KQV = transpose(V) * KQ_soft_max - let k_q_v = ctx0.op_mul_mat(&v_transposed, &k_q_soft_max); + let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) let 
k_q_v_merged = ctx0.op_permute(&k_q_v, 0, 2, 1, 3);
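
Note on the new V-cache layout: memory_v is now stored transposed, so each layer owns an [n_embd, n_ctx] slab and attention reads it back through ggml_view_3d instead of the old permute-then-copy path. The following is a minimal Rust sketch, not part of the patch, that spells out how the op_view_3d arguments used in the attention block map onto that layout; the helper name v_cache_view_args is hypothetical and exists only for illustration.

// Hypothetical helper (illustration only): reproduces the argument arithmetic
// passed to ctx0.op_view_3d(&session.memory_v, ...) in the self-attention block.
fn v_cache_view_args(
    il: usize,          // layer index
    n_past: usize,      // tokens already in the cache
    n: usize,           // new tokens in this evaluation
    n_embd: usize,
    n_head: usize,
    n_ctx: usize,
    memv_elsize: usize, // element size of memory_v in bytes (e.g. 4 for F32)
) -> (usize, usize, usize, usize, usize, usize) {
    let ne0 = n_past + n;      // cached positions visible to this step
    let ne1 = n_embd / n_head; // embedding dimensions per head
    let ne2 = n_head;          // number of heads
    // memory_v rows have length n_ctx, one row per embedding dimension,
    // so consecutive dimensions are n_ctx elements apart...
    let nb1 = n_ctx * memv_elsize;
    // ...and consecutive heads are (n_embd / n_head) such rows apart.
    let nb2 = n_ctx * memv_elsize * n_embd / n_head;
    // each layer owns an n_embd * n_ctx slab of the cache.
    let offset = il * n_ctx * memv_elsize * n_embd;
    (ne0, ne1, ne2, nb1, nb2, offset)
}

Because op_mul_mat can consume this strided view of the cached V directly, the previous vtrans_fun permute/copy workaround (and the increased_determinism flag that toggled it) is no longer needed, which is why both are removed above.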
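
The ROPE op is also switched from a single task to n_tasks = n_threads; inside ggml_compute_forward_rope_f32/f16 each thread now skips rows outside its half-open range. A small standalone sketch of that row split, again for illustration only (the helper name rope_row_range is not in the patch):

// Illustration only: the per-thread row range computed inside the rope kernels.
fn rope_row_range(nr: usize, nth: usize, ith: usize) -> (usize, usize) {
    let dr = (nr + nth - 1) / nth; // rows per thread, rounded up
    let ir0 = dr * ith;            // first row owned by thread `ith`
    let ir1 = (ir0 + dr).min(nr);  // one past the last row
    (ir0, ir1)                     // this thread processes rows ir0..ir1
}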