diff --git a/src/bench_internal.c b/src/bench_internal.c
index 827a389938..1a06e9441a 100644
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@@ -241,7 +241,7 @@ void bench_wnaf_const(void* arg) {
     bench_inv_t *data = (bench_inv_t*)arg;
 
     for (i = 0; i < 20000; i++) {
-        secp256k1_wnaf_const(data->wnaf, &data->scalar_x, WINDOW_A);
+        secp256k1_wnaf_const(data->wnaf, data->scalar_x, WINDOW_A);
         secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
     }
 }
diff --git a/src/ecmult_const_impl.h b/src/ecmult_const_impl.h
index 956d5e4c0f..3632d2ecb5 100644
--- a/src/ecmult_const_impl.h
+++ b/src/ecmult_const_impl.h
@@ -12,7 +12,11 @@
 #include "ecmult_const.h"
 #include "ecmult_impl.h"
 
-#define WNAF_BITS 256
+#ifdef USE_ENDOMORPHISM
+    #define WNAF_BITS 128
+#else
+    #define WNAF_BITS 256
+#endif
 #define WNAF_SIZE(w) ((WNAF_BITS + (w) - 1) / (w))
 
 /* This is like `ECMULT_TABLE_GET_GE` but is constant time */
@@ -49,17 +53,47 @@
  *
  *  Numbers reference steps of `Algorithm SPA-resistant Width-w NAF with Odd Scalar` on pp. 335
  */
-static void secp256k1_wnaf_const(int *wnaf, const secp256k1_scalar_t *a, int w) {
-    secp256k1_scalar_t s = *a;
-    /* Negate to force oddness */
-    int is_even = secp256k1_scalar_is_even(&s);
-    int global_sign = secp256k1_scalar_cond_negate(&s, is_even);
-
+static int secp256k1_wnaf_const(int *wnaf, secp256k1_scalar_t s, int w) {
+    int global_sign = 1;
+    int skew = 0;
     int word = 0;
     /* 1 2 3 */
-    int u_last = secp256k1_scalar_shr_int(&s, w);
+    int u_last;
     int u;
+
+#ifdef USE_ENDOMORPHISM
+    /* If we are using the endomorphism, we cannot handle even numbers by negating
+     * them, since we are working with 128-bit numbers whose negations would be 256
+     * bits, eliminating the performance advantage. Instead we use a technique from
+     * Section 4.2 of the Okeya/Takagi paper, which is to add either 1 (for even)
+     * or 2 (for odd) to the number we are encoding, then compensate after the
+     * multiplication. */
+    /* Negative 128-bit numbers will be negated, since otherwise they are 256-bit */
+    int flip = secp256k1_scalar_is_high(&s);
+    /* We add 1 to even numbers, 2 to odd ones, noting that negation flips parity */
+    int bit = flip ^ (s.d[0] & 1);
+    /* We check for negative one, since adding 2 to it will cause an overflow */
+    secp256k1_scalar_t neg_s;
+    int not_neg_one;
+    secp256k1_scalar_negate(&neg_s, &s);
+    not_neg_one = !secp256k1_scalar_is_one(&neg_s);
+    secp256k1_scalar_cadd_bit(&s, bit, not_neg_one);
+    /* If we had negative one, flip == 1, (s.d[0] & 1) == 0, bit == 1, so caller expects
+     * that we added two to it and flipped it. In fact for -1 these operations are
+     * identical. We only flipped, but since skewing is required (in the sense that
+     * the skew must be 1 or 2, never zero) and flipping is not, we need to change
+     * our flags to claim that we only skewed. */
+    global_sign = secp256k1_scalar_cond_negate(&s, flip);
+    global_sign *= not_neg_one * 2 - 1;
+    skew = 1 << bit;
+#else
+    /* Otherwise, we just negate to force oddness */
+    int is_even = secp256k1_scalar_is_even(&s);
+    global_sign = secp256k1_scalar_cond_negate(&s, is_even);
+#endif
+
     /* 4 */
+    u_last = secp256k1_scalar_shr_int(&s, w);
     while (word * w < WNAF_BITS) {
         int sign;
         int even;
@@ -81,6 +115,7 @@ static void secp256k1_wnaf_const(int *wnaf, const secp256k1_scalar_t *a, int w)
 
     VERIFY_CHECK(secp256k1_scalar_is_zero(&s));
     VERIFY_CHECK(word == WNAF_SIZE(w));
+    return skew;
 }
 
 
@@ -89,17 +124,37 @@ static void secp256k1_ecmult_const(secp256k1_gej_t *r, const secp256k1_ge_t *a,
     secp256k1_ge_t tmpa;
     secp256k1_fe_t Z;
 
+#ifdef USE_ENDOMORPHISM
+    secp256k1_ge_t pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
+    int wnaf_1[1 + WNAF_SIZE(WINDOW_A - 1)];
+    int wnaf_lam[1 + WNAF_SIZE(WINDOW_A - 1)];
+    int skew_1;
+    int skew_lam;
+    secp256k1_scalar_t q_1, q_lam;
+#else
     int wnaf[1 + WNAF_SIZE(WINDOW_A - 1)];
+#endif
 
     int i;
-    int is_zero = secp256k1_scalar_is_zero(scalar);
     secp256k1_scalar_t sc = *scalar;
+
+    /* build wnaf representation for q. */
+#ifdef USE_ENDOMORPHISM
+    /* split q into q_1 and q_lam (where q = q_1 + q_lam*lambda, and q_1 and q_lam are ~128 bit) */
+    secp256k1_scalar_split_lambda(&q_1, &q_lam, &sc);
+    /* no need for zero correction when using endomorphism since even
+     * numbers have one added to them anyway */
+    skew_1   = secp256k1_wnaf_const(wnaf_1,   q_1,   WINDOW_A - 1);
+    skew_lam = secp256k1_wnaf_const(wnaf_lam, q_lam, WINDOW_A - 1);
+#else
+    int is_zero = secp256k1_scalar_is_zero(scalar);
     /* the wNAF ladder cannot handle zero, so bump this to one .. we will
      * correct the result after the fact */
     sc.d[0] += is_zero;
+    VERIFY_CHECK(!secp256k1_scalar_is_zero(&sc));
 
-    /* build wnaf representation for q. */
-    secp256k1_wnaf_const(wnaf, &sc, WINDOW_A - 1);
+    secp256k1_wnaf_const(wnaf, sc, WINDOW_A - 1);
+#endif
 
     /* Calculate odd multiples of a.
      * All multiples are brought to the same Z 'denominator', which is stored
@@ -109,14 +164,31 @@ static void secp256k1_ecmult_const(secp256k1_gej_t *r, const secp256k1_ge_t *a,
      */
     secp256k1_gej_set_ge(r, a);
     secp256k1_ecmult_odd_multiples_table_globalz_windowa(pre_a, &Z, r);
+#ifdef USE_ENDOMORPHISM
+    for (i = 0; i < ECMULT_TABLE_SIZE(WINDOW_A); i++) {
+        secp256k1_ge_mul_lambda(&pre_a_lam[i], &pre_a[i]);
+    }
+#endif
 
     /* first loop iteration (separated out so we can directly set r, rather
      * than having it start at infinity, get doubled several times, then have
      * its new value added to it) */
+#ifdef USE_ENDOMORPHISM
+    i = wnaf_1[WNAF_SIZE(WINDOW_A - 1)];
+    VERIFY_CHECK(i != 0);
+    ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a, i, WINDOW_A);
+    secp256k1_gej_set_ge(r, &tmpa);
+
+    i = wnaf_lam[WNAF_SIZE(WINDOW_A - 1)];
+    VERIFY_CHECK(i != 0);
+    ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a_lam, i, WINDOW_A);
+    secp256k1_gej_add_ge(r, r, &tmpa);
+#else
     i = wnaf[WNAF_SIZE(WINDOW_A - 1)];
     VERIFY_CHECK(i != 0);
     ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a, i, WINDOW_A);
     secp256k1_gej_set_ge(r, &tmpa);
+#endif
     /* remaining loop iterations */
     for (i = WNAF_SIZE(WINDOW_A - 1) - 1; i >= 0; i--) {
         int n;
@@ -124,16 +196,59 @@ static void secp256k1_ecmult_const(secp256k1_gej_t *r, const secp256k1_ge_t *a,
         for (j = 0; j < WINDOW_A - 1; ++j) {
             secp256k1_gej_double_nonzero(r, r, NULL);
         }
+#ifdef USE_ENDOMORPHISM
+        n = wnaf_1[i];
+        ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a, n, WINDOW_A);
+        VERIFY_CHECK(n != 0);
+        secp256k1_gej_add_ge(r, r, &tmpa);
+
+        n = wnaf_lam[i];
+        ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a_lam, n, WINDOW_A);
+        VERIFY_CHECK(n != 0);
+        secp256k1_gej_add_ge(r, r, &tmpa);
+#else
         n = wnaf[i];
         VERIFY_CHECK(n != 0);
         ECMULT_CONST_TABLE_GET_GE(&tmpa, pre_a, n, WINDOW_A);
         secp256k1_gej_add_ge(r, r, &tmpa);
+#endif
     }
 
     secp256k1_fe_mul(&r->z, &r->z, &Z);
 
+#ifdef USE_ENDOMORPHISM
+    {
+        /* Correct for wNAF skew */
+        secp256k1_ge_t correction = *a;
+        secp256k1_ge_storage_t correction_1_stor;
+        secp256k1_ge_storage_t correction_lam_stor;
+        secp256k1_ge_storage_t a2_stor;
+        secp256k1_gej_t tmpj;
+        secp256k1_gej_set_ge(&tmpj, &correction);
+        secp256k1_gej_double_var(&tmpj, &tmpj, NULL);
+        secp256k1_ge_set_gej(&correction, &tmpj);
+        secp256k1_ge_to_storage(&correction_1_stor, a);
+        secp256k1_ge_to_storage(&correction_lam_stor, a);
+        secp256k1_ge_to_storage(&a2_stor, &correction);
+
+        /* For odd numbers this is 2a (so replace it), for even ones a (so no-op) */
+        secp256k1_ge_storage_cmov(&correction_1_stor, &a2_stor, skew_1 == 2);
+        secp256k1_ge_storage_cmov(&correction_lam_stor, &a2_stor, skew_lam == 2);
+
+        /* Apply the correction */
+        secp256k1_ge_from_storage(&correction, &correction_1_stor);
+        secp256k1_ge_neg(&correction, &correction);
+        secp256k1_gej_add_ge(r, r, &correction);
+
+        secp256k1_ge_from_storage(&correction, &correction_lam_stor);
+        secp256k1_ge_neg(&correction, &correction);
+        secp256k1_ge_mul_lambda(&correction, &correction);
+        secp256k1_gej_add_ge(r, r, &correction);
+    }
+#else
     /* correct for zero */
     r->infinity |= is_zero;
+#endif
 }
 
 #endif
diff --git a/src/tests.c b/src/tests.c
index bfcc8bce11..feb9576330 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -1550,10 +1550,21 @@ void test_constant_wnaf(const secp256k1_scalar_t *number, int w) {
     secp256k1_scalar_t x, shift;
     int wnaf[256] = {0};
     int i;
+#ifdef USE_ENDOMORPHISM
+    int skew;
+#endif
+    secp256k1_scalar_t num = *number;
 
     secp256k1_scalar_set_int(&x, 0);
     secp256k1_scalar_set_int(&shift, 1 << w);
-    secp256k1_wnaf_const(wnaf, number, w);
+    /* With USE_ENDOMORPHISM on we only consider 128-bit numbers */
+#ifdef USE_ENDOMORPHISM
+    for (i = 0; i < 16; ++i)
+        secp256k1_scalar_shr_int(&num, 8);
+    skew = secp256k1_wnaf_const(wnaf, num, w);
+#else
+    secp256k1_wnaf_const(wnaf, num, w);
+#endif
 
     for (i = WNAF_SIZE(w); i >= 0; --i) {
         secp256k1_scalar_t t;
@@ -1572,7 +1583,11 @@ void test_constant_wnaf(const secp256k1_scalar_t *number, int w) {
         }
         secp256k1_scalar_add(&x, &x, &t);
     }
-    CHECK(secp256k1_scalar_eq(&x, number));
+#ifdef USE_ENDOMORPHISM
+    /* Skew num because when encoding 128-bit numbers as odd we use an offset */
+    secp256k1_scalar_cadd_bit(&num, skew == 2, 1);
+#endif
+    CHECK(secp256k1_scalar_eq(&x, &num));
 }
 
 void run_wnaf(void) {