Skip to content

Commit 61fc72a

Browse files
authored
gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD] (GH-143262)
Optimize base64 encoding/decoding by eliminating loop-carried dependencies.

Key changes:
- Add `base64_encode_trio()` and `base64_decode_quad()` helper functions that process complete groups independently
- Add `base64_encode_fast()` and `base64_decode_fast()` wrappers
- Update `b2a_base64` and `a2b_base64` to use fast path for complete groups

Performance gains (encode/decode speedup vs main, PGO builds):
```
          64 bytes    64K         1M
Zen2:     1.2x/1.8x   1.7x/2.8x   1.5x/2.8x
Zen4:     1.2x/1.7x   1.6x/3.0x   1.5x/3.0x   [old data, likely faster]
M4:       1.3x/1.9x   2.3x/2.8x   2.4x/2.9x   [old data, likely faster]
RPi5-32:  1.2x/1.2x   2.4x/2.4x   2.0x/2.1x
```

Based on my exploratory work done in main...gpshead:cpython:claude/vectorize-base64-c-S7Hku

See PR and issue for further thoughts on sometimes MUCH faster SIMD vectorized versions of this.
1 parent 6b9a6c6 commit 61fc72a

File tree

3 files changed

+134
-23
lines changed

3 files changed

+134
-23
lines changed

Doc/whatsnew/3.15.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,13 @@ argparse
428428
inline code when color output is enabled.
429429
(Contributed by Savannah Ostrowski in :gh:`142390`.)
430430

431+
base64 & binascii
432+
-----------------
433+
434+
* CPython's underlying base64 implementation now encodes 2x faster and decodes 3x
435+
faster thanks to simple CPU pipelining optimizations.
436+
(Contributed by Gregory P. Smith & Serhiy Storchaka in :gh:`143262`.)
437+
431438
calendar
432439
--------
433440

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The base64 implementation behind the :mod:`binascii` and :mod:`base64` modules
2+
and the related codec has been optimized for modern pipelined CPU architectures
3+
and now performs 2-3x faster across all platforms.

Modules/binascii.c

Lines changed: 124 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,12 @@ get_binascii_state(PyObject *module)
7676
}
7777

7878

79-
static const unsigned char table_a2b_base64[] = {
79+
/* Align to 64 bytes for L1 cache line friendliness */
80+
static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
8081
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
8182
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
8283
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
83-
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1, 0,-1,-1, /* Note PAD->0 */
84+
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,64,-1,-1, /* PAD->64 detected by fast path */
8485
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
8586
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
8687
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
@@ -101,9 +102,91 @@ static const unsigned char table_a2b_base64[] = {
101102
/* Max binary chunk size; limited only by available memory */
102103
#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)
103104

104-
static const unsigned char table_b2a_base64[] =
105+
/*
106+
* Fast base64 encoding/decoding helpers.
107+
*
108+
* Process complete groups without loop-carried dependencies.
109+
*/
110+
111+
/* Base64 encoding alphabet: the character at index i encodes the 6-bit
 * value i (0..63).
 * Align to 64 bytes for L1 cache line friendliness. */
112+
static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
105113
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
106114

115+
/* Encode 3 bytes into 4 base64 characters. */
116+
static inline void
117+
base64_encode_trio(const unsigned char *in, unsigned char *out,
118+
const unsigned char *table)
119+
{
120+
unsigned int combined = ((unsigned int)in[0] << 16) |
121+
((unsigned int)in[1] << 8) |
122+
(unsigned int)in[2];
123+
out[0] = table[(combined >> 18) & 0x3f];
124+
out[1] = table[(combined >> 12) & 0x3f];
125+
out[2] = table[(combined >> 6) & 0x3f];
126+
out[3] = table[combined & 0x3f];
127+
}
128+
129+
/* Encode multiple complete 3-byte groups.
130+
* Returns the number of input bytes processed (always a multiple of 3).
131+
*/
132+
static inline Py_ssize_t
133+
base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
134+
unsigned char *out, const unsigned char *table)
135+
{
136+
Py_ssize_t n_trios = in_len / 3;
137+
const unsigned char *in_end = in + n_trios * 3;
138+
139+
while (in < in_end) {
140+
base64_encode_trio(in, out, table);
141+
in += 3;
142+
out += 4;
143+
}
144+
145+
return n_trios * 3;
146+
}
147+
148+
/* Turn one group of 4 base64 characters into 3 output bytes.
 *
 *   in     points at exactly 4 readable input characters
 *   out    receives 3 bytes, written only on success
 *   table  256-entry lookup from input character to decoded value
 *
 * A looked-up value with either of its two top bits set (that is, any
 * value >= 64) rejects the whole group.  With table_a2b_base64 this
 * covers both invalid characters (stored as -1) and the pad character
 * (stored as 64).
 *
 * Returns 1 if the group was decoded, 0 if it was rejected.
 */
static inline int
base64_decode_quad(const unsigned char *in, unsigned char *out,
                   const unsigned char *table)
{
    const unsigned char a = table[in[0]];
    const unsigned char b = table[in[1]];
    const unsigned char c = table[in[2]];
    const unsigned char d = table[in[3]];

    /* One combined test instead of four separate range checks. */
    const unsigned int merged = a | b | c | d;
    if ((merged & 0xc0) != 0) {
        return 0;
    }

    /* Re-pack four 6-bit values into three 8-bit bytes. */
    out[0] = (unsigned char)((a << 2) | (b >> 4));
    out[1] = (unsigned char)((b << 4) | (c >> 2));
    out[2] = (unsigned char)((c << 6) | d);
    return 1;
}
169+
170+
/* Decode multiple complete 4-character groups (no padding allowed).
171+
* Returns the number of input characters processed.
172+
* Stops at the first invalid character, padding, or incomplete group.
173+
*/
174+
static inline Py_ssize_t
175+
base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
176+
unsigned char *out, const unsigned char *table)
177+
{
178+
Py_ssize_t n_quads = in_len / 4;
179+
Py_ssize_t i;
180+
181+
for (i = 0; i < n_quads; i++) {
182+
if (!base64_decode_quad(in + i * 4, out + i * 3, table)) {
183+
break;
184+
}
185+
}
186+
187+
return i * 4;
188+
}
189+
107190

108191
static const unsigned short crctab_hqx[256] = {
109192
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
@@ -403,10 +486,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
403486
goto error_end;
404487
}
405488

489+
size_t i = 0; /* Current position in input */
490+
491+
/* Fast path: use optimized decoder for complete quads.
492+
* This works for both strict and non-strict mode for valid input.
493+
* The fast path stops at padding, invalid chars, or incomplete groups.
494+
*/
495+
if (ascii_len >= 4) {
496+
Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
497+
bin_data, table_a2b_base64);
498+
if (fast_chars > 0) {
499+
i = (size_t)fast_chars;
500+
bin_data += (fast_chars / 4) * 3;
501+
}
502+
}
503+
504+
/* Slow path: handle remaining input (padding, invalid chars, partial groups) */
406505
int quad_pos = 0;
407506
unsigned char leftchar = 0;
408507
int pads = 0;
409-
for (size_t i = 0; i < ascii_len; i++) {
508+
for (; i < ascii_len; i++) {
410509
unsigned char this_ch = ascii_data[i];
411510

412511
/* Check for pad sequences and ignore
@@ -533,9 +632,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
533632
/*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/
534633
{
535634
const unsigned char *bin_data;
536-
int leftbits = 0;
537-
unsigned char this_ch;
538-
unsigned int leftchar = 0;
539635
Py_ssize_t bin_len;
540636
binascii_state *state;
541637

@@ -566,26 +662,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
566662
}
567663
unsigned char *ascii_data = PyBytesWriter_GetData(writer);
568664

569-
for( ; bin_len > 0 ; bin_len--, bin_data++ ) {
570-
/* Shift the data into our buffer */
571-
leftchar = (leftchar << 8) | *bin_data;
572-
leftbits += 8;
573-
574-
/* See if there are 6-bit groups ready */
575-
while ( leftbits >= 6 ) {
576-
this_ch = (leftchar >> (leftbits-6)) & 0x3f;
577-
leftbits -= 6;
578-
*ascii_data++ = table_b2a_base64[this_ch];
579-
}
580-
}
581-
if ( leftbits == 2 ) {
582-
*ascii_data++ = table_b2a_base64[(leftchar&3) << 4];
665+
/* Use the optimized fast path for complete 3-byte groups */
666+
Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data,
667+
table_b2a_base64);
668+
bin_data += fast_bytes;
669+
ascii_data += (fast_bytes / 3) * 4;
670+
bin_len -= fast_bytes;
671+
672+
/* Handle remaining 0-2 bytes */
673+
if (bin_len == 1) {
674+
/* 1 byte remaining: produces 2 base64 chars + 2 padding */
675+
unsigned int val = bin_data[0];
676+
*ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f];
677+
*ascii_data++ = table_b2a_base64[(val << 4) & 0x3f];
583678
*ascii_data++ = BASE64_PAD;
584679
*ascii_data++ = BASE64_PAD;
585-
} else if ( leftbits == 4 ) {
586-
*ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2];
680+
}
681+
else if (bin_len == 2) {
682+
/* 2 bytes remaining: produces 3 base64 chars + 1 padding */
683+
unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1];
684+
*ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f];
685+
*ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f];
686+
*ascii_data++ = table_b2a_base64[(val << 2) & 0x3f];
587687
*ascii_data++ = BASE64_PAD;
588688
}
689+
589690
if (newline)
590691
*ascii_data++ = '\n'; /* Append a courtesy newline */
591692

0 commit comments

Comments
 (0)