From 92da9f1085f71297fc63df66b3a97b819391331e Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 13 Jan 2024 14:06:42 +0100
Subject: [PATCH 1/7] lib/env.h: add BASE64_FORCE_INLINE macro

Add a macro to enforce inlining of functions in the hot loop, regardless
of optimization level or compiler heuristics.
---
 lib/env.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/lib/env.h b/lib/env.h
index d489ba5..0837065 100644
--- a/lib/env.h
+++ b/lib/env.h
@@ -71,4 +71,14 @@
 #  define BASE64_FALLTHROUGH
 #endif
 
+// Declare macros to ensure that functions that are intended to be inlined, are
+// actually inlined, even when no optimization is applied. A lot of inner loop
+// code is factored into separate functions for reasons of readability, but
+// that code should always be inlined (and optimized) in the main loop.
+#ifdef _MSC_VER
+#  define BASE64_FORCE_INLINE	__forceinline
+#else
+#  define BASE64_FORCE_INLINE  inline __attribute__((always_inline))
+#endif
+
 #endif	// BASE64_ENV_H

From c88662a8ee3ab3622af768e99f20107dcbf1fc03 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 13 Jan 2024 14:18:12 +0100
Subject: [PATCH 2/7] avx2: apply BASE64_FORCE_INLINE macro

---
 lib/arch/avx2/dec_loop.c      | 2 +-
 lib/arch/avx2/dec_reshuffle.c | 2 +-
 lib/arch/avx2/enc_loop.c      | 4 ++--
 lib/arch/avx2/enc_reshuffle.c | 2 +-
 lib/arch/avx2/enc_translate.c | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c
index f959fc4..b8a4cca 100644
--- a/lib/arch/avx2/dec_loop.c
+++ b/lib/arch/avx2/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const __m256i lut_lo = _mm256_setr_epi8(
diff --git a/lib/arch/avx2/dec_reshuffle.c b/lib/arch/avx2/dec_reshuffle.c
index f351809..bc875ce 100644
--- a/lib/arch/avx2/dec_reshuffle.c
+++ b/lib/arch/avx2/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 dec_reshuffle (const __m256i in)
 {
 	// in, lower lane, bits, upper case are most significant bits, lower
diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c
index b9e2736..6f4aa0a 100644
--- a/lib/arch/avx2/enc_loop.c
+++ b/lib/arch/avx2/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
 {
 	// First load is done at s - 0 to not get a segfault:
@@ -17,7 +17,7 @@ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
 	*o += 32;
 }
 
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input:
diff --git a/lib/arch/avx2/enc_reshuffle.c b/lib/arch/avx2/enc_reshuffle.c
index ba16690..82c659b 100644
--- a/lib/arch/avx2/enc_reshuffle.c
+++ b/lib/arch/avx2/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 enc_reshuffle (const __m256i input)
 {
 	// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
diff --git a/lib/arch/avx2/enc_translate.c b/lib/arch/avx2/enc_translate.c
index 46173cd..370da98 100644
--- a/lib/arch/avx2/enc_translate.c
+++ b/lib/arch/avx2/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 enc_translate (const __m256i in)
 {
 	// A lookup table containing the absolute offsets for all ranges:

From 01c22e81861cab440b755c1137c05de2f095703c Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 13 Jan 2024 14:26:18 +0100
Subject: [PATCH 3/7] avx512: apply BASE64_FORCE_INLINE macro

---
 lib/arch/avx512/enc_loop.c                | 2 +-
 lib/arch/avx512/enc_reshuffle_translate.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c
index 4c71e16..cb44696 100644
--- a/lib/arch/avx512/enc_loop.c
+++ b/lib/arch/avx512/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input.
diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c
index 5c332bb..c6634f0 100644
--- a/lib/arch/avx512/enc_reshuffle_translate.c
+++ b/lib/arch/avx512/enc_reshuffle_translate.c
@@ -1,7 +1,7 @@
 // AVX512 algorithm is based on permutevar and multishift. The code is based on
 // https://github.com/WojciechMula/base64simd which is under BSD-2 license.
 
-static inline __m512i
+static BASE64_FORCE_INLINE __m512i
 enc_reshuffle_translate (const __m512i input)
 {
 	// 32-bit input

From ca095e85044dc9544fd172dced0f988539f75ca6 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 13 Jan 2024 19:31:28 +0100
Subject: [PATCH 4/7] generic: apply BASE64_FORCE_INLINE macro

---
 lib/arch/generic/32/dec_loop.c | 2 +-
 lib/arch/generic/32/enc_loop.c | 2 +-
 lib/arch/generic/64/enc_loop.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/arch/generic/32/dec_loop.c b/lib/arch/generic/32/dec_loop.c
index 8a8260f..aa290d7 100644
--- a/lib/arch/generic/32/dec_loop.c
+++ b/lib/arch/generic/32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const uint32_t str
diff --git a/lib/arch/generic/32/enc_loop.c b/lib/arch/generic/32/enc_loop.c
index f4870a7..b5e6eef 100644
--- a/lib/arch/generic/32/enc_loop.c
+++ b/lib/arch/generic/32/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
 {
 	uint32_t src;
diff --git a/lib/arch/generic/64/enc_loop.c b/lib/arch/generic/64/enc_loop.c
index 0840bc7..e6a29cd 100644
--- a/lib/arch/generic/64/enc_loop.c
+++ b/lib/arch/generic/64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
 {
 	uint64_t src;

From a4e2252335d05557b84447a8f39a67b30a79954c Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 13 Jan 2024 19:30:26 +0100
Subject: [PATCH 5/7] neon32: apply BASE64_FORCE_INLINE macro

---
 lib/arch/neon32/codec.c         | 2 +-
 lib/arch/neon32/dec_loop.c      | 6 +++---
 lib/arch/neon32/enc_loop.c      | 4 ++--
 lib/arch/neon32/enc_reshuffle.c | 2 +-
 lib/arch/neon32/enc_translate.c | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/arch/neon32/codec.c b/lib/arch/neon32/codec.c
index 70c80e4..d552344 100644
--- a/lib/arch/neon32/codec.c
+++ b/lib/arch/neon32/codec.c
@@ -22,7 +22,7 @@
 #define BASE64_NEON32_USE_ASM
 #endif
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
 {
 	// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
diff --git a/lib/arch/neon32/dec_loop.c b/lib/arch/neon32/dec_loop.c
index 2216b39..e4caed7 100644
--- a/lib/arch/neon32/dec_loop.c
+++ b/lib/arch/neon32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 is_nonzero (const uint8x16_t v)
 {
 	uint64_t u64;
@@ -9,7 +9,7 @@ is_nonzero (const uint8x16_t v)
 	return u64 != 0;
 }
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 delta_lookup (const uint8x16_t v)
 {
 	const uint8x8_t lut = {
@@ -21,7 +21,7 @@ delta_lookup (const uint8x16_t v)
 		vtbl1_u8(lut, vget_high_u8(v)));
 }
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 dec_loop_neon32_lane (uint8x16_t *lane)
 {
 	// See the SSSE3 decoder for an explanation of the algorithm.
diff --git a/lib/arch/neon32/enc_loop.c b/lib/arch/neon32/enc_loop.c
index d694b33..2adff48 100644
--- a/lib/arch/neon32/enc_loop.c
+++ b/lib/arch/neon32/enc_loop.c
@@ -1,5 +1,5 @@
 #ifdef BASE64_NEON32_USE_ASM
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
 {
 	// This function duplicates the functionality of enc_loop_neon32_inner,
@@ -106,7 +106,7 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
 }
 #endif
 
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
 {
 #ifdef BASE64_NEON32_USE_ASM
diff --git a/lib/arch/neon32/enc_reshuffle.c b/lib/arch/neon32/enc_reshuffle.c
index d6e97cb..fa94d27 100644
--- a/lib/arch/neon32/enc_reshuffle.c
+++ b/lib/arch/neon32/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_reshuffle (uint8x16x3_t in)
 {
 	uint8x16x4_t out;
diff --git a/lib/arch/neon32/enc_translate.c b/lib/arch/neon32/enc_translate.c
index e616d54..ff3d88d 100644
--- a/lib/arch/neon32/enc_translate.c
+++ b/lib/arch/neon32/enc_translate.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_translate (const uint8x16x4_t in)
 {
 	// A lookup table containing the absolute offsets for all ranges:

From caf6eb1531455e492e2996579d7dd22d88033207 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 13 Jan 2024 19:30:37 +0100
Subject: [PATCH 6/7] neon64: apply BASE64_FORCE_INLINE macro

---
 lib/arch/neon64/codec.c         | 2 +-
 lib/arch/neon64/enc_loop.c      | 2 +-
 lib/arch/neon64/enc_reshuffle.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/arch/neon64/codec.c b/lib/arch/neon64/codec.c
index f5cda63..6b664b4 100644
--- a/lib/arch/neon64/codec.c
+++ b/lib/arch/neon64/codec.c
@@ -22,7 +22,7 @@
 #define BASE64_NEON64_USE_ASM
 #endif
 
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 load_64byte_table (const uint8_t *p)
 {
 #ifdef BASE64_NEON64_USE_ASM
diff --git a/lib/arch/neon64/enc_loop.c b/lib/arch/neon64/enc_loop.c
index 59a1c59..8bdd088 100644
--- a/lib/arch/neon64/enc_loop.c
+++ b/lib/arch/neon64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
 {
 	// Load 48 bytes and deinterleave:
diff --git a/lib/arch/neon64/enc_reshuffle.c b/lib/arch/neon64/enc_reshuffle.c
index ea543e0..2655df1 100644
--- a/lib/arch/neon64/enc_reshuffle.c
+++ b/lib/arch/neon64/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_reshuffle (const uint8x16x3_t in)
 {
 	uint8x16x4_t out;

From 32e5eb6a5e701ac7b19fafffa7b14659596caf20 Mon Sep 17 00:00:00 2001
From: Alfred Klomp <git@alfredklomp.com>
Date: Sat, 13 Jan 2024 19:31:00 +0100
Subject: [PATCH 7/7] ssse3: apply BASE64_FORCE_INLINE macro

---
 lib/arch/ssse3/dec_loop.c      | 2 +-
 lib/arch/ssse3/dec_reshuffle.c | 2 +-
 lib/arch/ssse3/enc_loop.c      | 2 +-
 lib/arch/ssse3/enc_reshuffle.c | 2 +-
 lib/arch/ssse3/enc_translate.c | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/arch/ssse3/dec_loop.c b/lib/arch/ssse3/dec_loop.c
index 9da71ab..7ddb73b 100644
--- a/lib/arch/ssse3/dec_loop.c
+++ b/lib/arch/ssse3/dec_loop.c
@@ -65,7 +65,7 @@
 // 1110 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 // 1111 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const __m128i lut_lo = _mm_setr_epi8(
diff --git a/lib/arch/ssse3/dec_reshuffle.c b/lib/arch/ssse3/dec_reshuffle.c
index fdf587f..d3dd395 100644
--- a/lib/arch/ssse3/dec_reshuffle.c
+++ b/lib/arch/ssse3/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 dec_reshuffle (const __m128i in)
 {
 	// in, bits, upper case are most significant bits, lower case are least significant bits
diff --git a/lib/arch/ssse3/enc_loop.c b/lib/arch/ssse3/enc_loop.c
index 6de652e..9b67b70 100644
--- a/lib/arch/ssse3/enc_loop.c
+++ b/lib/arch/ssse3/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input:
diff --git a/lib/arch/ssse3/enc_reshuffle.c b/lib/arch/ssse3/enc_reshuffle.c
index b738591..f9dc949 100644
--- a/lib/arch/ssse3/enc_reshuffle.c
+++ b/lib/arch/ssse3/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 enc_reshuffle (__m128i in)
 {
 	// Input, bytes MSB to LSB:
diff --git a/lib/arch/ssse3/enc_translate.c b/lib/arch/ssse3/enc_translate.c
index 04f288f..60d9a42 100644
--- a/lib/arch/ssse3/enc_translate.c
+++ b/lib/arch/ssse3/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 enc_translate (const __m128i in)
 {
 	// A lookup table containing the absolute offsets for all ranges: