diff --git a/alg/gdalwarpkernel.cpp b/alg/gdalwarpkernel.cpp
index 9053cb24cddf..56211577ec82 100644
--- a/alg/gdalwarpkernel.cpp
+++ b/alg/gdalwarpkernel.cpp
@@ -2849,11 +2849,14 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK,
 // or http://en.wikipedia.org/wiki/Cubic_Hermite_spline : CINTx(p_1,p0,p1,p2)
 // http://en.wikipedia.org/wiki/Bicubic_interpolation: matrix notation
 
-// TODO(schwehr): Use an inline function.
-#define CubicConvolution(distance1, distance2, distance3, f0, f1, f2, f3)      \
-    (f1 + 0.5 * (distance1 * (f2 - f0) +                                       \
-                 distance2 * (2.0 * f0 - 5.0 * f1 + 4.0 * f2 - f3) +           \
-                 distance3 * (3.0 * (f1 - f2) + f3 - f0)))
+template <typename T>
+static inline T CubicConvolution(T distance1, T distance2, T distance3, T f0,
+                                 T f1, T f2, T f3)
+{
+    return (f1 + T(0.5) * (distance1 * (f2 - f0) +
+                           distance2 * (2 * f0 - 5 * f1 + 4 * f2 - f3) +
+                           distance3 * (3 * (f1 - f2) + f3 - f0)));
+}
 
 /************************************************************************/
 /*                       GWKCubicComputeWeights()                       */
@@ -2861,19 +2864,18 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK,
 
 // adfCoeffs[2] = 1.0 - (adfCoeffs[0] + adfCoeffs[1] - adfCoeffs[3]);
 
-// TODO(schwehr): Use an inline function.
-#define GWKCubicComputeWeights(dfX_, adfCoeffs)                                \
-    {                                                                          \
-        const double dfX = dfX_;                                               \
-        const double dfHalfX = 0.5 * dfX;                                      \
-        const double dfThreeX = 3.0 * dfX;                                     \
-        const double dfHalfX2 = dfHalfX * dfX;                                 \
-                                                                               \
-        adfCoeffs[0] = dfHalfX * (-1 + dfX * (2 - dfX));                       \
-        adfCoeffs[1] = 1 + dfHalfX2 * (-5 + dfThreeX);                         \
-        adfCoeffs[2] = dfHalfX * (1 + dfX * (4 - dfThreeX));                   \
-        adfCoeffs[3] = dfHalfX2 * (-1 + dfX);                                  \
-    }
+template <typename T>
+static inline void GWKCubicComputeWeights(T x, T coeffs[4])
+{
+    const T halfX = T(0.5) * x;
+    const T threeX = T(3.0) * x;
+    const T halfX2 = halfX * x;
+
+    coeffs[0] = halfX * (-1 + x * (2 - x));
+    coeffs[1] = 1 + halfX2 * (-5 + threeX);
+    coeffs[2] = halfX * (1 + x * (4 - threeX));
+    coeffs[3] = halfX2 * (-1 + x);
+}
 
 // TODO(schwehr): Use an inline function.
 #define CONVOL4(v1, v2)                                                        \
@@ -2969,10 +2971,7 @@ static bool GWKCubicResample4Sample(const GDALWarpKernel *poWK, int iBand,
     return true;
 }
 
-// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
-// perf benefit.
-
-#if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
+#if defined(__x86_64) || defined(_M_X64)
 
 /************************************************************************/
 /*                           XMMLoad4Values()                           */
@@ -2985,7 +2984,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
 {
     unsigned int i;
     memcpy(&i, ptr, 4);
-    __m128i xmm_i = _mm_cvtsi32_si128(s);
+    __m128i xmm_i = _mm_cvtsi32_si128(i);
     // Zero extend 4 packed unsigned 8-bit integers in a to packed
     // 32-bit integers.
 #if __SSE4_1__
@@ -2997,11 +2996,12 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
     return _mm_cvtepi32_ps(xmm_i);
 }
 
+#if defined(USE_SSE_CUBIC_IMPL)
 static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
 {
     GUInt64 i;
     memcpy(&i, ptr, 8);
-    __m128i xmm_i = _mm_cvtsi64_si128(s);
+    __m128i xmm_i = _mm_cvtsi64_si128(i);
     // Zero extend 4 packed unsigned 16-bit integers in a to packed
     // 32-bit integers.
 #if __SSE4_1__
@@ -3011,6 +3011,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
 #endif
     return _mm_cvtepi32_ps(xmm_i);
 }
+#endif
 
 /************************************************************************/
 /*                           XMMHorizontalAdd()                         */
@@ -3038,7 +3039,7 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
 }
 #endif
 
-#endif  // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
+#endif  // (defined(__x86_64) || defined(_M_X64))
 
 /************************************************************************/
 /*            GWKCubicResampleSrcMaskIsDensity4SampleRealT()            */
@@ -3046,6 +3047,8 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
 
 // Note: if USE_SSE_CUBIC_IMPL, only instantiate that for Byte and UInt16,
 // because there are a few assumptions above those types.
+// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
+// perf benefit.
 
 template <class T>
 static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
@@ -5853,6 +5856,137 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)
     return GWKRun(poWK, "GWKRealCase", GWKRealCaseThread);
 }
 
+/************************************************************************/
+/*                 GWKCubicResampleNoMasks4ByteMultiBand()              */
+/************************************************************************/
+
+/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
+/* and enough SSE registries */
+#if defined(__x86_64) || defined(_M_X64)
+
+static inline float Convolute4x4(const __m128 row0, const __m128 row1,
+                                 const __m128 row2, const __m128 row3,
+                                 const __m128 weightsX, const __m128 weightsY0,
+                                 const __m128 weightsY1, const __m128 weightsY2,
+                                 const __m128 weightsY3)
+{
+    return XMMHorizontalAdd(_mm_add_ps(
+        _mm_add_ps(
+            _mm_add_ps(_mm_mul_ps(_mm_mul_ps(row0, weightsX), weightsY0),
+                       _mm_mul_ps(_mm_mul_ps(row1, weightsX), weightsY1)),
+            _mm_mul_ps(_mm_mul_ps(row2, weightsX), weightsY2)),
+        _mm_mul_ps(_mm_mul_ps(row3, weightsX), weightsY3)));
+}
+
+static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
+                                                  double dfSrcX, double dfSrcY,
+                                                  const GPtrDiff_t iDstOffset)
+{
+    const double dfSrcXShifted = dfSrcX - 0.5;
+    const int iSrcX = static_cast<int>(dfSrcXShifted);
+    const double dfSrcYShifted = dfSrcY - 0.5;
+    const int iSrcY = static_cast<int>(dfSrcYShifted);
+    const GPtrDiff_t iSrcOffset =
+        iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;
+
+    // Get the bilinear interpolation at the image borders.
+    if (iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize || iSrcY - 1 < 0 ||
+        iSrcY + 2 >= poWK->nSrcYSize)
+    {
+        for (int iBand = 0; iBand < poWK->nBands; iBand++)
+        {
+            GByte value;
+            GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY,
+                                               &value);
+            reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand])[iDstOffset] =
+                value;
+        }
+    }
+    else
+    {
+        const float fDeltaX = static_cast<float>(dfSrcXShifted) - iSrcX;
+        const float fDeltaY = static_cast<float>(dfSrcYShifted) - iSrcY;
+
+        float afCoeffsX[4];
+        float afCoeffsY[4];
+        GWKCubicComputeWeights(fDeltaX, afCoeffsX);
+        GWKCubicComputeWeights(fDeltaY, afCoeffsY);
+        const auto weightsX = _mm_loadu_ps(afCoeffsX);
+        const auto weightsY0 = _mm_load1_ps(&afCoeffsY[0]);
+        const auto weightsY1 = _mm_load1_ps(&afCoeffsY[1]);
+        const auto weightsY2 = _mm_load1_ps(&afCoeffsY[2]);
+        const auto weightsY3 = _mm_load1_ps(&afCoeffsY[3]);
+
+        const GPtrDiff_t iOffset = iSrcOffset - poWK->nSrcXSize - 1;
+
+        int iBand = 0;
+        // Process 2 bands at a time
+        for (; iBand + 1 < poWK->nBands; iBand += 2)
+        {
+            const GByte *CPL_RESTRICT pabyBand0 =
+                reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
+            const auto row0_0 = XMMLoad4Values(pabyBand0 + iOffset);
+            const auto row1_0 =
+                XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize);
+            const auto row2_0 =
+                XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize);
+            const auto row3_0 =
+                XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize);
+
+            const GByte *CPL_RESTRICT pabyBand1 =
+                reinterpret_cast<const GByte *>(
+                    poWK->papabySrcImage[iBand + 1]);
+            const auto row0_1 = XMMLoad4Values(pabyBand1 + iOffset);
+            const auto row1_1 =
+                XMMLoad4Values(pabyBand1 + iOffset + poWK->nSrcXSize);
+            const auto row2_1 =
+                XMMLoad4Values(pabyBand1 + iOffset + 2 * poWK->nSrcXSize);
+            const auto row3_1 =
+                XMMLoad4Values(pabyBand1 + iOffset + 3 * poWK->nSrcXSize);
+
+            const float fValue_0 =
+                Convolute4x4(row0_0, row1_0, row2_0, row3_0, weightsX,
+                             weightsY0, weightsY1, weightsY2, weightsY3);
+
+            const float fValue_1 =
+                Convolute4x4(row0_1, row1_1, row2_1, row3_1, weightsX,
+                             weightsY0, weightsY1, weightsY2, weightsY3);
+
+            GByte *CPL_RESTRICT pabyDstBand0 =
+                reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand]);
+            pabyDstBand0[iDstOffset] = GWKClampValueT<GByte>(fValue_0);
+
+            GByte *CPL_RESTRICT pabyDstBand1 =
+                reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand + 1]);
+            pabyDstBand1[iDstOffset] = GWKClampValueT<GByte>(fValue_1);
+        }
+        if (iBand < poWK->nBands)
+        {
+            const GByte *pabyBand0 =
+                reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
+            const auto row0 = XMMLoad4Values(pabyBand0 + iOffset);
+            const auto row1 =
+                XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize);
+            const auto row2 =
+                XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize);
+            const auto row3 =
+                XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize);
+
+            const float fValue =
+                Convolute4x4(row0, row1, row2, row3, weightsX, weightsY0,
+                             weightsY1, weightsY2, weightsY3);
+
+            reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand])[iDstOffset] =
+                GWKClampValueT<GByte>(fValue);
+        }
+    }
+
+    if (poWK->pafDstDensity)
+        poWK->pafDstDensity[iDstOffset] = 1.0f;
+}
+
+#endif  // defined(__x86_64) || defined(_M_X64)
+
 /************************************************************************/
 /*                GWKResampleNoMasksOrDstDensityOnlyThreadInternal()    */
 /************************************************************************/
@@ -5957,6 +6091,21 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
             const GPtrDiff_t iDstOffset =
                 iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;
 
+#if defined(__x86_64) || defined(_M_X64)
+            if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
+                          std::is_same<T, GByte>::value)
+            {
+                if (poWK->nBands > 1 && !poWK->bApplyVerticalShift)
+                {
+                    GWKCubicResampleNoMasks4ByteMultiBand(
+                        poWK, padfX[iDstX] - poWK->nSrcXOff,
+                        padfY[iDstX] - poWK->nSrcYOff, iDstOffset);
+
+                    continue;
+                }
+            }
+#endif  // defined(__x86_64) || defined(_M_X64)
+
             [[maybe_unused]] double dfInvWeights = 0;
             for (int iBand = 0; iBand < poWK->nBands; iBand++)
             {
diff --git a/autotest/utilities/test_gdalwarp_lib.py b/autotest/utilities/test_gdalwarp_lib.py
index 3cb97c04aee6..4efbeb8c992b 100755
--- a/autotest/utilities/test_gdalwarp_lib.py
+++ b/autotest/utilities/test_gdalwarp_lib.py
@@ -4330,3 +4330,53 @@ def test_gdalwarp_lib_src_is_geog_arc_second():
     assert out_ds.RasterXSize == 5464
     assert out_ds.RasterYSize == 5464
     assert out_ds.GetRasterBand(1).Checksum() == 31856
+
+
+###############################################################################
+# Test GWKCubicResampleNoMasks4ByteMultiBand()
+
+
+def test_gdalwarp_lib_cubic_multiband_byte_4sample_optim():
+
+    src_ds = gdal.Open("../gdrivers/data/small_world.tif")
+
+    # RGB only
+    out_ds = gdal.Warp(
+        "",
+        src_ds,
+        options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic",
+    )
+    assert out_ds.RasterXSize == 21
+    assert out_ds.RasterYSize == 21
+    assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
+        4785,
+        4689,
+        5007,
+    ]
+
+    # With dest alpha
+    out_ds = gdal.Warp(
+        "",
+        src_ds,
+        options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic -dstalpha",
+    )
+    assert out_ds.RasterXSize == 21
+    assert out_ds.RasterYSize == 21
+    assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
+        4785,
+        4689,
+        5007,
+    ]
+    assert out_ds.GetRasterBand(4).ComputeRasterMinMax() == (255, 255)
+
+    # Test edge effects
+    # (slightly change the target resolution so that the nearest approximation
+    # doesn't kick in)
+    out_ds = gdal.Warp(
+        "",
+        src_ds,
+        options="-f MEM -r cubic -tr 0.9000001 0.9000001 -wo XSCALE=1 -wo YSCALE=1",
+    )
+    assert out_ds.RasterXSize == 400
+    assert out_ds.RasterYSize == 200
+    assert out_ds.ReadRaster() == src_ds.ReadRaster()