diff --git a/alg/gdalwarpkernel.cpp b/alg/gdalwarpkernel.cpp index 9053cb24cddf..56211577ec82 100644 --- a/alg/gdalwarpkernel.cpp +++ b/alg/gdalwarpkernel.cpp @@ -2849,11 +2849,14 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK, // or http://en.wikipedia.org/wiki/Cubic_Hermite_spline : CINTx(p_1,p0,p1,p2) // http://en.wikipedia.org/wiki/Bicubic_interpolation: matrix notation -// TODO(schwehr): Use an inline function. -#define CubicConvolution(distance1, distance2, distance3, f0, f1, f2, f3) \ - (f1 + 0.5 * (distance1 * (f2 - f0) + \ - distance2 * (2.0 * f0 - 5.0 * f1 + 4.0 * f2 - f3) + \ - distance3 * (3.0 * (f1 - f2) + f3 - f0))) +template +static inline T CubicConvolution(T distance1, T distance2, T distance3, T f0, + T f1, T f2, T f3) +{ + return (f1 + T(0.5) * (distance1 * (f2 - f0) + + distance2 * (2 * f0 - 5 * f1 + 4 * f2 - f3) + + distance3 * (3 * (f1 - f2) + f3 - f0))); +} /************************************************************************/ /* GWKCubicComputeWeights() */ @@ -2861,19 +2864,18 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK, // adfCoeffs[2] = 1.0 - (adfCoeffs[0] + adfCoeffs[1] - adfCoeffs[3]); -// TODO(schwehr): Use an inline function. -#define GWKCubicComputeWeights(dfX_, adfCoeffs) \ - { \ - const double dfX = dfX_; \ - const double dfHalfX = 0.5 * dfX; \ - const double dfThreeX = 3.0 * dfX; \ - const double dfHalfX2 = dfHalfX * dfX; \ - \ - adfCoeffs[0] = dfHalfX * (-1 + dfX * (2 - dfX)); \ - adfCoeffs[1] = 1 + dfHalfX2 * (-5 + dfThreeX); \ - adfCoeffs[2] = dfHalfX * (1 + dfX * (4 - dfThreeX)); \ - adfCoeffs[3] = dfHalfX2 * (-1 + dfX); \ - } +template +static inline void GWKCubicComputeWeights(T x, T coeffs[4]) +{ + const T halfX = T(0.5) * x; + const T threeX = T(3.0) * x; + const T halfX2 = halfX * x; + + coeffs[0] = halfX * (-1 + x * (2 - x)); + coeffs[1] = 1 + halfX2 * (-5 + threeX); + coeffs[2] = halfX * (1 + x * (4 - threeX)); + coeffs[3] = halfX2 * (-1 + x); +} // TODO(schwehr): Use an inline function. #define CONVOL4(v1, v2) \ @@ -2969,10 +2971,7 @@ static bool GWKCubicResample4Sample(const GDALWarpKernel *poWK, int iBand, return true; } -// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero -// perf benefit. - -#if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64)) +#if defined(__x86_64) || defined(_M_X64) /************************************************************************/ /* XMMLoad4Values() */ @@ -2985,7 +2984,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr) { unsigned int i; memcpy(&i, ptr, 4); - __m128i xmm_i = _mm_cvtsi32_si128(s); + __m128i xmm_i = _mm_cvtsi32_si128(i); // Zero extend 4 packed unsigned 8-bit integers in a to packed // 32-bit integers. #if __SSE4_1__ @@ -2997,11 +2996,12 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr) return _mm_cvtepi32_ps(xmm_i); } +#if defined(USE_SSE_CUBIC_IMPL) static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr) { GUInt64 i; memcpy(&i, ptr, 8); - __m128i xmm_i = _mm_cvtsi64_si128(s); + __m128i xmm_i = _mm_cvtsi64_si128(i); // Zero extend 4 packed unsigned 16-bit integers in a to packed // 32-bit integers. #if __SSE4_1__ @@ -3011,6 +3011,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr) #endif return _mm_cvtepi32_ps(xmm_i); } +#endif /************************************************************************/ /* XMMHorizontalAdd() */ @@ -3038,7 +3039,7 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v) } #endif -#endif // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64)) +#endif // (defined(__x86_64) || defined(_M_X64)) /************************************************************************/ /* GWKCubicResampleSrcMaskIsDensity4SampleRealT() */ @@ -3046,6 +3047,8 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v) // Note: if USE_SSE_CUBIC_IMPL, only instantiate that for Byte and UInt16, // because there are a few assumptions above those types. +// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero +// perf benefit. template static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT( @@ -5853,6 +5856,137 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK) return GWKRun(poWK, "GWKRealCase", GWKRealCaseThread); } +/************************************************************************/ +/* GWKCubicResampleNoMasks4ByteMultiBand() */ +/************************************************************************/ + +/* We restrict to 64bit processors because they are guaranteed to have SSE2 */ +/* and enough SSE registries */ +#if defined(__x86_64) || defined(_M_X64) + +static inline float Convolute4x4(const __m128 row0, const __m128 row1, + const __m128 row2, const __m128 row3, + const __m128 weightsX, const __m128 weightsY0, + const __m128 weightsY1, const __m128 weightsY2, + const __m128 weightsY3) +{ + return XMMHorizontalAdd(_mm_add_ps( + _mm_add_ps( + _mm_add_ps(_mm_mul_ps(_mm_mul_ps(row0, weightsX), weightsY0), + _mm_mul_ps(_mm_mul_ps(row1, weightsX), weightsY1)), + _mm_mul_ps(_mm_mul_ps(row2, weightsX), weightsY2)), + _mm_mul_ps(_mm_mul_ps(row3, weightsX), weightsY3))); +} + +static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK, + double dfSrcX, double dfSrcY, + const GPtrDiff_t iDstOffset) +{ + const double dfSrcXShifted = dfSrcX - 0.5; + const int iSrcX = static_cast(dfSrcXShifted); + const double dfSrcYShifted = dfSrcY - 0.5; + const int iSrcY = static_cast(dfSrcYShifted); + const GPtrDiff_t iSrcOffset = + iSrcX + static_cast(iSrcY) * poWK->nSrcXSize; + + // Get the bilinear interpolation at the image borders. + if (iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize || iSrcY - 1 < 0 || + iSrcY + 2 >= poWK->nSrcYSize) + { + for (int iBand = 0; iBand < poWK->nBands; iBand++) + { + GByte value; + GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY, + &value); + reinterpret_cast(poWK->papabyDstImage[iBand])[iDstOffset] = + value; + } + } + else + { + const float fDeltaX = static_cast(dfSrcXShifted) - iSrcX; + const float fDeltaY = static_cast(dfSrcYShifted) - iSrcY; + + float afCoeffsX[4]; + float afCoeffsY[4]; + GWKCubicComputeWeights(fDeltaX, afCoeffsX); + GWKCubicComputeWeights(fDeltaY, afCoeffsY); + const auto weightsX = _mm_loadu_ps(afCoeffsX); + const auto weightsY0 = _mm_load1_ps(&afCoeffsY[0]); + const auto weightsY1 = _mm_load1_ps(&afCoeffsY[1]); + const auto weightsY2 = _mm_load1_ps(&afCoeffsY[2]); + const auto weightsY3 = _mm_load1_ps(&afCoeffsY[3]); + + const GPtrDiff_t iOffset = iSrcOffset - poWK->nSrcXSize - 1; + + int iBand = 0; + // Process 2 bands at a time + for (; iBand + 1 < poWK->nBands; iBand += 2) + { + const GByte *CPL_RESTRICT pabyBand0 = + reinterpret_cast(poWK->papabySrcImage[iBand]); + const auto row0_0 = XMMLoad4Values(pabyBand0 + iOffset); + const auto row1_0 = + XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize); + const auto row2_0 = + XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize); + const auto row3_0 = + XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize); + + const GByte *CPL_RESTRICT pabyBand1 = + reinterpret_cast( + poWK->papabySrcImage[iBand + 1]); + const auto row0_1 = XMMLoad4Values(pabyBand1 + iOffset); + const auto row1_1 = + XMMLoad4Values(pabyBand1 + iOffset + poWK->nSrcXSize); + const auto row2_1 = + XMMLoad4Values(pabyBand1 + iOffset + 2 * poWK->nSrcXSize); + const auto row3_1 = + XMMLoad4Values(pabyBand1 + iOffset + 3 * poWK->nSrcXSize); + + const float fValue_0 = + Convolute4x4(row0_0, row1_0, row2_0, row3_0, weightsX, + weightsY0, weightsY1, weightsY2, weightsY3); + + const float fValue_1 = + Convolute4x4(row0_1, row1_1, row2_1, row3_1, weightsX, + weightsY0, weightsY1, weightsY2, weightsY3); + + GByte *CPL_RESTRICT pabyDstBand0 = + reinterpret_cast(poWK->papabyDstImage[iBand]); + pabyDstBand0[iDstOffset] = GWKClampValueT(fValue_0); + + GByte *CPL_RESTRICT pabyDstBand1 = + reinterpret_cast(poWK->papabyDstImage[iBand + 1]); + pabyDstBand1[iDstOffset] = GWKClampValueT(fValue_1); + } + if (iBand < poWK->nBands) + { + const GByte *pabyBand0 = + reinterpret_cast(poWK->papabySrcImage[iBand]); + const auto row0 = XMMLoad4Values(pabyBand0 + iOffset); + const auto row1 = + XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize); + const auto row2 = + XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize); + const auto row3 = + XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize); + + const float fValue = + Convolute4x4(row0, row1, row2, row3, weightsX, weightsY0, + weightsY1, weightsY2, weightsY3); + + reinterpret_cast(poWK->papabyDstImage[iBand])[iDstOffset] = + GWKClampValueT(fValue); + } + } + + if (poWK->pafDstDensity) + poWK->pafDstDensity[iDstOffset] = 1.0f; +} + +#endif // defined(__x86_64) || defined(_M_X64) + /************************************************************************/ /* GWKResampleNoMasksOrDstDensityOnlyThreadInternal() */ /************************************************************************/ @@ -5957,6 +6091,21 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData) const GPtrDiff_t iDstOffset = iDstX + static_cast(iDstY) * nDstXSize; +#if defined(__x86_64) || defined(_M_X64) + if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic && + std::is_same::value) + { + if (poWK->nBands > 1 && !poWK->bApplyVerticalShift) + { + GWKCubicResampleNoMasks4ByteMultiBand( + poWK, padfX[iDstX] - poWK->nSrcXOff, + padfY[iDstX] - poWK->nSrcYOff, iDstOffset); + + continue; + } + } +#endif // defined(__x86_64) || defined(_M_X64) + [[maybe_unused]] double dfInvWeights = 0; for (int iBand = 0; iBand < poWK->nBands; iBand++) { diff --git a/autotest/utilities/test_gdalwarp_lib.py b/autotest/utilities/test_gdalwarp_lib.py index 3cb97c04aee6..4efbeb8c992b 100755 --- a/autotest/utilities/test_gdalwarp_lib.py +++ b/autotest/utilities/test_gdalwarp_lib.py @@ -4330,3 +4330,53 @@ def test_gdalwarp_lib_src_is_geog_arc_second(): assert out_ds.RasterXSize == 5464 assert out_ds.RasterYSize == 5464 assert out_ds.GetRasterBand(1).Checksum() == 31856 + + +############################################################################### +# Test GWKCubicResampleNoMasks4ByteMultiBand() + + +def test_gdalwarp_lib_cubic_multiband_byte_4sample_optim(): + + src_ds = gdal.Open("../gdrivers/data/small_world.tif") + + # RGB only + out_ds = gdal.Warp( + "", + src_ds, + options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic", + ) + assert out_ds.RasterXSize == 21 + assert out_ds.RasterYSize == 21 + assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [ + 4785, + 4689, + 5007, + ] + + # With dest alpha + out_ds = gdal.Warp( + "", + src_ds, + options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic -dstalpha", + ) + assert out_ds.RasterXSize == 21 + assert out_ds.RasterYSize == 21 + assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [ + 4785, + 4689, + 5007, + ] + assert out_ds.GetRasterBand(4).ComputeRasterMinMax() == (255, 255) + + # Test edge effects + # (slightly change the target resolution so that the nearest approximation + # doesn't kick in) + out_ds = gdal.Warp( + "", + src_ds, + options="-f MEM -r cubic -tr 0.9000001 0.9000001 -wo XSCALE=1 -wo YSCALE=1", + ) + assert out_ds.RasterXSize == 400 + assert out_ds.RasterYSize == 200 + assert out_ds.ReadRaster() == src_ds.ReadRaster()