Skip to content

Commit

Permalink
Warper: improve performance of multi-band bicubic warping with XSCALE…
Browse files Browse the repository at this point in the history
…=1 and YSCALE=1 on SSE2 / x86_64

On extracted2.tif dataset from
OSGeo#11042 (comment) (12500 x 10000 pixels, 3 bands, Byte) ``gdalwarp -s_srs EPSG:8353 -t_srs EPSG:3857 -r cubic extracted2.tif tmp.tif -overwrite -wo XSCALE=1 -wo YSCALE=1`` goes from 19.0 seconds to 15.7.
  • Loading branch information
rouault committed Oct 19, 2024
1 parent 40ce085 commit dbc8764
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 25 deletions.
193 changes: 168 additions & 25 deletions alg/gdalwarpkernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2849,31 +2849,33 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK,
// or http://en.wikipedia.org/wiki/Cubic_Hermite_spline : CINTx(p_1,p0,p1,p2)
// http://en.wikipedia.org/wiki/Bicubic_interpolation: matrix notation

// TODO(schwehr): Use an inline function.
#define CubicConvolution(distance1, distance2, distance3, f0, f1, f2, f3) \
(f1 + 0.5 * (distance1 * (f2 - f0) + \
distance2 * (2.0 * f0 - 5.0 * f1 + 4.0 * f2 - f3) + \
distance3 * (3.0 * (f1 - f2) + f3 - f0)))
template <typename T>
static inline T CubicConvolution(T distance1, T distance2, T distance3, T f0,
T f1, T f2, T f3)
{
return (f1 + T(0.5) * (distance1 * (f2 - f0) +
distance2 * (2 * f0 - 5 * f1 + 4 * f2 - f3) +
distance3 * (3 * (f1 - f2) + f3 - f0)));
}

/************************************************************************/
/* GWKCubicComputeWeights() */
/************************************************************************/

// adfCoeffs[2] = 1.0 - (adfCoeffs[0] + adfCoeffs[1] - adfCoeffs[3]);

// TODO(schwehr): Use an inline function.
#define GWKCubicComputeWeights(dfX_, adfCoeffs) \
{ \
const double dfX = dfX_; \
const double dfHalfX = 0.5 * dfX; \
const double dfThreeX = 3.0 * dfX; \
const double dfHalfX2 = dfHalfX * dfX; \
\
adfCoeffs[0] = dfHalfX * (-1 + dfX * (2 - dfX)); \
adfCoeffs[1] = 1 + dfHalfX2 * (-5 + dfThreeX); \
adfCoeffs[2] = dfHalfX * (1 + dfX * (4 - dfThreeX)); \
adfCoeffs[3] = dfHalfX2 * (-1 + dfX); \
}
template <typename T>
static inline void GWKCubicComputeWeights(T x, T coeffs[4])
{
const T halfX = T(0.5) * x;
const T threeX = T(3.0) * x;
const T halfX2 = halfX * x;

coeffs[0] = halfX * (-1 + x * (2 - x));
coeffs[1] = 1 + halfX2 * (-5 + threeX);
coeffs[2] = halfX * (1 + x * (4 - threeX));
coeffs[3] = halfX2 * (-1 + x);
}

// TODO(schwehr): Use an inline function.
#define CONVOL4(v1, v2) \
Expand Down Expand Up @@ -2969,10 +2971,7 @@ static bool GWKCubicResample4Sample(const GDALWarpKernel *poWK, int iBand,
return true;
}

// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
// perf benefit.

#if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
#if defined(__x86_64) || defined(_M_X64)

/************************************************************************/
/* XMMLoad4Values() */
Expand All @@ -2985,7 +2984,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
{
unsigned int i;
memcpy(&i, ptr, 4);
__m128i xmm_i = _mm_cvtsi32_si128(s);
__m128i xmm_i = _mm_cvtsi32_si128(i);
// Zero extend 4 packed unsigned 8-bit integers in a to packed
// 32-bit integers.
#if __SSE4_1__
Expand All @@ -2997,11 +2996,12 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
return _mm_cvtepi32_ps(xmm_i);
}

#if defined(USE_SSE_CUBIC_IMPL)
static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
{
GUInt64 i;
memcpy(&i, ptr, 8);
__m128i xmm_i = _mm_cvtsi64_si128(s);
__m128i xmm_i = _mm_cvtsi64_si128(i);
// Zero extend 4 packed unsigned 16-bit integers in a to packed
// 32-bit integers.
#if __SSE4_1__
Expand All @@ -3011,6 +3011,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
#endif
return _mm_cvtepi32_ps(xmm_i);
}
#endif

/************************************************************************/
/* XMMHorizontalAdd() */
Expand Down Expand Up @@ -3038,14 +3039,16 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
}
#endif

#endif // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
#endif // (defined(__x86_64) || defined(_M_X64))

/************************************************************************/
/* GWKCubicResampleSrcMaskIsDensity4SampleRealT() */
/************************************************************************/

// Note: if USE_SSE_CUBIC_IMPL, only instantiate that for Byte and UInt16,
// because there are a few assumptions above those types.
// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
// perf benefit.

template <class T>
static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
Expand Down Expand Up @@ -5853,6 +5856,131 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)
return GWKRun(poWK, "GWKRealCase", GWKRealCaseThread);
}

/************************************************************************/
/* GWKCubicResampleNoMasks4ByteMultiBand() */
/************************************************************************/

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* and enough SSE registries */
#if defined(__x86_64) || defined(_M_X64)

static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
double dfSrcX, double dfSrcY,
const GPtrDiff_t iDstOffset)
{
const int iSrcX = static_cast<int>(dfSrcX - 0.5);
const int iSrcY = static_cast<int>(dfSrcY - 0.5);
const GPtrDiff_t iSrcOffset =
iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;

// Get the bilinear interpolation at the image borders.
if (iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize || iSrcY - 1 < 0 ||
iSrcY + 2 >= poWK->nSrcYSize)
{
for (int iBand = 0; iBand < poWK->nBands; iBand++)
{
GByte value;
GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY,
&value);
reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand])[iDstOffset] =
value;
}
}
else
{
const float fDeltaX = float(dfSrcX) - 0.5f - iSrcX;
const float fDeltaY = float(dfSrcY) - 0.5f - iSrcY;

float afCoeffsX[4];
float afCoeffsY[4];
GWKCubicComputeWeights(fDeltaX, afCoeffsX);
GWKCubicComputeWeights(fDeltaY, afCoeffsY);
const auto weightsX = _mm_loadu_ps(afCoeffsX);
const auto weightsY0 = _mm_load1_ps(&afCoeffsY[0]);
const auto weightsY1 = _mm_load1_ps(&afCoeffsY[1]);
const auto weightsY2 = _mm_load1_ps(&afCoeffsY[2]);
const auto weightsY3 = _mm_load1_ps(&afCoeffsY[3]);

const GPtrDiff_t iOffset = iSrcOffset - poWK->nSrcXSize - 1;

int iBand = 0;
// Process 2-bands at a time
for (; iBand + 1 < poWK->nBands; iBand += 2)
{
const GByte *pabyBand0 =
reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
const auto row0_0 = XMMLoad4Values(pabyBand0 + iOffset);
const auto row1_0 =
XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize);
const auto row2_0 =
XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize);
const auto row3_0 =
XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize);

const GByte *pabyBand1 = reinterpret_cast<const GByte *>(
poWK->papabySrcImage[iBand + 1]);
const auto row0_1 = XMMLoad4Values(pabyBand1 + iOffset);
const auto row1_1 =
XMMLoad4Values(pabyBand1 + iOffset + poWK->nSrcXSize);
const auto row2_1 =
XMMLoad4Values(pabyBand1 + iOffset + 2 * poWK->nSrcXSize);
const auto row3_1 =
XMMLoad4Values(pabyBand1 + iOffset + 3 * poWK->nSrcXSize);

const float fValue_0 = XMMHorizontalAdd(_mm_add_ps(
_mm_add_ps(
_mm_add_ps(
_mm_mul_ps(_mm_mul_ps(row0_0, weightsX), weightsY0),
_mm_mul_ps(_mm_mul_ps(row1_0, weightsX), weightsY1)),
_mm_mul_ps(_mm_mul_ps(row2_0, weightsX), weightsY2)),
_mm_mul_ps(_mm_mul_ps(row3_0, weightsX), weightsY3)));

const float fValue_1 = XMMHorizontalAdd(_mm_add_ps(
_mm_add_ps(
_mm_add_ps(
_mm_mul_ps(_mm_mul_ps(row0_1, weightsX), weightsY0),
_mm_mul_ps(_mm_mul_ps(row1_1, weightsX), weightsY1)),
_mm_mul_ps(_mm_mul_ps(row2_1, weightsX), weightsY2)),
_mm_mul_ps(_mm_mul_ps(row3_1, weightsX), weightsY3)));

reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand])[iDstOffset] =
GWKClampValueT<GByte>(fValue_0);

reinterpret_cast<GByte *>(
poWK->papabyDstImage[iBand + 1])[iDstOffset] =
GWKClampValueT<GByte>(fValue_1);
}
if (iBand < poWK->nBands)
{
const GByte *pabyBand0 =
reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
const auto row0 = XMMLoad4Values(pabyBand0 + iOffset);
const auto row1 =
XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize);
const auto row2 =
XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize);
const auto row3 =
XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize);

const float fValue = XMMHorizontalAdd(_mm_add_ps(
_mm_add_ps(
_mm_add_ps(
_mm_mul_ps(_mm_mul_ps(row0, weightsX), weightsY0),
_mm_mul_ps(_mm_mul_ps(row1, weightsX), weightsY1)),
_mm_mul_ps(_mm_mul_ps(row2, weightsX), weightsY2)),
_mm_mul_ps(_mm_mul_ps(row3, weightsX), weightsY3)));

reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand])[iDstOffset] =
GWKClampValueT<GByte>(fValue);
}
}

if (poWK->pafDstDensity)
poWK->pafDstDensity[iDstOffset] = 1.0f;
}

#endif // defined(__x86_64) || defined(_M_X64)

/************************************************************************/
/* GWKResampleNoMasksOrDstDensityOnlyThreadInternal() */
/************************************************************************/
Expand Down Expand Up @@ -5957,6 +6085,21 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
const GPtrDiff_t iDstOffset =
iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;

#if defined(__x86_64) || defined(_M_X64)
if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
std::is_same<T, GByte>::value)
{
if (poWK->nBands > 1 && !poWK->bApplyVerticalShift)
{
GWKCubicResampleNoMasks4ByteMultiBand(
poWK, padfX[iDstX] - poWK->nSrcXOff,
padfY[iDstX] - poWK->nSrcYOff, iDstOffset);

continue;
}
}
#endif // defined(__x86_64) || defined(_M_X64)

[[maybe_unused]] double dfInvWeights = 0;
for (int iBand = 0; iBand < poWK->nBands; iBand++)
{
Expand Down
20 changes: 20 additions & 0 deletions autotest/utilities/test_gdalwarp_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -4330,3 +4330,23 @@ def test_gdalwarp_lib_src_is_geog_arc_second():
assert out_ds.RasterXSize == 5464
assert out_ds.RasterYSize == 5464
assert out_ds.GetRasterBand(1).Checksum() == 31856


###############################################################################
#


def test_gdalwarp_lib_cubic_multiband_byte_4sample_optim():

out_ds = gdal.Warp(
"",
"../gdrivers/data/small_world.tif",
options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic",
)
assert out_ds.RasterXSize == 21
assert out_ds.RasterYSize == 21
assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
4785,
4689,
5007,
]

0 comments on commit dbc8764

Please sign in to comment.