Skip to content

Commit

Permalink
Warper: improve performance of Byte/UInt16 multi-band bicubic warping…
Browse files Browse the repository at this point in the history
… with XSCALE=1 and YSCALE=1 on SSE2 / x86_64

On extracted2.tif dataset from
OSGeo#11042 (comment) (12500 x 10000 pixels, 3 bands, Byte) ``gdalwarp -s_srs EPSG:8353 -t_srs EPSG:3857 -r cubic extracted2.tif tmp.tif -overwrite -wo XSCALE=1 -wo YSCALE=1`` goes from 19.0 seconds to 15.7.
  • Loading branch information
rouault committed Oct 20, 2024
1 parent 40ce085 commit 78eed8c
Show file tree
Hide file tree
Showing 2 changed files with 255 additions and 25 deletions.
203 changes: 178 additions & 25 deletions alg/gdalwarpkernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2849,31 +2849,33 @@ static bool GWKBilinearResampleNoMasks4SampleT(const GDALWarpKernel *poWK,
// or http://en.wikipedia.org/wiki/Cubic_Hermite_spline : CINTx(p_1,p0,p1,p2)
// http://en.wikipedia.org/wiki/Bicubic_interpolation: matrix notation

// TODO(schwehr): Use an inline function.
#define CubicConvolution(distance1, distance2, distance3, f0, f1, f2, f3) \
(f1 + 0.5 * (distance1 * (f2 - f0) + \
distance2 * (2.0 * f0 - 5.0 * f1 + 4.0 * f2 - f3) + \
distance3 * (3.0 * (f1 - f2) + f3 - f0)))
template <typename T>
static inline T CubicConvolution(T distance1, T distance2, T distance3, T f0,
T f1, T f2, T f3)
{
return (f1 + T(0.5) * (distance1 * (f2 - f0) +
distance2 * (2 * f0 - 5 * f1 + 4 * f2 - f3) +
distance3 * (3 * (f1 - f2) + f3 - f0)));
}

/************************************************************************/
/* GWKCubicComputeWeights() */
/************************************************************************/

// adfCoeffs[2] = 1.0 - (adfCoeffs[0] + adfCoeffs[1] - adfCoeffs[3]);

// TODO(schwehr): Use an inline function.
#define GWKCubicComputeWeights(dfX_, adfCoeffs) \
{ \
const double dfX = dfX_; \
const double dfHalfX = 0.5 * dfX; \
const double dfThreeX = 3.0 * dfX; \
const double dfHalfX2 = dfHalfX * dfX; \
\
adfCoeffs[0] = dfHalfX * (-1 + dfX * (2 - dfX)); \
adfCoeffs[1] = 1 + dfHalfX2 * (-5 + dfThreeX); \
adfCoeffs[2] = dfHalfX * (1 + dfX * (4 - dfThreeX)); \
adfCoeffs[3] = dfHalfX2 * (-1 + dfX); \
}
template <typename T>
static inline void GWKCubicComputeWeights(T x, T coeffs[4])
{
const T halfX = T(0.5) * x;
const T threeX = T(3.0) * x;
const T halfX2 = halfX * x;

coeffs[0] = halfX * (-1 + x * (2 - x));
coeffs[1] = 1 + halfX2 * (-5 + threeX);
coeffs[2] = halfX * (1 + x * (4 - threeX));
coeffs[3] = halfX2 * (-1 + x);
}

// TODO(schwehr): Use an inline function.
#define CONVOL4(v1, v2) \
Expand Down Expand Up @@ -2969,10 +2971,7 @@ static bool GWKCubicResample4Sample(const GDALWarpKernel *poWK, int iBand,
return true;
}

// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
// perf benefit.

#if defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
#if defined(__x86_64) || defined(_M_X64)

/************************************************************************/
/* XMMLoad4Values() */
Expand All @@ -2985,7 +2984,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
{
unsigned int i;
memcpy(&i, ptr, 4);
__m128i xmm_i = _mm_cvtsi32_si128(s);
__m128i xmm_i = _mm_cvtsi32_si128(i);
// Zero extend 4 packed unsigned 8-bit integers in a to packed
// 32-bit integers.
#if __SSE4_1__
Expand All @@ -3001,7 +3000,7 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
{
GUInt64 i;
memcpy(&i, ptr, 8);
__m128i xmm_i = _mm_cvtsi64_si128(s);
__m128i xmm_i = _mm_cvtsi64_si128(i);
// Zero extend 4 packed unsigned 16-bit integers in a to packed
// 32-bit integers.
#if __SSE4_1__
Expand Down Expand Up @@ -3038,14 +3037,16 @@ static CPL_INLINE float XMMHorizontalAdd(__m128 v)
}
#endif

#endif // defined(USE_SSE_CUBIC_IMPL) && (defined(__x86_64) || defined(_M_X64))
#endif // (defined(__x86_64) || defined(_M_X64))

/************************************************************************/
/* GWKCubicResampleSrcMaskIsDensity4SampleRealT() */
/************************************************************************/

// Note: if USE_SSE_CUBIC_IMPL, only instantiate that for Byte and UInt16,
// because there are a few assumptions above those types.
// We do not define USE_SSE_CUBIC_IMPL since in practice, it gives zero
// perf benefit.

template <class T>
static CPL_INLINE bool GWKCubicResampleSrcMaskIsDensity4SampleRealT(
Expand Down Expand Up @@ -5853,6 +5854,142 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)
return GWKRun(poWK, "GWKRealCase", GWKRealCaseThread);
}

/************************************************************************/
/* GWKCubicResampleNoMasks4MultiBandT() */
/************************************************************************/

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* and enough SSE registries */
#if defined(__x86_64) || defined(_M_X64)

static inline float Convolute4x4(const __m128 row0, const __m128 row1,
const __m128 row2, const __m128 row3,
const __m128 weightsXY0,
const __m128 weightsXY1,
const __m128 weightsXY2,
const __m128 weightsXY3)
{
return XMMHorizontalAdd(
_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(row0, weightsXY0),
_mm_mul_ps(row1, weightsXY1)),
_mm_mul_ps(row2, weightsXY2)),
_mm_mul_ps(row3, weightsXY3)));
}

template <class T>
static void GWKCubicResampleNoMasks4MultiBandT(const GDALWarpKernel *poWK,
double dfSrcX, double dfSrcY,
const GPtrDiff_t iDstOffset)
{
const double dfSrcXShifted = dfSrcX - 0.5;
const int iSrcX = static_cast<int>(dfSrcXShifted);
const double dfSrcYShifted = dfSrcY - 0.5;
const int iSrcY = static_cast<int>(dfSrcYShifted);
const GPtrDiff_t iSrcOffset =
iSrcX + static_cast<GPtrDiff_t>(iSrcY) * poWK->nSrcXSize;

// Get the bilinear interpolation at the image borders.
if (iSrcX - 1 < 0 || iSrcX + 2 >= poWK->nSrcXSize || iSrcY - 1 < 0 ||
iSrcY + 2 >= poWK->nSrcYSize)
{
for (int iBand = 0; iBand < poWK->nBands; iBand++)
{
T value;
GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY,
&value);
reinterpret_cast<T *>(poWK->papabyDstImage[iBand])[iDstOffset] =
value;
}
}
else
{
const float fDeltaX = static_cast<float>(dfSrcXShifted) - iSrcX;
const float fDeltaY = static_cast<float>(dfSrcYShifted) - iSrcY;

float afCoeffsX[4];
float afCoeffsY[4];
GWKCubicComputeWeights(fDeltaX, afCoeffsX);
GWKCubicComputeWeights(fDeltaY, afCoeffsY);
const auto weightsX = _mm_loadu_ps(afCoeffsX);
const auto weightsXY0 =
_mm_mul_ps(_mm_load1_ps(&afCoeffsY[0]), weightsX);
const auto weightsXY1 =
_mm_mul_ps(_mm_load1_ps(&afCoeffsY[1]), weightsX);
const auto weightsXY2 =
_mm_mul_ps(_mm_load1_ps(&afCoeffsY[2]), weightsX);
const auto weightsXY3 =
_mm_mul_ps(_mm_load1_ps(&afCoeffsY[3]), weightsX);

const GPtrDiff_t iOffset = iSrcOffset - poWK->nSrcXSize - 1;

int iBand = 0;
// Process 2 bands at a time
for (; iBand + 1 < poWK->nBands; iBand += 2)
{
const T *CPL_RESTRICT pBand0 =
reinterpret_cast<const T *>(poWK->papabySrcImage[iBand]);
const auto row0_0 = XMMLoad4Values(pBand0 + iOffset);
const auto row1_0 =
XMMLoad4Values(pBand0 + iOffset + poWK->nSrcXSize);
const auto row2_0 =
XMMLoad4Values(pBand0 + iOffset + 2 * poWK->nSrcXSize);
const auto row3_0 =
XMMLoad4Values(pBand0 + iOffset + 3 * poWK->nSrcXSize);

const T *CPL_RESTRICT pBand1 =
reinterpret_cast<const T *>(poWK->papabySrcImage[iBand + 1]);
const auto row0_1 = XMMLoad4Values(pBand1 + iOffset);
const auto row1_1 =
XMMLoad4Values(pBand1 + iOffset + poWK->nSrcXSize);
const auto row2_1 =
XMMLoad4Values(pBand1 + iOffset + 2 * poWK->nSrcXSize);
const auto row3_1 =
XMMLoad4Values(pBand1 + iOffset + 3 * poWK->nSrcXSize);

const float fValue_0 =
Convolute4x4(row0_0, row1_0, row2_0, row3_0, weightsXY0,
weightsXY1, weightsXY2, weightsXY3);

const float fValue_1 =
Convolute4x4(row0_1, row1_1, row2_1, row3_1, weightsXY0,
weightsXY1, weightsXY2, weightsXY3);

T *CPL_RESTRICT pDstBand0 =
reinterpret_cast<T *>(poWK->papabyDstImage[iBand]);
pDstBand0[iDstOffset] = GWKClampValueT<T>(fValue_0);

T *CPL_RESTRICT pDstBand1 =
reinterpret_cast<T *>(poWK->papabyDstImage[iBand + 1]);
pDstBand1[iDstOffset] = GWKClampValueT<T>(fValue_1);
}
if (iBand < poWK->nBands)
{
const T *CPL_RESTRICT pBand0 =
reinterpret_cast<const T *>(poWK->papabySrcImage[iBand]);
const auto row0 = XMMLoad4Values(pBand0 + iOffset);
const auto row1 =
XMMLoad4Values(pBand0 + iOffset + poWK->nSrcXSize);
const auto row2 =
XMMLoad4Values(pBand0 + iOffset + 2 * poWK->nSrcXSize);
const auto row3 =
XMMLoad4Values(pBand0 + iOffset + 3 * poWK->nSrcXSize);

const float fValue =
Convolute4x4(row0, row1, row2, row3, weightsXY0, weightsXY1,
weightsXY2, weightsXY3);

T *CPL_RESTRICT pDstBand =
reinterpret_cast<T *>(poWK->papabyDstImage[iBand]);
pDstBand[iDstOffset] = GWKClampValueT<T>(fValue);
}
}

if (poWK->pafDstDensity)
poWK->pafDstDensity[iDstOffset] = 1.0f;
}

#endif // defined(__x86_64) || defined(_M_X64)

/************************************************************************/
/* GWKResampleNoMasksOrDstDensityOnlyThreadInternal() */
/************************************************************************/
Expand Down Expand Up @@ -5957,6 +6094,22 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
const GPtrDiff_t iDstOffset =
iDstX + static_cast<GPtrDiff_t>(iDstY) * nDstXSize;

#if defined(__x86_64) || defined(_M_X64)
if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
(std::is_same<T, GByte>::value ||
std::is_same<T, GUInt16>::value))
{
if (poWK->nBands > 1 && !poWK->bApplyVerticalShift)
{
GWKCubicResampleNoMasks4MultiBandT<T>(
poWK, padfX[iDstX] - poWK->nSrcXOff,
padfY[iDstX] - poWK->nSrcYOff, iDstOffset);

continue;
}
}
#endif // defined(__x86_64) || defined(_M_X64)

[[maybe_unused]] double dfInvWeights = 0;
for (int iBand = 0; iBand < poWK->nBands; iBand++)
{
Expand Down
77 changes: 77 additions & 0 deletions autotest/utilities/test_gdalwarp_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -4330,3 +4330,80 @@ def test_gdalwarp_lib_src_is_geog_arc_second():
assert out_ds.RasterXSize == 5464
assert out_ds.RasterYSize == 5464
assert out_ds.GetRasterBand(1).Checksum() == 31856


###############################################################################
# Test GWKCubicResampleNoMasks4MultiBandT<Byte>()


def test_gdalwarp_lib_cubic_multiband_byte_4sample_optim():

src_ds = gdal.Open("../gdrivers/data/small_world.tif")

# RGB only
out_ds = gdal.Warp(
"",
src_ds,
options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic",
)
assert out_ds.RasterXSize == 21
assert out_ds.RasterYSize == 21
assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
4785,
4689,
5007,
]

# With dest alpha
out_ds = gdal.Warp(
"",
src_ds,
options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic -dstalpha",
)
assert out_ds.RasterXSize == 21
assert out_ds.RasterYSize == 21
assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
4785,
4689,
5007,
]
assert out_ds.GetRasterBand(4).ComputeRasterMinMax() == (255, 255)

# Test edge effects
# (slightly change the target resolution so that the nearest approximation
# doesn't kick in)
out_ds = gdal.Warp(
"",
src_ds,
options="-f MEM -r cubic -tr 0.9000001 0.9000001 -wo XSCALE=1 -wo YSCALE=1",
)
assert out_ds.RasterXSize == 400
assert out_ds.RasterYSize == 200
assert out_ds.ReadRaster() == src_ds.ReadRaster()


###############################################################################
# Test GWKCubicResampleNoMasks4MultiBandT<GUInt16>()


def test_gdalwarp_lib_cubic_multiband_uint16_4sample_optim():

src_ds = gdal.Open("../gdrivers/data/small_world.tif")
src_ds = gdal.Translate(
"", src_ds, options="-f MEM -ot UInt16 -scale 0 255 0 65535"
)

# RGB only
out_ds = gdal.Warp(
"",
src_ds,
options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic",
)
out_ds = gdal.Translate("", out_ds, options="-f MEM -ot Byte -scale 0 65535 0 255")
assert out_ds.RasterXSize == 21
assert out_ds.RasterYSize == 21
assert [out_ds.GetRasterBand(i + 1).Checksum() for i in range(3)] == [
4785,
4689,
5007,
]

0 comments on commit 78eed8c

Please sign in to comment.