Skip to content

Commit

Permalink
Warper: generalize previous commit to UInt16
Browse files Browse the repository at this point in the history
  • Loading branch information
rouault committed Oct 20, 2024
1 parent 1c5796f commit 2509f96
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 38 deletions.
73 changes: 36 additions & 37 deletions alg/gdalwarpkernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2996,7 +2996,6 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
return _mm_cvtepi32_ps(xmm_i);
}

#if defined(USE_SSE_CUBIC_IMPL)
static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
{
GUInt64 i;
Expand All @@ -3011,7 +3010,6 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
#endif
return _mm_cvtepi32_ps(xmm_i);
}
#endif

/************************************************************************/
/* XMMHorizontalAdd() */
Expand Down Expand Up @@ -5857,7 +5855,7 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)
}

/************************************************************************/
/* GWKCubicResampleNoMasks4ByteMultiBand() */
/* GWKCubicResampleNoMasks4MultiBandT() */
/************************************************************************/

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
Expand All @@ -5878,9 +5876,10 @@ static inline float Convolute4x4(const __m128 row0, const __m128 row1,
_mm_mul_ps(_mm_mul_ps(row3, weightsX), weightsY3)));
}

static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
double dfSrcX, double dfSrcY,
const GPtrDiff_t iDstOffset)
template <class T>
static void GWKCubicResampleNoMasks4MultiBandT(const GDALWarpKernel *poWK,
double dfSrcX, double dfSrcY,
const GPtrDiff_t iDstOffset)
{
const double dfSrcXShifted = dfSrcX - 0.5;
const int iSrcX = static_cast<int>(dfSrcXShifted);
Expand All @@ -5895,10 +5894,10 @@ static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
{
for (int iBand = 0; iBand < poWK->nBands; iBand++)
{
GByte value;
T value;
GWKBilinearResampleNoMasks4SampleT(poWK, iBand, dfSrcX, dfSrcY,
&value);
reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand])[iDstOffset] =
reinterpret_cast<T *>(poWK->papabyDstImage[iBand])[iDstOffset] =
value;
}
}
Expand All @@ -5923,26 +5922,25 @@ static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
// Process 2 bands at a time
for (; iBand + 1 < poWK->nBands; iBand += 2)
{
const GByte *CPL_RESTRICT pabyBand0 =
reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
const auto row0_0 = XMMLoad4Values(pabyBand0 + iOffset);
const T *CPL_RESTRICT pBand0 =
reinterpret_cast<const T *>(poWK->papabySrcImage[iBand]);
const auto row0_0 = XMMLoad4Values(pBand0 + iOffset);
const auto row1_0 =
XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize);
XMMLoad4Values(pBand0 + iOffset + poWK->nSrcXSize);
const auto row2_0 =
XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize);
XMMLoad4Values(pBand0 + iOffset + 2 * poWK->nSrcXSize);
const auto row3_0 =
XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize);
XMMLoad4Values(pBand0 + iOffset + 3 * poWK->nSrcXSize);

const GByte *CPL_RESTRICT pabyBand1 =
reinterpret_cast<const GByte *>(
poWK->papabySrcImage[iBand + 1]);
const auto row0_1 = XMMLoad4Values(pabyBand1 + iOffset);
const T *CPL_RESTRICT pBand1 =
reinterpret_cast<const T *>(poWK->papabySrcImage[iBand + 1]);
const auto row0_1 = XMMLoad4Values(pBand1 + iOffset);
const auto row1_1 =
XMMLoad4Values(pabyBand1 + iOffset + poWK->nSrcXSize);
XMMLoad4Values(pBand1 + iOffset + poWK->nSrcXSize);
const auto row2_1 =
XMMLoad4Values(pabyBand1 + iOffset + 2 * poWK->nSrcXSize);
XMMLoad4Values(pBand1 + iOffset + 2 * poWK->nSrcXSize);
const auto row3_1 =
XMMLoad4Values(pabyBand1 + iOffset + 3 * poWK->nSrcXSize);
XMMLoad4Values(pBand1 + iOffset + 3 * poWK->nSrcXSize);

const float fValue_0 =
Convolute4x4(row0_0, row1_0, row2_0, row3_0, weightsX,
Expand All @@ -5952,32 +5950,32 @@ static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
Convolute4x4(row0_1, row1_1, row2_1, row3_1, weightsX,
weightsY0, weightsY1, weightsY2, weightsY3);

GByte *CPL_RESTRICT pabyDstBand0 =
reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand]);
pabyDstBand0[iDstOffset] = GWKClampValueT<GByte>(fValue_0);
T *CPL_RESTRICT pDstBand0 =
reinterpret_cast<T *>(poWK->papabyDstImage[iBand]);
pDstBand0[iDstOffset] = GWKClampValueT<T>(fValue_0);

GByte *CPL_RESTRICT pabyDstBand1 =
reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand + 1]);
pabyDstBand1[iDstOffset] = GWKClampValueT<GByte>(fValue_1);
T *CPL_RESTRICT pDstBand1 =
reinterpret_cast<T *>(poWK->papabyDstImage[iBand + 1]);
pDstBand1[iDstOffset] = GWKClampValueT<T>(fValue_1);
}
if (iBand < poWK->nBands)
{
const GByte *pabyBand0 =
reinterpret_cast<const GByte *>(poWK->papabySrcImage[iBand]);
const auto row0 = XMMLoad4Values(pabyBand0 + iOffset);
const T *pBand0 =
reinterpret_cast<const T *>(poWK->papabySrcImage[iBand]);
const auto row0 = XMMLoad4Values(pBand0 + iOffset);
const auto row1 =
XMMLoad4Values(pabyBand0 + iOffset + poWK->nSrcXSize);
XMMLoad4Values(pBand0 + iOffset + poWK->nSrcXSize);
const auto row2 =
XMMLoad4Values(pabyBand0 + iOffset + 2 * poWK->nSrcXSize);
XMMLoad4Values(pBand0 + iOffset + 2 * poWK->nSrcXSize);
const auto row3 =
XMMLoad4Values(pabyBand0 + iOffset + 3 * poWK->nSrcXSize);
XMMLoad4Values(pBand0 + iOffset + 3 * poWK->nSrcXSize);

const float fValue =
Convolute4x4(row0, row1, row2, row3, weightsX, weightsY0,
weightsY1, weightsY2, weightsY3);

reinterpret_cast<GByte *>(poWK->papabyDstImage[iBand])[iDstOffset] =
GWKClampValueT<GByte>(fValue);
reinterpret_cast<T *>(poWK->papabyDstImage[iBand])[iDstOffset] =
GWKClampValueT<T>(fValue);
}
}

Expand Down Expand Up @@ -6093,11 +6091,12 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)

#if defined(__x86_64) || defined(_M_X64)
if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
std::is_same<T, GByte>::value)
(std::is_same<T, GByte>::value ||
std::is_same<T, GUInt16>::value))
{
if (poWK->nBands > 1 && !poWK->bApplyVerticalShift)
{
GWKCubicResampleNoMasks4ByteMultiBand(
GWKCubicResampleNoMasks4MultiBandT<T>(
poWK, padfX[iDstX] - poWK->nSrcXOff,
padfY[iDstX] - poWK->nSrcYOff, iDstOffset);

Expand Down
29 changes: 28 additions & 1 deletion autotest/utilities/test_gdalwarp_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -4333,7 +4333,7 @@ def test_gdalwarp_lib_src_is_geog_arc_second():


###############################################################################
# Test GWKCubicResampleNoMasks4ByteMultiBand()
# Test GWKCubicResampleNoMasks4MultiBandT<GByte>()


def test_gdalwarp_lib_cubic_multiband_byte_4sample_optim():
Expand Down Expand Up @@ -4380,3 +4380,30 @@ def test_gdalwarp_lib_cubic_multiband_byte_4sample_optim():
assert out_ds.RasterXSize == 400
assert out_ds.RasterYSize == 200
assert out_ds.ReadRaster() == src_ds.ReadRaster()


###############################################################################
# Test GWKCubicResampleNoMasks4MultiBandT<GUInt16>()


def test_gdalwarp_lib_cubic_multiband_uint16_4sample_optim():
    """Cubic-warp a UInt16 version of small_world.tif and verify the result.

    The source raster is rescaled from 0-255 to 0-65535 as UInt16, warped
    with cubic resampling, then rescaled back to Byte so that the raster
    size and the checksums of the three bands can be compared against
    fixed reference values.
    """

    # Build an in-memory UInt16 source from the reference image.
    world_ds = gdal.Open("../gdrivers/data/small_world.tif")
    src_ds = gdal.Translate(
        "", world_ds, options="-f MEM -ot UInt16 -scale 0 255 0 65535"
    )

    # RGB only
    warped_ds = gdal.Warp(
        "",
        src_ds,
        options="-f MEM -tr 0.9 0.9 -te -10 40.1 8.9 59 -r cubic",
    )

    # Bring the warped result back to Byte before computing checksums.
    out_ds = gdal.Translate(
        "", warped_ds, options="-f MEM -ot Byte -scale 0 65535 0 255"
    )

    assert out_ds.RasterXSize == 21
    assert out_ds.RasterYSize == 21

    expected_checksums = [4785, 4689, 5007]
    band_checksums = []
    for band_number in (1, 2, 3):
        band_checksums.append(out_ds.GetRasterBand(band_number).Checksum())
    assert band_checksums == expected_checksums

0 comments on commit 2509f96

Please sign in to comment.