@@ -2996,7 +2996,6 @@ static CPL_INLINE __m128 XMMLoad4Values(const GByte *ptr)
2996
2996
return _mm_cvtepi32_ps (xmm_i);
2997
2997
}
2998
2998
2999
- #if defined(USE_SSE_CUBIC_IMPL)
3000
2999
static CPL_INLINE __m128 XMMLoad4Values (const GUInt16 *ptr)
3001
3000
{
3002
3001
GUInt64 i;
@@ -3011,7 +3010,6 @@ static CPL_INLINE __m128 XMMLoad4Values(const GUInt16 *ptr)
3011
3010
#endif
3012
3011
return _mm_cvtepi32_ps (xmm_i);
3013
3012
}
3014
- #endif
3015
3013
3016
3014
/* ***********************************************************************/
3017
3015
/* XMMHorizontalAdd() */
@@ -5857,7 +5855,7 @@ static CPLErr GWKRealCase(GDALWarpKernel *poWK)
5857
5855
}
5858
5856
5859
5857
/* ***********************************************************************/
5860
- /* GWKCubicResampleNoMasks4ByteMultiBand() */
5858
+ /* GWKCubicResampleNoMasks4MultiBandT() */
5861
5859
/* ***********************************************************************/
5862
5860
5863
5861
/* Restricted to x86_64 processors, which are guaranteed to have SSE2 */
@@ -5878,9 +5876,10 @@ static inline float Convolute4x4(const __m128 row0, const __m128 row1,
5878
5876
_mm_mul_ps (_mm_mul_ps (row3, weightsX), weightsY3)));
5879
5877
}
5880
5878
5881
- static void GWKCubicResampleNoMasks4ByteMultiBand (const GDALWarpKernel *poWK,
5882
- double dfSrcX, double dfSrcY,
5883
- const GPtrDiff_t iDstOffset)
5879
+ template <class T >
5880
+ static void GWKCubicResampleNoMasks4MultiBandT (const GDALWarpKernel *poWK,
5881
+ double dfSrcX, double dfSrcY,
5882
+ const GPtrDiff_t iDstOffset)
5884
5883
{
5885
5884
const double dfSrcXShifted = dfSrcX - 0.5 ;
5886
5885
const int iSrcX = static_cast <int >(dfSrcXShifted);
@@ -5895,10 +5894,10 @@ static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
5895
5894
{
5896
5895
for (int iBand = 0 ; iBand < poWK->nBands ; iBand++)
5897
5896
{
5898
- GByte value;
5897
+ T value;
5899
5898
GWKBilinearResampleNoMasks4SampleT (poWK, iBand, dfSrcX, dfSrcY,
5900
5899
&value);
5901
- reinterpret_cast <GByte *>(poWK->papabyDstImage [iBand])[iDstOffset] =
5900
+ reinterpret_cast <T *>(poWK->papabyDstImage [iBand])[iDstOffset] =
5902
5901
value;
5903
5902
}
5904
5903
}
@@ -5923,26 +5922,25 @@ static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
5923
5922
// Process 2 bands at a time
5924
5923
for (; iBand + 1 < poWK->nBands ; iBand += 2 )
5925
5924
{
5926
- const GByte *CPL_RESTRICT pabyBand0 =
5927
- reinterpret_cast <const GByte *>(poWK->papabySrcImage [iBand]);
5928
- const auto row0_0 = XMMLoad4Values (pabyBand0 + iOffset);
5925
+ const T *CPL_RESTRICT pBand0 =
5926
+ reinterpret_cast <const T *>(poWK->papabySrcImage [iBand]);
5927
+ const auto row0_0 = XMMLoad4Values (pBand0 + iOffset);
5929
5928
const auto row1_0 =
5930
- XMMLoad4Values (pabyBand0 + iOffset + poWK->nSrcXSize );
5929
+ XMMLoad4Values (pBand0 + iOffset + poWK->nSrcXSize );
5931
5930
const auto row2_0 =
5932
- XMMLoad4Values (pabyBand0 + iOffset + 2 * poWK->nSrcXSize );
5931
+ XMMLoad4Values (pBand0 + iOffset + 2 * poWK->nSrcXSize );
5933
5932
const auto row3_0 =
5934
- XMMLoad4Values (pabyBand0 + iOffset + 3 * poWK->nSrcXSize );
5933
+ XMMLoad4Values (pBand0 + iOffset + 3 * poWK->nSrcXSize );
5935
5934
5936
- const GByte *CPL_RESTRICT pabyBand1 =
5937
- reinterpret_cast <const GByte *>(
5938
- poWK->papabySrcImage [iBand + 1 ]);
5939
- const auto row0_1 = XMMLoad4Values (pabyBand1 + iOffset);
5935
+ const T *CPL_RESTRICT pBand1 =
5936
+ reinterpret_cast <const T *>(poWK->papabySrcImage [iBand + 1 ]);
5937
+ const auto row0_1 = XMMLoad4Values (pBand1 + iOffset);
5940
5938
const auto row1_1 =
5941
- XMMLoad4Values (pabyBand1 + iOffset + poWK->nSrcXSize );
5939
+ XMMLoad4Values (pBand1 + iOffset + poWK->nSrcXSize );
5942
5940
const auto row2_1 =
5943
- XMMLoad4Values (pabyBand1 + iOffset + 2 * poWK->nSrcXSize );
5941
+ XMMLoad4Values (pBand1 + iOffset + 2 * poWK->nSrcXSize );
5944
5942
const auto row3_1 =
5945
- XMMLoad4Values (pabyBand1 + iOffset + 3 * poWK->nSrcXSize );
5943
+ XMMLoad4Values (pBand1 + iOffset + 3 * poWK->nSrcXSize );
5946
5944
5947
5945
const float fValue_0 =
5948
5946
Convolute4x4 (row0_0, row1_0, row2_0, row3_0, weightsX,
@@ -5952,32 +5950,32 @@ static void GWKCubicResampleNoMasks4ByteMultiBand(const GDALWarpKernel *poWK,
5952
5950
Convolute4x4 (row0_1, row1_1, row2_1, row3_1, weightsX,
5953
5951
weightsY0, weightsY1, weightsY2, weightsY3);
5954
5952
5955
- GByte *CPL_RESTRICT pabyDstBand0 =
5956
- reinterpret_cast <GByte *>(poWK->papabyDstImage [iBand]);
5957
- pabyDstBand0 [iDstOffset] = GWKClampValueT<GByte >(fValue_0 );
5953
+ T *CPL_RESTRICT pDstBand0 =
5954
+ reinterpret_cast <T *>(poWK->papabyDstImage [iBand]);
5955
+ pDstBand0 [iDstOffset] = GWKClampValueT<T >(fValue_0 );
5958
5956
5959
- GByte *CPL_RESTRICT pabyDstBand1 =
5960
- reinterpret_cast <GByte *>(poWK->papabyDstImage [iBand + 1 ]);
5961
- pabyDstBand1 [iDstOffset] = GWKClampValueT<GByte >(fValue_1 );
5957
+ T *CPL_RESTRICT pDstBand1 =
5958
+ reinterpret_cast <T *>(poWK->papabyDstImage [iBand + 1 ]);
5959
+ pDstBand1 [iDstOffset] = GWKClampValueT<T >(fValue_1 );
5962
5960
}
5963
5961
if (iBand < poWK->nBands )
5964
5962
{
5965
- const GByte *pabyBand0 =
5966
- reinterpret_cast <const GByte *>(poWK->papabySrcImage [iBand]);
5967
- const auto row0 = XMMLoad4Values (pabyBand0 + iOffset);
5963
+ const T *pBand0 =
5964
+ reinterpret_cast <const T *>(poWK->papabySrcImage [iBand]);
5965
+ const auto row0 = XMMLoad4Values (pBand0 + iOffset);
5968
5966
const auto row1 =
5969
- XMMLoad4Values (pabyBand0 + iOffset + poWK->nSrcXSize );
5967
+ XMMLoad4Values (pBand0 + iOffset + poWK->nSrcXSize );
5970
5968
const auto row2 =
5971
- XMMLoad4Values (pabyBand0 + iOffset + 2 * poWK->nSrcXSize );
5969
+ XMMLoad4Values (pBand0 + iOffset + 2 * poWK->nSrcXSize );
5972
5970
const auto row3 =
5973
- XMMLoad4Values (pabyBand0 + iOffset + 3 * poWK->nSrcXSize );
5971
+ XMMLoad4Values (pBand0 + iOffset + 3 * poWK->nSrcXSize );
5974
5972
5975
5973
const float fValue =
5976
5974
Convolute4x4 (row0, row1, row2, row3, weightsX, weightsY0,
5977
5975
weightsY1, weightsY2, weightsY3);
5978
5976
5979
- reinterpret_cast <GByte *>(poWK->papabyDstImage [iBand])[iDstOffset] =
5980
- GWKClampValueT<GByte >(fValue );
5977
+ reinterpret_cast <T *>(poWK->papabyDstImage [iBand])[iDstOffset] =
5978
+ GWKClampValueT<T >(fValue );
5981
5979
}
5982
5980
}
5983
5981
@@ -6093,11 +6091,12 @@ static void GWKResampleNoMasksOrDstDensityOnlyThreadInternal(void *pData)
6093
6091
6094
6092
#if defined(__x86_64) || defined(_M_X64)
6095
6093
if constexpr (bUse4SamplesFormula && eResample == GRA_Cubic &&
6096
- std::is_same<T, GByte>::value)
6094
+ (std::is_same<T, GByte>::value ||
6095
+ std::is_same<T, GUInt16>::value))
6097
6096
{
6098
6097
if (poWK->nBands > 1 && !poWK->bApplyVerticalShift )
6099
6098
{
6100
- GWKCubicResampleNoMasks4ByteMultiBand (
6099
+ GWKCubicResampleNoMasks4MultiBandT<T> (
6101
6100
poWK, padfX[iDstX] - poWK->nSrcXOff ,
6102
6101
padfY[iDstX] - poWK->nSrcYOff , iDstOffset);
6103
6102
0 commit comments