@@ -910,22 +910,22 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
910910 * Table structure for a 16x 8-bit entry table.
911911 */
912912struct vtable8_16x8 {
913- vint8 t0;
913+ svuint8_8_t t0;
914914};
915915
916916/*
917917 * Table structure for a 32x 8-bit entry table.
918918 */
919919struct vtable8_32x8 {
920- vint8 t0;
920+ svuint8_8_t t0;
921921};
922922
923923/*
924924 * Table structure for a 64x 8-bit entry table.
925925 */
926926struct vtable8_64x8 {
927- vint8 t0;
928- vint8 t1;
927+ svuint8_8_t t0;
928+ svuint8_8_t t1;
929929};
930930
931931/* *
@@ -936,7 +936,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
936936 const uint8_t * data
937937) {
938938 // Top half of register will be zeros
939- table.t0 = vint8 ( svld1_u8 (svptrue_pat_b8 (SV_VL16), data) );
939+ table.t0 = svld1_u8 (svptrue_pat_b8 (SV_VL16), data);
940940}
941941
942942/* *
@@ -946,7 +946,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
946946 vtable8_32x8& table,
947947 const uint8_t * data
948948) {
949- table.t0 = vint8 ( svld1_u8 (svptrue_b8 (), data) );
949+ table.t0 = svld1_u8 (svptrue_b8 (), data);
950950}
951951
952952/* *
@@ -956,8 +956,8 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
956956 vtable8_64x8& table,
957957 const uint8_t * data
958958) {
959- table.t0 = vint8 ( svld1_u8 (svptrue_b8 (), data) );
960- table.t1 = vint8 ( svld1_u8 (svptrue_b8 (), data + 32 ) );
959+ table.t0 = svld1_u8 (svptrue_b8 (), data);
960+ table.t1 = svld1_u8 (svptrue_b8 (), data + 32 );
961961}
962962
963963/* *
@@ -969,11 +969,9 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
969969) {
970970 // Set index byte above max index for unused bytes so table lookup returns zero
971971 svint32_8_t idx_masked = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
972-
973972 svuint8_8_t idx_bytes = svreinterpret_u8_s32 (idx_masked);
974- svuint8_8_t tbl_bytes = svreinterpret_u8_s32 (tbl.t0 .m );
975- svuint8_8_t result = svtbl_u8 (tbl_bytes, idx_bytes);
976973
974+ svuint8_8_t result = svtbl_u8 (tbl.t0 , idx_bytes);
977975 return vint8 (svreinterpret_s32_u8 (result));
978976}
979977
@@ -986,40 +984,32 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
986984) {
987985 // Set index byte above max index for unused bytes so table lookup returns zero
988986 svint32_8_t idx_masked = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
989-
990987 svuint8_8_t idx_bytes = svreinterpret_u8_s32 (idx_masked);
991- svuint8_8_t tbl_bytes = svreinterpret_u8_s32 (tbl.t0 .m );
992- svuint8_8_t result = svtbl_u8 (tbl_bytes, idx_bytes);
993988
989+ svuint8_8_t result = svtbl_u8 (tbl.t0 , idx_bytes);
994990 return vint8 (svreinterpret_s32_u8 (result));
995991}
996992
997993/* *
998994 * @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
995+ *
996+ * Future: SVE2 can directly do svtbl2_u8() for a two register table.
999997 */
1000998ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit (
1001999 const vtable8_64x8& tbl,
10021000 vint8 idx
10031001) {
10041002 // Set index byte above max index for unused bytes so table lookup returns zero
1005- svint32_8_t literal32 = svdup_s32 (32 );
1006- svbool_8_t idx_lo_select = svcmplt (svptrue_b32 (), idx.m , literal32);
1007- svint32_8_t idx_lo_masked = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
1008- svint32_8_t idx_hi_masked = svorr_s32_x (svptrue_b32 (), idx.m - literal32, svdup_s32 (0xFFFFFF00 ));
1003+ svint32_8_t idxm = svorr_s32_x (svptrue_b32 (), idx.m , svdup_s32 (0xFFFFFF00 ));
10091004
1010- svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32 (idx_lo_masked );
1011- svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32 (idx_hi_masked );
1005+ svuint8_8_t idxm8 = svreinterpret_u8_s32 (idxm );
1006+ svuint8_8_t t0_lookup = svtbl_u8 (tbl. t0 , idxm8 );
10121007
1013- svuint8_8_t tbl0_bytes = svreinterpret_u8_s32 (tbl. t0 . m );
1014- svuint8_8_t tbl1_bytes = svreinterpret_u8_s32 (tbl.t1 . m );
1008+ idxm8 = svsub_u8_x ( svptrue_b8 (), idxm8, svdup_u8 ( 32 ) );
1009+ svuint8_8_t t1_lookup = svtbl_u8 (tbl.t1 , idxm8 );
10151010
1016- svint32_8_t t0_lookup = svreinterpret_s32_u8 (svtbl_u8 (tbl0_bytes, idx_lo_bytes));
1017- svint32_8_t t1_lookup = svreinterpret_s32_u8 (svtbl_u8 (tbl1_bytes, idx_hi_bytes));
1018-
1019- svint32_8_t result = svsel_s32 (idx_lo_select, t0_lookup, t1_lookup);
1020-
1021- // Future: SVE2 can directly do svtbl2_u8() for a two register table
1022- return vint8 (result);
1011+ svuint8_8_t result = svorr_u8_x (svptrue_b32 (), t0_lookup, t1_lookup);
1012+ return vint8 (svreinterpret_s32_u8 (result));
10231013}
10241014
10251015/* *
0 commit comments