Skip to content

Commit f94cbf0

Browse files
jirkamarsik authored and ansalond committed
[GR-68894] Backport to 25.0: GraalWasm performance regressions due to unsupported Vector API expansions.
PullRequest: graal/21922
2 parents b00e6bf + 046ee7c commit f94cbf0

File tree

1 file changed

+34
-26
lines changed

1 file changed

+34
-26
lines changed

wasm/src/org.graalvm.wasm.jdk25/src/org/graalvm/wasm/api/Vector128OpsVectorAPI.java

Lines changed: 34 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -323,7 +323,7 @@ public ByteVector unary(ByteVector xVec, int vectorOpcode) {
323323
case Bytecode.VECTOR_V128_NOT -> unop(x, I8X16, VectorOperators.NOT);
324324
case Bytecode.VECTOR_I8X16_ABS -> unop(x, I8X16, VectorOperators.ABS);
325325
case Bytecode.VECTOR_I8X16_NEG -> unop(x, I8X16, VectorOperators.NEG);
326-
case Bytecode.VECTOR_I8X16_POPCNT -> unop(x, I8X16, VectorOperators.BIT_COUNT);
326+
case Bytecode.VECTOR_I8X16_POPCNT -> i8x16_popcnt(x); // GR-68892
327327
case Bytecode.VECTOR_I16X8_EXTADD_PAIRWISE_I8X16_S -> extadd_pairwise(x, I8X16, VectorOperators.B2S);
328328
case Bytecode.VECTOR_I16X8_EXTADD_PAIRWISE_I8X16_U -> extadd_pairwise(x, I8X16, VectorOperators.ZERO_EXTEND_B2S);
329329
case Bytecode.VECTOR_I16X8_EXTEND_LOW_I8X16_S -> extend(x, 0, I8X16, VectorOperators.B2S);
@@ -366,16 +366,16 @@ public ByteVector unary(ByteVector xVec, int vectorOpcode) {
366366
case Bytecode.VECTOR_F64X2_TRUNC -> trunc(x, F64X2, I64X2, VectorOperators.REINTERPRET_D2L, VectorOperators.REINTERPRET_L2D,
367367
Vector128OpsVectorAPI::getExponentDoubles, DOUBLE_SIGNIFICAND_WIDTH, I64X2.broadcast(DOUBLE_SIGNIF_BIT_MASK));
368368
case Bytecode.VECTOR_F64X2_NEAREST -> nearest(x, F64X2, 1L << (DOUBLE_SIGNIFICAND_WIDTH - 1));
369-
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F32X4_S, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F32X4_S -> I8X16.species().fromArray(fallbackOps.unary(x.toArray(), vectorOpcode), 0); // GR-51421
370-
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F32X4_U, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F32X4_U -> I8X16.species().fromArray(fallbackOps.unary(x.toArray(), vectorOpcode), 0); // GR-51421
369+
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F32X4_S, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F32X4_S -> fromArray(fallbackOps.unary(x.toArray(), vectorOpcode)); // GR-51421
370+
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F32X4_U, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F32X4_U -> fromArray(fallbackOps.unary(x.toArray(), vectorOpcode)); // GR-51421
371371
case Bytecode.VECTOR_F32X4_CONVERT_I32X4_S -> convert(x, I32X4, VectorOperators.I2F);
372-
case Bytecode.VECTOR_F32X4_CONVERT_I32X4_U -> f32x4_convert_i32x4_u(x);
373-
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F64X2_S_ZERO, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F64X2_S_ZERO -> I8X16.species().fromArray(fallbackOps.unary(x.toArray(), vectorOpcode), 0); // GR-51421
374-
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F64X2_U_ZERO, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F64X2_U_ZERO -> I8X16.species().fromArray(fallbackOps.unary(x.toArray(), vectorOpcode), 0); // GR-51421
372+
case Bytecode.VECTOR_F32X4_CONVERT_I32X4_U -> fromArray(fallbackOps.unary(x.toArray(), vectorOpcode)); // GR-68843
373+
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F64X2_S_ZERO, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F64X2_S_ZERO -> fromArray(fallbackOps.unary(x.toArray(), vectorOpcode)); // GR-51421
374+
case Bytecode.VECTOR_I32X4_TRUNC_SAT_F64X2_U_ZERO, Bytecode.VECTOR_I32X4_RELAXED_TRUNC_F64X2_U_ZERO -> fromArray(fallbackOps.unary(x.toArray(), vectorOpcode)); // GR-51421
375375
case Bytecode.VECTOR_F64X2_CONVERT_LOW_I32X4_S -> convert(x, I32X4, VectorOperators.I2D);
376376
case Bytecode.VECTOR_F64X2_CONVERT_LOW_I32X4_U -> f64x2_convert_low_i32x4_u(x);
377-
case Bytecode.VECTOR_F32X4_DEMOTE_F64X2_ZERO -> f32X4_demote_f64X2_zero(x);
378-
case Bytecode.VECTOR_F64X2_PROMOTE_LOW_F32X4 -> convert(x, F32X4, VectorOperators.F2D);
377+
case Bytecode.VECTOR_F32X4_DEMOTE_F64X2_ZERO -> fromArray(fallbackOps.unary(x.toArray(), vectorOpcode)); // GR-68843
378+
case Bytecode.VECTOR_F64X2_PROMOTE_LOW_F32X4 -> fromArray(fallbackOps.unary(x.toArray(), vectorOpcode)); // GR-68843
379379
default -> throw CompilerDirectives.shouldNotReachHere();
380380
});
381381
}
@@ -441,30 +441,30 @@ public ByteVector binary(ByteVector xVec, ByteVector yVec, int vectorOpcode) {
441441
case Bytecode.VECTOR_I8X16_NARROW_I16X8_S -> narrow(x, y, I16X8, I8X16, Byte.MIN_VALUE, Byte.MAX_VALUE);
442442
case Bytecode.VECTOR_I8X16_NARROW_I16X8_U -> narrow(x, y, I16X8, I8X16, (short) 0, (short) 0xff);
443443
case Bytecode.VECTOR_I8X16_ADD -> binop(x, y, I8X16, VectorOperators.ADD);
444-
case Bytecode.VECTOR_I8X16_ADD_SAT_S -> binop(x, y, I8X16, VectorOperators.SADD);
445-
case Bytecode.VECTOR_I8X16_ADD_SAT_U -> binop_sat_u(x, y, I8X16, I16X8, VectorOperators.ZERO_EXTEND_B2S, VectorOperators.ADD, 0, 0xff);
444+
case Bytecode.VECTOR_I8X16_ADD_SAT_S -> binop_sat(x, y, I8X16, I16X8, VectorOperators.B2S, VectorOperators.ADD, Byte.MIN_VALUE, Byte.MAX_VALUE); // GR-68891
445+
case Bytecode.VECTOR_I8X16_ADD_SAT_U -> binop_sat(x, y, I8X16, I16X8, VectorOperators.ZERO_EXTEND_B2S, VectorOperators.ADD, 0, 0xff); // GR-68891
446446
case Bytecode.VECTOR_I8X16_SUB -> binop(x, y, I8X16, VectorOperators.SUB);
447-
case Bytecode.VECTOR_I8X16_SUB_SAT_S -> binop(x, y, I8X16, VectorOperators.SSUB);
448-
case Bytecode.VECTOR_I8X16_SUB_SAT_U -> binop_sat_u(x, y, I8X16, I16X8, VectorOperators.ZERO_EXTEND_B2S, VectorOperators.SUB, 0, 0xff);
447+
case Bytecode.VECTOR_I8X16_SUB_SAT_S -> binop_sat(x, y, I8X16, I16X8, VectorOperators.B2S, VectorOperators.SUB, Byte.MIN_VALUE, Byte.MAX_VALUE); // GR-68891
448+
case Bytecode.VECTOR_I8X16_SUB_SAT_U -> binop_sat(x, y, I8X16, I16X8, VectorOperators.ZERO_EXTEND_B2S, VectorOperators.SUB, 0, 0xff); // GR-68891
449449
case Bytecode.VECTOR_I8X16_MIN_S -> binop(x, y, I8X16, VectorOperators.MIN);
450-
case Bytecode.VECTOR_I8X16_MIN_U -> binop(x, y, I8X16, VectorOperators.UMIN);
450+
case Bytecode.VECTOR_I8X16_MIN_U -> fromArray(fallbackOps.binary(x.toArray(), y.toArray(), vectorOpcode)); // GR-68891
451451
case Bytecode.VECTOR_I8X16_MAX_S -> binop(x, y, I8X16, VectorOperators.MAX);
452-
case Bytecode.VECTOR_I8X16_MAX_U -> binop(x, y, I8X16, VectorOperators.UMAX);
452+
case Bytecode.VECTOR_I8X16_MAX_U -> fromArray(fallbackOps.binary(x.toArray(), y.toArray(), vectorOpcode)); // GR-68891
453453
case Bytecode.VECTOR_I8X16_AVGR_U -> avgr_u(x, y, I8X16, I16X8, VectorOperators.ZERO_EXTEND_B2S);
454454
case Bytecode.VECTOR_I16X8_NARROW_I32X4_S -> narrow(x, y, I32X4, I16X8, Short.MIN_VALUE, Short.MAX_VALUE);
455455
case Bytecode.VECTOR_I16X8_NARROW_I32X4_U -> narrow(x, y, I32X4, I16X8, 0, 0xffff);
456456
case Bytecode.VECTOR_I16X8_Q15MULR_SAT_S, Bytecode.VECTOR_I16X8_RELAXED_Q15MULR_S -> i16x8_q15mulr_sat_s(x, y);
457457
case Bytecode.VECTOR_I16X8_ADD -> binop(x, y, I16X8, VectorOperators.ADD);
458-
case Bytecode.VECTOR_I16X8_ADD_SAT_S -> binop(x, y, I16X8, VectorOperators.SADD);
459-
case Bytecode.VECTOR_I16X8_ADD_SAT_U -> binop_sat_u(x, y, I16X8, I32X4, VectorOperators.ZERO_EXTEND_S2I, VectorOperators.ADD, 0, 0xffff);
458+
case Bytecode.VECTOR_I16X8_ADD_SAT_S -> binop_sat(x, y, I16X8, I32X4, VectorOperators.S2I, VectorOperators.ADD, Short.MIN_VALUE, Short.MAX_VALUE); // GR-68891
459+
case Bytecode.VECTOR_I16X8_ADD_SAT_U -> binop_sat(x, y, I16X8, I32X4, VectorOperators.ZERO_EXTEND_S2I, VectorOperators.ADD, 0, 0xffff); // GR-68891
460460
case Bytecode.VECTOR_I16X8_SUB -> binop(x, y, I16X8, VectorOperators.SUB);
461-
case Bytecode.VECTOR_I16X8_SUB_SAT_S -> binop(x, y, I16X8, VectorOperators.SSUB);
462-
case Bytecode.VECTOR_I16X8_SUB_SAT_U -> binop_sat_u(x, y, I16X8, I32X4, VectorOperators.ZERO_EXTEND_S2I, VectorOperators.SUB, 0, 0xffff);
461+
case Bytecode.VECTOR_I16X8_SUB_SAT_S -> binop_sat(x, y, I16X8, I32X4, VectorOperators.S2I, VectorOperators.SUB, Short.MIN_VALUE, Short.MAX_VALUE); // GR-68891
462+
case Bytecode.VECTOR_I16X8_SUB_SAT_U -> binop_sat(x, y, I16X8, I32X4, VectorOperators.ZERO_EXTEND_S2I, VectorOperators.SUB, 0, 0xffff); // GR-68891
463463
case Bytecode.VECTOR_I16X8_MUL -> binop(x, y, I16X8, VectorOperators.MUL);
464464
case Bytecode.VECTOR_I16X8_MIN_S -> binop(x, y, I16X8, VectorOperators.MIN);
465-
case Bytecode.VECTOR_I16X8_MIN_U -> binop(x, y, I16X8, VectorOperators.UMIN);
465+
case Bytecode.VECTOR_I16X8_MIN_U -> fromArray(fallbackOps.binary(x.toArray(), y.toArray(), vectorOpcode)); // GR-68891
466466
case Bytecode.VECTOR_I16X8_MAX_S -> binop(x, y, I16X8, VectorOperators.MAX);
467-
case Bytecode.VECTOR_I16X8_MAX_U -> binop(x, y, I16X8, VectorOperators.UMAX);
467+
case Bytecode.VECTOR_I16X8_MAX_U -> fromArray(fallbackOps.binary(x.toArray(), y.toArray(), vectorOpcode)); // GR-68891
468468
case Bytecode.VECTOR_I16X8_AVGR_U -> avgr_u(x, y, I16X8, I32X4, VectorOperators.ZERO_EXTEND_S2I);
469469
case Bytecode.VECTOR_I16X8_EXTMUL_LOW_I8X16_S -> extmul(x, y, I8X16, VectorOperators.B2S, 0);
470470
case Bytecode.VECTOR_I16X8_EXTMUL_LOW_I8X16_U -> extmul(x, y, I8X16, VectorOperators.ZERO_EXTEND_B2S, 0);
@@ -474,9 +474,9 @@ public ByteVector binary(ByteVector xVec, ByteVector yVec, int vectorOpcode) {
474474
case Bytecode.VECTOR_I32X4_SUB -> binop(x, y, I32X4, VectorOperators.SUB);
475475
case Bytecode.VECTOR_I32X4_MUL -> binop(x, y, I32X4, VectorOperators.MUL);
476476
case Bytecode.VECTOR_I32X4_MIN_S -> binop(x, y, I32X4, VectorOperators.MIN);
477-
case Bytecode.VECTOR_I32X4_MIN_U -> binop(x, y, I32X4, VectorOperators.UMIN);
477+
case Bytecode.VECTOR_I32X4_MIN_U -> fromArray(fallbackOps.binary(x.toArray(), y.toArray(), vectorOpcode)); // GR-68891
478478
case Bytecode.VECTOR_I32X4_MAX_S -> binop(x, y, I32X4, VectorOperators.MAX);
479-
case Bytecode.VECTOR_I32X4_MAX_U -> binop(x, y, I32X4, VectorOperators.UMAX);
479+
case Bytecode.VECTOR_I32X4_MAX_U -> fromArray(fallbackOps.binary(x.toArray(), y.toArray(), vectorOpcode)); // GR-68891
480480
case Bytecode.VECTOR_I32X4_DOT_I16X8_S -> i32x4_dot_i16x8_s(x, y);
481481
case Bytecode.VECTOR_I32X4_EXTMUL_LOW_I16X8_S -> extmul(x, y, I16X8, VectorOperators.S2I, 0);
482482
case Bytecode.VECTOR_I32X4_EXTMUL_LOW_I16X8_U -> extmul(x, y, I16X8, VectorOperators.ZERO_EXTEND_S2I, 0);
@@ -537,7 +537,7 @@ public int vectorToInt(ByteVector xVec, int vectorOpcode) {
537537
case Bytecode.VECTOR_I16X8_BITMASK -> bitmask(x, I16X8);
538538
case Bytecode.VECTOR_I32X4_ALL_TRUE -> all_true(x, I32X4);
539539
case Bytecode.VECTOR_I32X4_BITMASK -> bitmask(x, I32X4);
540-
case Bytecode.VECTOR_I64X2_ALL_TRUE -> all_true(x, I64X2);
540+
case Bytecode.VECTOR_I64X2_ALL_TRUE -> fallbackOps.vectorToInt(x.toArray(), vectorOpcode); // GR-68893
541541
case Bytecode.VECTOR_I64X2_BITMASK -> bitmask(x, I64X2);
542542
default -> throw CompilerDirectives.shouldNotReachHere();
543543
};
@@ -747,6 +747,13 @@ private static <E> ByteVector unop(ByteVector xBytes, Shape<E> shape, VectorOper
747747
return result.reinterpretAsBytes();
748748
}
749749

750+
private static ByteVector i8x16_popcnt(ByteVector x) {
751+
// Based on the same approach as Integer#bitCount
752+
ByteVector popcnt = x.sub(x.lanewise(VectorOperators.LSHR, 1).and((byte) 0x55));
753+
popcnt = popcnt.and((byte) 0x33).add(popcnt.lanewise(VectorOperators.LSHR, 2).and((byte) 0x33));
754+
return popcnt.add(popcnt.lanewise(VectorOperators.LSHR, 4)).and((byte) 0x0F);
755+
}
756+
750757
private static <E, F> ByteVector extadd_pairwise(ByteVector xBytes, Shape<E> shape, VectorOperators.Conversion<E, F> conv) {
751758
Vector<E> x = shape.reinterpret(xBytes);
752759
Vector<F> evens = x.compress(shape.evensMask).convert(conv, 0);
@@ -889,6 +896,7 @@ private static ByteVector i32x4_trunc_sat_f32x4_u(ByteVector xBytes) {
889896
return result.reinterpretAsBytes();
890897
}
891898

899+
@SuppressWarnings("unused")
892900
private static ByteVector f32x4_convert_i32x4_u(ByteVector xBytes) {
893901
IntVector x = xBytes.reinterpretAsInts();
894902
LongVector xUnsignedLow = castLong128(x.convert(VectorOperators.ZERO_EXTEND_I2L, 0));
@@ -915,6 +923,7 @@ private static ByteVector f64x2_convert_low_i32x4_u(ByteVector xBytes) {
915923
return result.reinterpretAsBytes();
916924
}
917925

926+
@SuppressWarnings("unused")
918927
private static ByteVector f32X4_demote_f64X2_zero(ByteVector xBytes) {
919928
DoubleVector x = F64X2.reinterpret(xBytes);
920929
Vector<Float> result = compactGeneral(x, 0, I64X2, F32X4, VectorOperators.D2F, VectorOperators.REINTERPRET_F2I, VectorOperators.ZERO_EXTEND_I2L);
@@ -1018,7 +1027,7 @@ private static <E, F> ByteVector narrow(ByteVector xBytes, ByteVector yBytes, Sh
10181027
return result.reinterpretAsBytes();
10191028
}
10201029

1021-
private static <E, F> ByteVector binop_sat_u(ByteVector xBytes, ByteVector yBytes,
1030+
private static <E, F> ByteVector binop_sat(ByteVector xBytes, ByteVector yBytes,
10221031
Shape<E> shape, Shape<F> extendedShape,
10231032
VectorOperators.Conversion<E, F> upcast,
10241033
VectorOperators.Binary op, long min, long max) {
@@ -1033,8 +1042,7 @@ private static <E, F> ByteVector avgr_u(ByteVector xBytes, ByteVector yBytes,
10331042
Shape<E> shape, Shape<F> extendedShape,
10341043
VectorOperators.Conversion<E, F> upcast) {
10351044
Vector<F> one = extendedShape.broadcast(1);
1036-
Vector<F> two = extendedShape.broadcast(2);
1037-
return upcastBinopDowncast(xBytes, yBytes, shape, extendedShape, upcast, (x, y) -> x.add(y).add(one).div(two));
1045+
return upcastBinopDowncast(xBytes, yBytes, shape, extendedShape, upcast, (x, y) -> x.add(y).add(one).lanewise(VectorOperators.LSHR, 1));
10381046
}
10391047

10401048
private static ByteVector i16x8_q15mulr_sat_s(ByteVector xBytes, ByteVector yBytes) {

0 commit comments

Comments
 (0)