Skip to content

Commit

Permalink
[simd/jit]: Implement i8x16 comparison instructions (#82 from haoyu-z…
Browse files Browse the repository at this point in the history
…c/jit-i8x16-cmp)
  • Loading branch information
titzer authored Jul 21, 2023
2 parents 1d151df + 385d1e8 commit 301e52a
Show file tree
Hide file tree
Showing 5 changed files with 375 additions and 414 deletions.
93 changes: 26 additions & 67 deletions src/engine/x86-64/X86_64Interpreter.v3
Original file line number Diff line number Diff line change
Expand Up @@ -2455,6 +2455,10 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
genSimdBinop(Opcode.I16X8_EQ, asm.pcmpeqw_s_s);
genSimdBinop(Opcode.F32X4_EQ, asm.cmpeqps_s_s);
genSimdBinop(Opcode.F64X2_EQ, asm.cmpeqpd_s_s);
genSimdBinop(Opcode.I8X16_NE, masm.emit_i8x16_ne);
genSimdBinop(Opcode.I16X8_NE, masm.emit_i16x8_ne);
genSimdBinop(Opcode.I32X4_NE, masm.emit_i32x4_ne);
genSimdBinop(Opcode.I64X2_NE, masm.emit_i64x2_ne);
genSimdBinop(Opcode.F32X4_NE, asm.cmpneqps_s_s);
genSimdBinop(Opcode.F64X2_NE, asm.cmpneqpd_s_s);

Expand Down Expand Up @@ -2495,21 +2499,6 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
decrementVsp();
endHandler();
}
for (t in [
(Opcode.I8X16_NE, asm.pcmpeqb_s_s),
(Opcode.I16X8_NE, asm.pcmpeqw_s_s),
(Opcode.I32X4_NE, asm.pcmpeqd_s_s),
(Opcode.I64X2_NE, asm.pcmpeqq_s_s)
]) {
bindHandler(t.0);
load_v128_xmm0_xmm1();
t.1(r_xmm0, r_xmm1);
t.1(r_xmm1, r_xmm1);
asm.xorps_s_s(r_xmm0, r_xmm1);
asm.movdqu_m_s(vsph[-2].value, r_xmm0);
decrementVsp();
endHandler();
}

genSimdBinop(Opcode.I8X16_GT_S, asm.pcmpgtb_s_s);
genSimdBinop(Opcode.I16X8_GT_S, asm.pcmpgtw_s_s);
Expand All @@ -2535,59 +2524,29 @@ class X86_64InterpreterGen(ic: X86_64InterpreterCode, w: DataWriter) {
genSimdBinopCommute(Opcode.F32X4_GE, asm.cmpleps_s_s);
genSimdBinopCommute(Opcode.F64X2_GE, asm.cmplepd_s_s);

for (t in [
(Opcode.I8X16_GE_S, asm.pminsb_s_s, asm.pcmpeqb_s_s, load_v128_xmm0_xmm1),
(Opcode.I16X8_GE_S, asm.pminsw_s_s, asm.pcmpeqw_s_s, load_v128_xmm0_xmm1),
(Opcode.I32X4_GE_S, asm.pminsd_s_s, asm.pcmpeqd_s_s, load_v128_xmm0_xmm1),
(Opcode.I8X16_GE_U, asm.pminub_s_s, asm.pcmpeqb_s_s, load_v128_xmm0_xmm1),
(Opcode.I16X8_GE_U, asm.pminuw_s_s, asm.pcmpeqw_s_s, load_v128_xmm0_xmm1),
(Opcode.I32X4_GE_U, asm.pminud_s_s, asm.pcmpeqd_s_s, load_v128_xmm0_xmm1),

(Opcode.I8X16_LE_S, asm.pminsb_s_s, asm.pcmpeqb_s_s, load_v128_xmm1_xmm0),
(Opcode.I16X8_LE_S, asm.pminsw_s_s, asm.pcmpeqw_s_s, load_v128_xmm1_xmm0),
(Opcode.I32X4_LE_S, asm.pminsd_s_s, asm.pcmpeqd_s_s, load_v128_xmm1_xmm0),
(Opcode.I8X16_LE_U, asm.pminub_s_s, asm.pcmpeqb_s_s, load_v128_xmm1_xmm0),
(Opcode.I16X8_LE_U, asm.pminuw_s_s, asm.pcmpeqw_s_s, load_v128_xmm1_xmm0),
(Opcode.I32X4_LE_U, asm.pminud_s_s, asm.pcmpeqd_s_s, load_v128_xmm1_xmm0)
]) {
bindHandler(t.0);
t.3();
t.1(r_xmm0, r_xmm1);
t.2(r_xmm0, r_xmm1);
asm.movdqu_m_s(vsph[-2].value, r_xmm0);
decrementVsp();
endHandler();
}
for (t in [
(Opcode.I64X2_GE_S, load_v128_xmm0_xmm1),
(Opcode.I64X2_LE_S, load_v128_xmm1_xmm0)
]) {
bindHandler(t.0);
t.1();
masm.emit_i64x2_ge_s(r_xmm0, r_xmm1, r_xmm2);
asm.movdqu_m_s(vsph[-2].value, r_xmm1); // The result is in r_xmm1
decrementVsp();
endHandler();
}
for (t in [
(Opcode.I8X16_GT_U, asm.pmaxub_s_s, asm.pcmpeqb_s_s, load_v128_xmm0_xmm1),
(Opcode.I16X8_GT_U, asm.pmaxuw_s_s, asm.pcmpeqw_s_s, load_v128_xmm0_xmm1),
(Opcode.I32X4_GT_U, asm.pmaxud_s_s, asm.pcmpeqd_s_s, load_v128_xmm0_xmm1),
genSimdBinop(Opcode.I8X16_GE_S, masm.emit_i8x16_ge_s);
genSimdBinop(Opcode.I16X8_GE_S, masm.emit_i16x8_ge_s);
genSimdBinop(Opcode.I32X4_GE_S, masm.emit_i32x4_ge_s);
genSimdBinop(Opcode.I64X2_GE_S, masm.emit_i64x2_ge_s(_, _, r_xmm2));
genSimdBinop(Opcode.I8X16_GE_U, masm.emit_i8x16_ge_u);
genSimdBinop(Opcode.I16X8_GE_U, masm.emit_i16x8_ge_u);
genSimdBinop(Opcode.I32X4_GE_U, masm.emit_i32x4_ge_u);

genSimdBinopCommute(Opcode.I8X16_LE_S, masm.emit_i8x16_ge_s);
genSimdBinopCommute(Opcode.I16X8_LE_S, masm.emit_i16x8_ge_s);
genSimdBinopCommute(Opcode.I32X4_LE_S, masm.emit_i32x4_ge_s);
genSimdBinopCommute(Opcode.I64X2_LE_S, masm.emit_i64x2_ge_s(_, _, r_xmm2));
genSimdBinopCommute(Opcode.I8X16_LE_U, masm.emit_i8x16_ge_u);
genSimdBinopCommute(Opcode.I16X8_LE_U, masm.emit_i16x8_ge_u);
genSimdBinopCommute(Opcode.I32X4_LE_U, masm.emit_i32x4_ge_u);

genSimdBinop(Opcode.I8X16_GT_U, masm.emit_i8x16_gt_u(_, _, r_xmm2));
genSimdBinop(Opcode.I16X8_GT_U, masm.emit_i16x8_gt_u(_, _, r_xmm2));
genSimdBinop(Opcode.I32X4_GT_U, masm.emit_i32x4_gt_u(_, _, r_xmm2));
genSimdBinopCommute(Opcode.I8X16_LT_U, masm.emit_i8x16_gt_u(_, _, r_xmm2));
genSimdBinopCommute(Opcode.I16X8_LT_U, masm.emit_i16x8_gt_u(_, _, r_xmm2));
genSimdBinopCommute(Opcode.I32X4_LT_U, masm.emit_i32x4_gt_u(_, _, r_xmm2));

(Opcode.I8X16_LT_U, asm.pmaxub_s_s, asm.pcmpeqb_s_s, load_v128_xmm1_xmm0),
(Opcode.I16X8_LT_U, asm.pmaxuw_s_s, asm.pcmpeqw_s_s, load_v128_xmm1_xmm0),
(Opcode.I32X4_LT_U, asm.pmaxud_s_s, asm.pcmpeqd_s_s, load_v128_xmm1_xmm0)
]) {
bindHandler(t.0);
t.3();
t.1(r_xmm0, r_xmm1);
t.2(r_xmm0, r_xmm1);
t.2(r_xmm2, r_xmm2);
asm.xorps_s_s(r_xmm0, r_xmm2);
asm.movdqu_m_s(vsph[-2].value, r_xmm0);
decrementVsp();
endHandler();
}
for (t in [
(Opcode.I8X16_NEG, masm.emit_i8x16_neg),
(Opcode.I16X8_NEG, masm.emit_i16x8_neg),
Expand Down
63 changes: 60 additions & 3 deletions src/engine/x86-64/X86_64MacroAssembler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -868,10 +868,67 @@ class X86_64MacroAssembler extends MacroAssembler {
// 5. Add 3 and 4.
asm.paddq_s_s(lhs, tmp2);
}
def emit_i64x2_ge_s(s0: X86_64Xmmr, s1: X86_64Xmmr, scratch: X86_64Xmmr) {
asm.pcmpgtq_s_s(s1, s0);
// Emits a lane-wise "not equal" comparison: dst = (dst != src).
// {f} is the packed-equality emitter that selects the lane width; the callers
// in this class pass asm.pcmpeq{b,w,d,q}_s_s.
// NOTE: {src} is overwritten; it is compared with itself to build the mask
// used to invert the equality result.
def emit_v128_ne<T>(dst: X86_64Xmmr, src: X86_64Xmmr, f: (X86_64Xmmr, X86_64Xmmr) -> T) {
// dst = per-lane mask of (dst == src)
f(dst, src);
// src = all ones when {f} is an equality compare (every lane equals itself)
f(src, src);
// flip every bit of the equality mask, yielding the "not equal" mask
asm.pxor_s_s(dst, src);
}
// i8x16.ne: dst = (dst != src) per 8-bit lane. Overwrites {src} (see emit_v128_ne).
def emit_i8x16_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ne(dst, src, asm.pcmpeqb_s_s);
}
// i16x8.ne: dst = (dst != src) per 16-bit lane. Overwrites {src} (see emit_v128_ne).
def emit_i16x8_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ne(dst, src, asm.pcmpeqw_s_s);
}
// i32x4.ne: dst = (dst != src) per 32-bit lane. Overwrites {src} (see emit_v128_ne).
def emit_i32x4_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ne(dst, src, asm.pcmpeqd_s_s);
}
// i64x2.ne: dst = (dst != src) per 64-bit lane. Overwrites {src} (see emit_v128_ne).
def emit_i64x2_ne(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ne(dst, src, asm.pcmpeqq_s_s);
}
// Emits a lane-wise unsigned "greater than": dst = (dst > src), computed as
// NOT(max(dst, src) == src).
// {pmax} is the unsigned packed-max emitter and {pcmp} the packed-equality
// emitter for the lane width; the callers in this class pass
// asm.pmaxu{b,w,d}_s_s and asm.pcmpeq{b,w,d}_s_s.
// {scratch} is overwritten; {src} is only read.
def emit_v128_gt_u<T>(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr,
pmax: (X86_64Xmmr, X86_64Xmmr) -> T, pcmp: (X86_64Xmmr, X86_64Xmmr) -> T) {
// dst = max(dst, src)
pmax(dst, src);
// dst = mask of lanes where max == src, i.e. where the original dst <= src
pcmp(dst, src);
// scratch = all ones (every lane equals itself)
pcmp(scratch, scratch);
// invert the mask: lanes where the original dst > src
asm.xorps_s_s(dst, scratch);
}
// i8x16.gt_u: dst = (dst > src), unsigned, per 8-bit lane. Overwrites {scratch}.
def emit_i8x16_gt_u(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
emit_v128_gt_u(dst, src, scratch, asm.pmaxub_s_s, asm.pcmpeqb_s_s);
}
// i16x8.gt_u: dst = (dst > src), unsigned, per 16-bit lane. Overwrites {scratch}.
def emit_i16x8_gt_u(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
emit_v128_gt_u(dst, src, scratch, asm.pmaxuw_s_s, asm.pcmpeqw_s_s);
}
// i32x4.gt_u: dst = (dst > src), unsigned, per 32-bit lane. Overwrites {scratch}.
def emit_i32x4_gt_u(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
emit_v128_gt_u(dst, src, scratch, asm.pmaxud_s_s, asm.pcmpeqd_s_s);
}
// Emits a lane-wise "greater than or equal": dst = (dst >= src), computed as
// (min(dst, src) == src).
// {pmin} is the packed-min emitter, whose signedness decides whether the
// comparison is signed or unsigned, and {pcmp} is the packed-equality emitter
// for the same lane width. {src} is only read.
def emit_v128_ge<T>(dst: X86_64Xmmr, src: X86_64Xmmr,
pmin: (X86_64Xmmr, X86_64Xmmr) -> T, pcmp: (X86_64Xmmr, X86_64Xmmr) -> T) {
// dst = min(dst, src)
pmin(dst, src);
// dst = mask of lanes where min == src, i.e. where the original dst >= src
pcmp(dst, src);
}
// i8x16.ge_s: dst = (dst >= src), signed, per 8-bit lane.
def emit_i8x16_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ge(dst, src, asm.pminsb_s_s, asm.pcmpeqb_s_s);
}
// i16x8.ge_s: dst = (dst >= src), signed, per 16-bit lane.
def emit_i16x8_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ge(dst, src, asm.pminsw_s_s, asm.pcmpeqw_s_s);
}
// i32x4.ge_s: dst = (dst >= src), signed, per 32-bit lane.
def emit_i32x4_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ge(dst, src, asm.pminsd_s_s, asm.pcmpeqd_s_s);
}
// i8x16.ge_u: dst = (dst >= src), unsigned, per 8-bit lane.
def emit_i8x16_ge_u(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ge(dst, src, asm.pminub_s_s, asm.pcmpeqb_s_s);
}
// i16x8.ge_u: dst = (dst >= src), unsigned, per 16-bit lane.
def emit_i16x8_ge_u(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ge(dst, src, asm.pminuw_s_s, asm.pcmpeqw_s_s);
}
// i32x4.ge_u: dst = (dst >= src), unsigned, per 32-bit lane.
def emit_i32x4_ge_u(dst: X86_64Xmmr, src: X86_64Xmmr) {
emit_v128_ge(dst, src, asm.pminud_s_s, asm.pcmpeqd_s_s);
}
// i64x2.ge_s: dst = (dst >= src), signed, per 64-bit lane, computed as
// NOT(src > dst) and then moved into {dst}.
// Both {src} and {scratch} are overwritten.
def emit_i64x2_ge_s(dst: X86_64Xmmr, src: X86_64Xmmr, scratch: X86_64Xmmr) {
	// src = per-lane mask of (src > dst), signed
	asm.pcmpgtq_s_s(src, dst);
	// scratch = all ones (every lane equals itself)
	asm.pcmpeqd_s_s(scratch, scratch);
	// invert the mask so src = (dst >= src); assumes emit_v128_xorps
	// emits a bitwise xor of its second operand into its first
	emit_v128_xorps(src, scratch);
	// deliver the result in dst, like the other comparison emitters
	asm.movaps_s_s(dst, src);
}
def emit_i8x16_shl(dst: X86_64Xmmr, shift: X86_64Gpr, tmp1: X86_64Gpr, tmp2: X86_64Xmmr, tmp3: X86_64Xmmr) {
// Take shift value modulo 8.
Expand Down
18 changes: 18 additions & 0 deletions src/engine/x86-64/X86_64SinglePassCompiler.v3
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,16 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
// i8x16 arithmetic: each visitor hands a V128 binary-op emitter to do_op2_x_x.
def visit_I8X16_ADD() { do_op2_x_x(ValueKind.V128, asm.paddb_s_s); }
def visit_I8X16_SUB() { do_op2_x_x(ValueKind.V128, asm.psubb_s_s); }
def visit_I8X16_NEG() { visit_V128_I_NEG(mmasm.emit_i8x16_neg); }
// i8x16 comparisons. EQ and GT_S use a single instruction emitter directly;
// NE, GT_U and GE_* use the macro-assembler helpers. The GT_U helper needs a
// scratch register, which is bound from allocTmp(ValueKind.V128).
// LT_* and LE_* reuse the GT_* / GE_* emitters through do_c_op2_x_x, the
// commuted variant of do_op2_x_x.
def visit_I8X16_EQ() { do_op2_x_x(ValueKind.V128, asm.pcmpeqb_s_s); }
def visit_I8X16_NE() { do_op2_x_x(ValueKind.V128, mmasm.emit_i8x16_ne); }
def visit_I8X16_GT_S() { do_op2_x_x(ValueKind.V128, asm.pcmpgtb_s_s); }
def visit_I8X16_GT_U() { do_op2_x_x(ValueKind.V128, mmasm.emit_i8x16_gt_u(_, _, X(allocTmp(ValueKind.V128)))); }
def visit_I8X16_LT_S() { do_c_op2_x_x(ValueKind.V128, asm.pcmpgtb_s_s); }
def visit_I8X16_LT_U() { do_c_op2_x_x(ValueKind.V128, mmasm.emit_i8x16_gt_u(_, _, X(allocTmp(ValueKind.V128)))); }
def visit_I8X16_GE_S() { do_op2_x_x(ValueKind.V128, mmasm.emit_i8x16_ge_s); }
def visit_I8X16_GE_U() { do_op2_x_x(ValueKind.V128, mmasm.emit_i8x16_ge_u); }
def visit_I8X16_LE_S() { do_c_op2_x_x(ValueKind.V128, mmasm.emit_i8x16_ge_s); }
def visit_I8X16_LE_U() { do_c_op2_x_x(ValueKind.V128, mmasm.emit_i8x16_ge_u); }

def visit_I16X8_ADD() { do_op2_x_x(ValueKind.V128, asm.paddw_s_s); }
def visit_I16X8_SUB() { do_op2_x_x(ValueKind.V128, asm.psubw_s_s); }
Expand Down Expand Up @@ -591,6 +601,14 @@ class X86_64SinglePassCompiler extends SinglePassCompiler {
state.push(a.kindFlagsMatching(kind, IN_REG), a.reg, 0);
return true;
}
// x2 = op(x2, x1), commuted version of do_op2_x_x.
// Pops two register operands and calls {emit} with the first-popped operand
// as its first (destination) argument and the second-popped operand as its
// second (source) argument. The destination register is then pushed back as
// the result, tagged with {kind}. Always returns true.
private def do_c_op2_x_x<T>(kind: ValueKind, emit: (X86_64Xmmr, X86_64Xmmr) -> T) -> bool {
// first pop: obtained via popRegToOverwrite because its register receives the result
var b = popRegToOverwrite();
// second pop: passed to {emit} as the source operand
var a = popReg();
emit(X(b.reg), X(a.reg));
// the result lives in b's register
state.push(b.kindFlagsMatching(kind, IN_REG), b.reg, 0);
return true;
}
}

def ucontext_rip_offset = 168;
Expand Down
74 changes: 0 additions & 74 deletions test/regress/simd/simd_i8x16_cmp.bin.wast
Original file line number Diff line number Diff line change
Expand Up @@ -9706,80 +9706,6 @@
)
"type mismatch"
)
(module binary
"\00\61\73\6d\01\00\00\00\01\84\80\80\80\00\01\60"
"\00\00\03\8e\80\80\80\00\0d\00\00\00\00\00\00\00"
"\00\00\00\00\00\00\05\83\80\80\80\00\01\00\01\07"
"\b8\81\80\80\00\0d\0b\65\71\2d\69\6e\2d\62\6c\6f"
"\63\6b\00\00\0b\6e\65\2d\69\6e\2d\62\6c\6f\63\6b"
"\00\01\0d\6c\74\5f\73\2d\69\6e\2d\62\6c\6f\63\6b"
"\00\02\0d\6c\65\5f\75\2d\69\6e\2d\62\6c\6f\63\6b"
"\00\03\0d\67\74\5f\75\2d\69\6e\2d\62\6c\6f\63\6b"
"\00\04\0d\67\65\5f\73\2d\69\6e\2d\62\6c\6f\63\6b"
"\00\05\09\6e\65\73\74\65\64\2d\65\71\00\06\09\6e"
"\65\73\74\65\64\2d\6e\65\00\07\0b\6e\65\73\74\65"
"\64\2d\6c\74\5f\73\00\08\0b\6e\65\73\74\65\64\2d"
"\6c\65\5f\75\00\09\0b\6e\65\73\74\65\64\2d\67\74"
"\5f\75\00\0a\0b\6e\65\73\74\65\64\2d\67\65\5f\73"
"\00\0b\08\61\73\2d\70\61\72\61\6d\00\0c\0a\b7\85"
"\80\80\00\0d\9d\80\80\80\00\00\02\40\02\7b\02\7b"
"\41\00\fd\00\04\00\0b\02\7b\41\01\fd\00\04\00\0b"
"\fd\23\0b\1a\0b\0b\9d\80\80\80\00\00\02\40\02\7b"
"\02\7b\41\00\fd\00\04\00\0b\02\7b\41\01\fd\00\04"
"\00\0b\fd\24\0b\1a\0b\0b\9d\80\80\80\00\00\02\40"
"\02\7b\02\7b\41\00\fd\00\04\00\0b\02\7b\41\01\fd"
"\00\04\00\0b\fd\25\0b\1a\0b\0b\9d\80\80\80\00\00"
"\02\40\02\7b\02\7b\41\00\fd\00\04\00\0b\02\7b\41"
"\01\fd\00\04\00\0b\fd\2a\0b\1a\0b\0b\9d\80\80\80"
"\00\00\02\40\02\7b\02\7b\41\00\fd\00\04\00\0b\02"
"\7b\41\01\fd\00\04\00\0b\fd\28\0b\1a\0b\0b\9d\80"
"\80\80\00\00\02\40\02\7b\02\7b\41\00\fd\00\04\00"
"\0b\02\7b\41\01\fd\00\04\00\0b\fd\2b\0b\1a\0b\0b"
"\c1\80\80\80\00\00\41\00\fd\00\04\00\41\01\fd\00"
"\04\00\fd\23\41\02\fd\00\04\00\41\03\fd\00\04\00"
"\fd\23\fd\23\41\00\fd\00\04\00\41\01\fd\00\04\00"
"\fd\23\41\02\fd\00\04\00\41\03\fd\00\04\00\fd\23"
"\fd\23\fd\23\1a\0b\c1\80\80\80\00\00\41\00\fd\00"
"\04\00\41\01\fd\00\04\00\fd\24\41\02\fd\00\04\00"
"\41\03\fd\00\04\00\fd\24\fd\24\41\00\fd\00\04\00"
"\41\01\fd\00\04\00\fd\24\41\02\fd\00\04\00\41\03"
"\fd\00\04\00\fd\24\fd\24\fd\24\1a\0b\c1\80\80\80"
"\00\00\41\00\fd\00\04\00\41\01\fd\00\04\00\fd\25"
"\41\02\fd\00\04\00\41\03\fd\00\04\00\fd\25\fd\25"
"\41\00\fd\00\04\00\41\01\fd\00\04\00\fd\25\41\02"
"\fd\00\04\00\41\03\fd\00\04\00\fd\25\fd\25\fd\25"
"\1a\0b\c1\80\80\80\00\00\41\00\fd\00\04\00\41\01"
"\fd\00\04\00\fd\2a\41\02\fd\00\04\00\41\03\fd\00"
"\04\00\fd\2a\fd\2a\41\00\fd\00\04\00\41\01\fd\00"
"\04\00\fd\2a\41\02\fd\00\04\00\41\03\fd\00\04\00"
"\fd\2a\fd\2a\fd\2a\1a\0b\c1\80\80\80\00\00\41\00"
"\fd\00\04\00\41\01\fd\00\04\00\fd\28\41\02\fd\00"
"\04\00\41\03\fd\00\04\00\fd\28\fd\28\41\00\fd\00"
"\04\00\41\01\fd\00\04\00\fd\28\41\02\fd\00\04\00"
"\41\03\fd\00\04\00\fd\28\fd\28\fd\28\1a\0b\c1\80"
"\80\80\00\00\41\00\fd\00\04\00\41\01\fd\00\04\00"
"\fd\2b\41\02\fd\00\04\00\41\03\fd\00\04\00\fd\2b"
"\fd\2b\41\00\fd\00\04\00\41\01\fd\00\04\00\fd\2b"
"\41\02\fd\00\04\00\41\03\fd\00\04\00\fd\2b\fd\2b"
"\fd\2b\1a\0b\c1\80\80\80\00\00\41\00\fd\00\04\00"
"\41\01\fd\00\04\00\fd\25\41\02\fd\00\04\00\41\03"
"\fd\00\04\00\fd\2a\fd\23\41\00\fd\00\04\00\41\01"
"\fd\00\04\00\fd\27\41\02\fd\00\04\00\41\03\fd\00"
"\04\00\fd\26\fd\24\fd\2c\1a\0b"
)
(assert_return (invoke "eq-in-block"))
(assert_return (invoke "ne-in-block"))
(assert_return (invoke "lt_s-in-block"))
(assert_return (invoke "le_u-in-block"))
(assert_return (invoke "gt_u-in-block"))
(assert_return (invoke "ge_s-in-block"))
(assert_return (invoke "nested-eq"))
(assert_return (invoke "nested-ne"))
(assert_return (invoke "nested-lt_s"))
(assert_return (invoke "nested-le_u"))
(assert_return (invoke "nested-gt_u"))
(assert_return (invoke "nested-ge_s"))
(assert_return (invoke "as-param"))
(assert_invalid
(module binary
"\00\61\73\6d\01\00\00\00\01\85\80\80\80\00\01\60"
Expand Down
Loading

0 comments on commit 301e52a

Please sign in to comment.