AOSC-Tracking
diff --git a/‎src/hotspot/cpu/loongarch/loongarch_64.ad‎
Lines changed: 78 additions & 12 deletions b/‎src/hotspot/cpu/loongarch/loongarch_64.ad‎
Lines changed: 78 additions & 12 deletions
diff --git a/‎src/hotspot/cpu/loongarch/macroAssembler_loongarch.cpp‎
Lines changed: 99 additions & 66 deletions b/‎src/hotspot/cpu/loongarch/macroAssembler_loongarch.cpp‎
Lines changed: 99 additions & 66 deletions
@@ -1040,6 +1040,9 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
       if (vlen < 4 || !UseLASX)
         return false;
       break;
+    case Op_VectorUCastB2X:
+    case Op_VectorUCastS2X:
+    case Op_VectorUCastI2X:
     case Op_VectorCastB2X:
     case Op_VectorCastS2X:
     case Op_VectorCastI2X:
@@ -2434,6 +2437,11 @@ encode %{
         ciEnv::current()->record_failure("CodeCache is full");
         return;
       }
+    } else if (_method->intrinsic_id() == vmIntrinsicID::_ensureMaterializedForStackWalk) {
+      // The NOP here is purely to ensure that eliding a call to
+      // JVM_EnsureMaterializedForStackWalk doesn't change the code size.
+      __ nop();
+      __ block_comment("call JVM_EnsureMaterializedForStackWalk (elided)");
     } else {
       int method_index = resolved_method_index(cbuf);
       RelocationHolder rspec = _optimized_virtual ? opt_virtual_call_Relocation::spec(method_index)
@@ -10885,24 +10893,30 @@ instruct convHF2F_reg_reg(regF dst, mRegI src, regF tmp) %{
   ins_pipe(pipe_slow);
 %}
 
-instruct round_float_reg(mRegI dst, regF src, mRegL tmp)
+instruct round_float_reg(mRegI dst, regF src, regF vtemp1)
 %{
   match(Set dst (RoundF src));
-  effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "round_float $dst, $src\t# @round_float_reg" %}
+  effect(TEMP_DEF dst, TEMP vtemp1);  // vtemp1 is a float scratch register for the encoding below
+  format %{ "round_float    $dst, $src\t# "
+            "TEMP($vtemp1) @round_float_reg" %}
   ins_encode %{
-    __ java_round_float($dst$$Register, $src$$FloatRegister, $tmp$$Register);
+    __ java_round_float($dst$$Register,       // general-purpose result register
+                        $src$$FloatRegister,  // float input
+                        $vtemp1$$FloatRegister);  // float scratch (TEMP)
   %}
   ins_pipe( pipe_slow );
 %}
 
-instruct round_double_reg(mRegL dst, regD src, mRegL tmp)
+instruct round_double_reg(mRegL dst, regD src, regD vtemp1)
 %{
   match(Set dst (RoundD src));
-  effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "round_double $dst, $src\t# @round_double_reg" %}
+  effect(TEMP_DEF dst, TEMP vtemp1);  // vtemp1 is a double scratch register for the encoding below
+  format %{ "round_double    $dst, $src\t# "
+            "TEMP($vtemp1) @round_double_reg" %}
   ins_encode %{
-    __ java_round_double($dst$$Register, $src$$FloatRegister, $tmp$$Register);
+    __ java_round_double($dst$$Register,       // general-purpose result register
+                         $src$$FloatRegister,  // double input
+                         $vtemp1$$FloatRegister);  // double scratch (TEMP)
   %}
   ins_pipe( pipe_slow );
 %}
@@ -14443,7 +14457,8 @@ instruct round_float_lsx(vReg dst, vReg src, vReg vtemp1, vReg vtemp2) %{
   predicate(Matcher::vector_length_in_bytes(n) <= 16);
   match(Set dst (RoundVF src));
   effect(TEMP_DEF dst, TEMP vtemp1, TEMP vtemp2);
-  format %{ "round_float_lsx $dst, $src\t# @round_float_lsx" %}
+  format %{ "round_float_lsx    $dst, $src\t# "
+            "TEMP($vtemp1, $vtemp2) @round_float_lsx" %}
   ins_encode %{
     __ java_round_float_lsx($dst$$FloatRegister,
                             $src$$FloatRegister,
@@ -14457,7 +14472,8 @@ instruct round_float_lasx(vReg dst, vReg src, vReg vtemp1, vReg vtemp2) %{
   predicate(Matcher::vector_length_in_bytes(n) > 16);
   match(Set dst (RoundVF src));
   effect(TEMP_DEF dst, TEMP vtemp1, TEMP vtemp2);
-  format %{ "round_float_lasx $dst, $src\t# @round_float_lasx" %}
+  format %{ "round_float_lasx    $dst, $src\t# "
+            "TEMP($vtemp1, $vtemp2) @round_float_lasx" %}
   ins_encode %{
     __ java_round_float_lasx($dst$$FloatRegister,
                              $src$$FloatRegister,
@@ -14471,7 +14487,8 @@ instruct round_double_lsx(vReg dst, vReg src, vReg vtemp1, vReg vtemp2) %{
   predicate(Matcher::vector_length_in_bytes(n) <= 16);
   match(Set dst (RoundVD src));
   effect(TEMP_DEF dst, TEMP vtemp1, TEMP vtemp2);
-  format %{ "round_double_lsx $dst, $src\t# @round_double_lsx" %}
+  format %{ "round_double_lsx $dst, $src\t# "
+            "TEMP($vtemp1, $vtemp2) @round_double_lsx" %}
   ins_encode %{
     __ java_round_double_lsx($dst$$FloatRegister,
                              $src$$FloatRegister,
@@ -14485,7 +14502,8 @@ instruct round_double_lasx(vReg dst, vReg src, vReg vtemp1, vReg vtemp2) %{
   predicate(Matcher::vector_length_in_bytes(n) > 16);
   match(Set dst (RoundVD src));
   effect(TEMP_DEF dst, TEMP vtemp1, TEMP vtemp2);
-  format %{ "round_double_lasx $dst, $src\t# @round_double_lasx" %}
+  format %{ "round_double_lasx $dst, $src\t# "
+            "TEMP($vtemp1, $vtemp2) @round_double_lasx" %}
   ins_encode %{
     __ java_round_double_lasx($dst$$FloatRegister,
                               $src$$FloatRegister,
@@ -14520,6 +14538,54 @@ instruct roundVD(vReg dst, vReg src, immI rmode) %{
   ins_pipe( pipe_slow );
 %}
 
+// ---------------------------- Vector UCast B2X -------------------------------
+// Matches VectorUCastB2X; the node's element type picks the vext2xv_*_bu form. NOTE(review): format prints "(x)vconvert".
+instruct cvtVUB(vReg dst, vReg src) %{
+  match(Set dst (VectorUCastB2X src));
+  format %{ "(x)vconvert    $dst, $src\t# @cvtVUB" %}
+  ins_encode %{
+    switch (Matcher::vector_element_basic_type(this)) {  // presumably the destination lane type -- confirm
+      case T_SHORT : __ vext2xv_hu_bu($dst$$FloatRegister, $src$$FloatRegister); break;
+      case T_INT   : __ vext2xv_wu_bu($dst$$FloatRegister, $src$$FloatRegister); break;
+      case T_LONG  : __ vext2xv_du_bu($dst$$FloatRegister, $src$$FloatRegister); break;
+      default:
+        ShouldNotReachHere();  // only T_SHORT, T_INT and T_LONG are handled
+    }
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ---------------------------- Vector UCast S2X -------------------------------
+// Matches VectorUCastS2X; the node's element type picks the vext2xv_*_hu form. NOTE(review): format prints "(x)vconvert".
+instruct cvtVUS(vReg dst, vReg src) %{
+  match(Set dst (VectorUCastS2X src));
+  format %{ "(x)vconvert    $dst, $src\t# @cvtVUS" %}
+  ins_encode %{
+    switch (Matcher::vector_element_basic_type(this)) {  // presumably the destination lane type -- confirm
+      case T_INT   : __ vext2xv_wu_hu($dst$$FloatRegister, $src$$FloatRegister); break;
+      case T_LONG  : __ vext2xv_du_hu($dst$$FloatRegister, $src$$FloatRegister); break;
+      default:
+        ShouldNotReachHere();  // only T_INT and T_LONG are handled
+    }
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// ---------------------------- Vector UCast I2X -------------------------------
+// Matches VectorUCastI2X; only a T_LONG element type is encoded (vext2xv_du_wu). NOTE(review): format prints "(x)vconvert".
+instruct cvtVUI(vReg dst, vReg src) %{
+  match(Set dst (VectorUCastI2X src));
+  format %{ "(x)vconvert    $dst, $src\t# @cvtVUI" %}
+  ins_encode %{
+    switch (Matcher::vector_element_basic_type(this)) {  // presumably the destination lane type -- confirm
+      case T_LONG  : __ vext2xv_du_wu($dst$$FloatRegister, $src$$FloatRegister); break;
+      default:
+        ShouldNotReachHere();  // only T_LONG is handled
+    }
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ---------------------------- Vector Cast B2X -------------------------------
 
 instruct cvtVB(vReg dst, vReg src) %{
 
@@ -3717,25 +3717,31 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
 // in the IEEE-754-2008. For single-precision floatings,
 // the following algorithm can be used to effectively
 // implement rounding via standard operations.
-//
-// if src >= 0:
-//   dst = floor(src + 0.49999997f)
-// else:
-//   dst = floor(src + 0.5f)
 void MacroAssembler::java_round_float(Register dst,
                                       FloatRegister src,
-                                      Register tmp) {
+                                      FloatRegister vtemp1) {
   block_comment("java_round_float: { ");
+  // Emits the scalar float rounding sequence: result in dst; writes AT, FCC0, vtemp1 and (general path) fscratch.
+  Label L_abnormal, L_done;  // NOTE: despite its name, L_abnormal is bound on the ordinary (src != -0.5f) path
+
   li(AT, StubRoutines::la::round_float_imm());
 
-  movfr2gr_s(tmp, src);
-  bstrpick_w(tmp, tmp, 31, 31);
-  slli_w(tmp, tmp, 2);
-  fldx_s(fscratch, AT, tmp);
-  fadd_s(fscratch, fscratch, src);
+  // Special case: if src equals the first table constant (-0.5f), the result is 0.
+  fld_s(vtemp1, AT, 0);           // vtemp1 = constant at byte offset 0
+  fcmp_ceq_s(FCC0, vtemp1, src);  // FCC0 = (vtemp1 == src)
+  bceqz(FCC0, L_abnormal);        // not equal: take the general path
+  move(dst, R0);                  // dst = 0
+  b(L_done);
 
+  // General path: add the second table constant (the "magic number") to src, then floor.
+  bind(L_abnormal);
+  fld_s(vtemp1, AT, 4);           // vtemp1 = constant at byte offset 4
+  fadd_s(fscratch, vtemp1, src);  // fscratch = src + magic
   ftintrm_w_s(fscratch, fscratch);
   movfr2gr_s(dst, fscratch);
+
+  bind(L_done);
+
   block_comment("} java_round_float");
 }
 
@@ -3745,18 +3751,13 @@ void MacroAssembler::java_round_float_lsx(FloatRegister dst,
                                           FloatRegister vtemp2) {
   block_comment("java_round_float_lsx: { ");
   li(AT, StubRoutines::la::round_float_imm());
+  vldrepl_w(vtemp1, AT, 0);  // repl -0.5f
+  vldrepl_w(vtemp2, AT, 1);  // repl 0.49999997f
 
-  vldrepl_w(vtemp2, AT, 1);  // repl 0.5f
-  vslti_w(fscratch, src, 0);  // masked add
-  vand_v(vtemp2, fscratch, vtemp2);
-  vfadd_s(dst, src, vtemp2);
-
-  vldrepl_w(vtemp1, AT, 0);  // repl 0.49999997f
-  vnor_v(fscratch, fscratch, fscratch);  // rev mask
-  vand_v(vtemp1, fscratch, vtemp1);
-  vfadd_s(dst, dst, vtemp1);
-
-  vftintrm_w_s(dst, dst);
+  vfcmp_cne_s(fscratch, src, vtemp1);  // generate the mask
+  vand_v(fscratch, fscratch, src);     // clear the special
+  vfadd_s(dst, fscratch, vtemp2);      // plus the magic
+  vftintrm_w_s(dst, dst);              // floor the result
   block_comment("} java_round_float_lsx");
 }
 
@@ -3766,18 +3767,13 @@ void MacroAssembler::java_round_float_lasx(FloatRegister dst,
                                            FloatRegister vtemp2) {
   block_comment("java_round_float_lasx: { ");
   li(AT, StubRoutines::la::round_float_imm());
+  xvldrepl_w(vtemp1, AT, 0);  // repl -0.5f
+  xvldrepl_w(vtemp2, AT, 1);  // repl 0.49999997f
 
-  xvldrepl_w(vtemp2, AT, 1);  // repl 0.5f
-  xvslti_w(fscratch, src, 0);  // masked add
-  xvand_v(vtemp2, fscratch, vtemp2);
-  xvfadd_s(dst, src, vtemp2);
-
-  xvldrepl_w(vtemp1, AT, 0);  // repl 0.49999997f
-  xvnor_v(fscratch, fscratch, fscratch);  // rev mask
-  xvand_v(vtemp1, fscratch, vtemp1);
-  xvfadd_s(dst, dst, vtemp1);
-
-  xvftintrm_w_s(dst, dst);
+  xvfcmp_cne_s(fscratch, src, vtemp1);  // generate the mask
+  xvand_v(fscratch, fscratch, src);     // clear the special
+  xvfadd_s(dst, fscratch, vtemp2);      // plus the magic
+  xvftintrm_w_s(dst, dst);              // floor the result
   block_comment("} java_round_float_lasx");
 }
 
@@ -3786,25 +3782,31 @@ void MacroAssembler::java_round_float_lasx(FloatRegister dst,
 // in the IEEE-754-2008. For double-precision floatings,
 // the following algorithm can be used to effectively
 // implement rounding via standard operations.
-//
-// if src >= 0:
-//   dst = floor(src + 0.49999999999999994d)
-// else:
-//   dst = floor(src + 0.5d)
 void MacroAssembler::java_round_double(Register dst,
                                        FloatRegister src,
-                                       Register tmp) {
+                                       FloatRegister vtemp1) {
   block_comment("java_round_double: { ");
+  // Emits the scalar double rounding sequence: result in dst; writes AT, FCC0, vtemp1 and (general path) fscratch.
+  Label L_abnormal, L_done;  // NOTE: despite its name, L_abnormal is bound on the ordinary (src != -0.5d) path
+
   li(AT, StubRoutines::la::round_double_imm());
 
-  movfr2gr_d(tmp, src);
-  bstrpick_d(tmp, tmp, 63, 63);
-  slli_d(tmp, tmp, 3);
-  fldx_d(fscratch, AT, tmp);
-  fadd_d(fscratch, fscratch, src);
+  // Special case: if src equals the first table constant (-0.5d), the result is 0.
+  fld_d(vtemp1, AT, 0);           // vtemp1 = constant at byte offset 0
+  fcmp_ceq_d(FCC0, vtemp1, src);  // FCC0 = (vtemp1 == src)
+  bceqz(FCC0, L_abnormal);        // not equal: take the general path
+  move(dst, R0);                  // dst = 0
+  b(L_done);
 
+  // General path: add the second table constant (the "magic number") to src, then floor.
+  bind(L_abnormal);
+  fld_d(vtemp1, AT, 8);           // vtemp1 = constant at byte offset 8
+  fadd_d(fscratch, vtemp1, src);  // fscratch = src + magic
   ftintrm_l_d(fscratch, fscratch);
   movfr2gr_d(dst, fscratch);
+
+  bind(L_done);
+
   block_comment("} java_round_double");
 }
 
@@ -3814,18 +3816,13 @@ void MacroAssembler::java_round_double_lsx(FloatRegister dst,
                                            FloatRegister vtemp2) {
   block_comment("java_round_double_lsx: { ");
   li(AT, StubRoutines::la::round_double_imm());
+  vldrepl_d(vtemp1, AT, 0);  // repl -0.5d
+  vldrepl_d(vtemp2, AT, 1);  // repl 0.49999999999999994d
 
-  vldrepl_d(vtemp2, AT, 1);  // repl 0.5d
-  vslti_d(fscratch, src, 0);  // masked add
-  vand_v(vtemp2, fscratch, vtemp2);
-  vfadd_d(dst, src, vtemp2);
-
-  vldrepl_d(vtemp1, AT, 0);  // repl 0.49999999999999994d
-  vnor_v(fscratch, fscratch, fscratch);  // rev mask
-  vand_v(vtemp1, fscratch, vtemp1);
-  vfadd_d(dst, dst, vtemp1);
-
-  vftintrm_l_d(dst, dst);
+  vfcmp_cne_d(fscratch, src, vtemp1);  // generate the mask
+  vand_v(fscratch, fscratch, src);     // clear the special
+  vfadd_d(dst, fscratch, vtemp2);      // plus the magic
+  vftintrm_l_d(dst, dst);              // floor the result
   block_comment("} java_round_double_lsx");
 }
 
@@ -3835,18 +3832,13 @@ void MacroAssembler::java_round_double_lasx(FloatRegister dst,
                                             FloatRegister vtemp2) {
   block_comment("java_round_double_lasx: { ");
   li(AT, StubRoutines::la::round_double_imm());
+  xvldrepl_d(vtemp1, AT, 0);  // repl -0.5d
+  xvldrepl_d(vtemp2, AT, 1);  // repl 0.49999999999999994d
 
-  xvldrepl_d(vtemp2, AT, 1);  // repl 0.5d
-  xvslti_d(fscratch, src, 0);  // masked add
-  xvand_v(vtemp2, fscratch, vtemp2);
-  xvfadd_d(dst, src, vtemp2);
-
-  xvldrepl_d(vtemp1, AT, 0);  // repl 0.49999999999999994d
-  xvnor_v(fscratch, fscratch, fscratch);  // rev mask
-  xvand_v(vtemp1, fscratch, vtemp1);
-  xvfadd_d(dst, dst, vtemp1);
-
-  xvftintrm_l_d(dst, dst);
+  xvfcmp_cne_d(fscratch, src, vtemp1);  // generate the mask
+  xvand_v(fscratch, fscratch, src);     // clear the special
+  xvfadd_d(dst, fscratch, vtemp2);      // plus the magic
+  xvftintrm_l_d(dst, dst);              // floor the result
   block_comment("} java_round_double_lasx");
 }
 
@@ -3922,6 +3914,47 @@ void MacroAssembler::mul_add(Register out, Register in, Register offset,
   bind(L_end);
 }
 
+// Add two unsigned inputs and output the carry: dst = src1 + src2, carry = (dst <u src2).
+void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
+{
+  assert_different_registers(dst, carry);  // otherwise writing carry would overwrite the sum
+  assert_different_registers(dst, src2);   // src2 is read again after dst has been written
+  add_d(dst, src1, src2);
+  sltu(carry, dst, src2);                  // the sum wrapped iff it is below one of its addends
+}
+
+// Add two inputs plus an incoming carry: dst = src1 + src2 + carry. No carry-out is produced.
+void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
+  assert_different_registers(dst, carry);  // carry is read after dst has first been written
+  add_d(dst, src1, src2);
+  add_d(dst, dst, carry);
+}
+
+// Add two unsigned inputs plus an incoming carry, and output a carry: dst = src1 + src2 + carry, carry = (dst <u src2).
+void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
+  assert_different_registers(dst, src2);  // src2 is read again after dst has been written
+  adc(dst, src1, src2, carry);            // adc additionally asserts dst != carry
+  sltu(carry, dst, src2);  // NOTE(review): yields 0 if src1 + carry itself wraps (src1 all ones, carry 1) -- confirm callers avoid this
+}
+
+// Multiply and multiply-accumulate unsigned 64-bit registers. wide_mul: prod_lo gets the mul_d result, prod_hi the mulh_du result of n and m.
+void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
+  assert_different_registers(prod_lo, prod_hi);
+  // NOTE(review): prod_lo aliasing n or m is not asserted, yet mulh_du below re-reads n and m after prod_lo is written.
+  mul_d(prod_lo, n, m);
+  mulh_du(prod_hi, n, m);
+}
+
+void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
+                Register m, Register tmp1, Register tmp2) {
+  assert_different_registers(sum_lo, sum_hi);
+  assert_different_registers(sum_hi, tmp2);
+  // (sum_hi:sum_lo) += n * m; tmp1 and tmp2 are clobbered, and no carry out of sum_hi is produced.
+  wide_mul(tmp1, tmp2, n, m);       // tmp1 = low half, tmp2 = high half of n * m
+  cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
+  adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
+}
+
 #ifndef PRODUCT
 void MacroAssembler::verify_cross_modify_fence_not_required() {
   if (VerifyCrossModifyFence) {