@@ -3717,25 +3717,31 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
37173717// in the IEEE-754-2008. For single-precision floatings,
37183718// the following algorithm can be used to effectively
37193719// implement rounding via standard operations.
3720- //
3721- // if src >= 0:
3722- // dst = floor(src + 0.49999997f)
3723- // else:
3724- // dst = floor(src + 0.5f)
37253720void MacroAssembler::java_round_float (Register dst,
37263721 FloatRegister src,
3727- Register tmp ) {
3722+ FloatRegister vtemp1 ) {
37283723 block_comment (" java_round_float: { " );
3724+ // Scalar path: dst (GPR) receives the rounded value of the float in src; vtemp1 is overwritten as an FP scratch.
3725+ Label L_abnormal, L_done; // NOTE(review): L_abnormal actually marks the general path taken whenever src != -0.5f
3726+
37293727 li (AT, StubRoutines::la::round_float_imm ());
37303728
3731- movfr2gr_s (tmp, src);
3732- bstrpick_w (tmp, tmp, 31 , 31 );
3733- slli_w (tmp, tmp, 2 );
3734- fldx_s (fscratch, AT, tmp);
3735- fadd_s (fscratch, fscratch, src);
3729+ // Special case: when src == -0.5f the result is 0, produced directly without the add-and-floor sequence.
3730+ fld_s (vtemp1, AT, 0 ); // vtemp1 = constant at offset 0 (the -0.5f being tested for)
3731+ fcmp_ceq_s (FCC0, vtemp1, src); // FCC0 = (vtemp1 == src)
3732+ bceqz (FCC0, L_abnormal); // src != -0.5f: use the general path
3733+ move (dst, R0); // dst = 0
3734+ b (L_done);
37363735
3736+ // General case: dst = floor(src + magic), where magic is the constant at offset 4 (presumably 0.49999997f -- confirm against round_float_imm).
3737+ bind (L_abnormal);
3738+ fld_s (vtemp1, AT, 4 ); // vtemp1 = magic addend
3739+ fadd_s (fscratch, vtemp1, src); // fscratch = src + magic
37373740 ftintrm_w_s (fscratch, fscratch);
37383741 movfr2gr_s (dst, fscratch);
3742+
3743+ bind (L_done);
3744+
37393745 block_comment (" } java_round_float" );
37403746}
37413747
@@ -3745,18 +3751,13 @@ void MacroAssembler::java_round_float_lsx(FloatRegister dst,
37453751 FloatRegister vtemp2) {
37463752 block_comment (" java_round_float_lsx: { " );
37473753 li (AT, StubRoutines::la::round_float_imm ());
3754+ vldrepl_w (vtemp1, AT, 0 ); // vtemp1 = -0.5f replicated to every element (the special-case input)
3755+ vldrepl_w (vtemp2, AT, 1 ); // vtemp2 = 0.49999997f replicated to every element (the magic addend)
37483756
3749- vldrepl_w (vtemp2, AT, 1 ); // repl 0.5f
3750- vslti_w (fscratch, src, 0 ); // masked add
3751- vand_v (vtemp2, fscratch, vtemp2);
3752- vfadd_s (dst, src, vtemp2);
3753-
3754- vldrepl_w (vtemp1, AT, 0 ); // repl 0.49999997f
3755- vnor_v (fscratch, fscratch, fscratch); // rev mask
3756- vand_v (vtemp1, fscratch, vtemp1);
3757- vfadd_s (dst, dst, vtemp1);
3758-
3759- vftintrm_w_s (dst, dst);
3757+ vfcmp_cne_s (fscratch, src, vtemp1); // fscratch = per-element mask, set where src != -0.5f
3758+ vand_v (fscratch, fscratch, src); // fscratch = src with the -0.5f elements cleared to all-zero bits
3759+ vfadd_s (dst, fscratch, vtemp2); // dst = masked src + magic (cleared elements become just the magic)
3760+ vftintrm_w_s (dst, dst); // dst = floor(dst), element by element
37603761 block_comment (" } java_round_float_lsx" );
37613762}
37623763
@@ -3766,18 +3767,13 @@ void MacroAssembler::java_round_float_lasx(FloatRegister dst,
37663767 FloatRegister vtemp2) {
37673768 block_comment (" java_round_float_lasx: { " );
37683769 li (AT, StubRoutines::la::round_float_imm ());
3770+ xvldrepl_w (vtemp1, AT, 0 ); // vtemp1 = -0.5f replicated to every element (the special-case input)
3771+ xvldrepl_w (vtemp2, AT, 1 ); // vtemp2 = 0.49999997f replicated to every element (the magic addend)
37693772
3770- xvldrepl_w (vtemp2, AT, 1 ); // repl 0.5f
3771- xvslti_w (fscratch, src, 0 ); // masked add
3772- xvand_v (vtemp2, fscratch, vtemp2);
3773- xvfadd_s (dst, src, vtemp2);
3774-
3775- xvldrepl_w (vtemp1, AT, 0 ); // repl 0.49999997f
3776- xvnor_v (fscratch, fscratch, fscratch); // rev mask
3777- xvand_v (vtemp1, fscratch, vtemp1);
3778- xvfadd_s (dst, dst, vtemp1);
3779-
3780- xvftintrm_w_s (dst, dst);
3773+ xvfcmp_cne_s (fscratch, src, vtemp1); // fscratch = per-element mask, set where src != -0.5f
3774+ xvand_v (fscratch, fscratch, src); // fscratch = src with the -0.5f elements cleared to all-zero bits
3775+ xvfadd_s (dst, fscratch, vtemp2); // dst = masked src + magic (cleared elements become just the magic)
3776+ xvftintrm_w_s (dst, dst); // dst = floor(dst), element by element
37813777 block_comment (" } java_round_float_lasx" );
37823778}
37833779
@@ -3786,25 +3782,31 @@ void MacroAssembler::java_round_float_lasx(FloatRegister dst,
37863782// in the IEEE-754-2008. For double-precision floatings,
37873783// the following algorithm can be used to effectively
37883784// implement rounding via standard operations.
3789- //
3790- // if src >= 0:
3791- // dst = floor(src + 0.49999999999999994d)
3792- // else:
3793- // dst = floor(src + 0.5d)
37943785void MacroAssembler::java_round_double (Register dst,
37953786 FloatRegister src,
3796- Register tmp ) {
3787+ FloatRegister vtemp1 ) {
37973788 block_comment (" java_round_double: { " );
3789+ // Scalar path: dst (GPR) receives the rounded value of the double in src; vtemp1 is overwritten as an FP scratch.
3790+ Label L_abnormal, L_done; // NOTE(review): L_abnormal actually marks the general path taken whenever src != -0.5d
3791+
37983792 li (AT, StubRoutines::la::round_double_imm ());
37993793
3800- movfr2gr_d (tmp, src);
3801- bstrpick_d (tmp, tmp, 63 , 63 );
3802- slli_d (tmp, tmp, 3 );
3803- fldx_d (fscratch, AT, tmp);
3804- fadd_d (fscratch, fscratch, src);
3794+ // Special case: when src == -0.5d the result is 0, produced directly without the add-and-floor sequence.
3795+ fld_d (vtemp1, AT, 0 ); // vtemp1 = constant at offset 0 (the -0.5d being tested for)
3796+ fcmp_ceq_d (FCC0, vtemp1, src); // FCC0 = (vtemp1 == src)
3797+ bceqz (FCC0, L_abnormal); // src != -0.5d: use the general path
3798+ move (dst, R0); // dst = 0
3799+ b (L_done);
38053800
3801+ // General case: dst = floor(src + magic), where magic is the constant at offset 8 (presumably 0.49999999999999994d -- confirm against round_double_imm).
3802+ bind (L_abnormal);
3803+ fld_d (vtemp1, AT, 8 ); // vtemp1 = magic addend
3804+ fadd_d (fscratch, vtemp1, src); // fscratch = src + magic
38063805 ftintrm_l_d (fscratch, fscratch);
38073806 movfr2gr_d (dst, fscratch);
3807+
3808+ bind (L_done);
3809+
38083810 block_comment (" } java_round_double" );
38093811}
38103812
@@ -3814,18 +3816,13 @@ void MacroAssembler::java_round_double_lsx(FloatRegister dst,
38143816 FloatRegister vtemp2) {
38153817 block_comment (" java_round_double_lsx: { " );
38163818 li (AT, StubRoutines::la::round_double_imm ());
3819+ vldrepl_d (vtemp1, AT, 0 ); // vtemp1 = -0.5d replicated to every element (the special-case input)
3820+ vldrepl_d (vtemp2, AT, 1 ); // vtemp2 = 0.49999999999999994d replicated to every element (the magic addend)
38173821
3818- vldrepl_d (vtemp2, AT, 1 ); // repl 0.5d
3819- vslti_d (fscratch, src, 0 ); // masked add
3820- vand_v (vtemp2, fscratch, vtemp2);
3821- vfadd_d (dst, src, vtemp2);
3822-
3823- vldrepl_d (vtemp1, AT, 0 ); // repl 0.49999999999999994d
3824- vnor_v (fscratch, fscratch, fscratch); // rev mask
3825- vand_v (vtemp1, fscratch, vtemp1);
3826- vfadd_d (dst, dst, vtemp1);
3827-
3828- vftintrm_l_d (dst, dst);
3822+ vfcmp_cne_d (fscratch, src, vtemp1); // fscratch = per-element mask, set where src != -0.5d
3823+ vand_v (fscratch, fscratch, src); // fscratch = src with the -0.5d elements cleared to all-zero bits
3824+ vfadd_d (dst, fscratch, vtemp2); // dst = masked src + magic (cleared elements become just the magic)
3825+ vftintrm_l_d (dst, dst); // dst = floor(dst), element by element
38293826 block_comment (" } java_round_double_lsx" );
38303827}
38313828
@@ -3835,18 +3832,13 @@ void MacroAssembler::java_round_double_lasx(FloatRegister dst,
38353832 FloatRegister vtemp2) {
38363833 block_comment (" java_round_double_lasx: { " );
38373834 li (AT, StubRoutines::la::round_double_imm ());
3835+ xvldrepl_d (vtemp1, AT, 0 ); // vtemp1 = -0.5d replicated to every element (the special-case input)
3836+ xvldrepl_d (vtemp2, AT, 1 ); // vtemp2 = 0.49999999999999994d replicated to every element (the magic addend)
38383837
3839- xvldrepl_d (vtemp2, AT, 1 ); // repl 0.5d
3840- xvslti_d (fscratch, src, 0 ); // masked add
3841- xvand_v (vtemp2, fscratch, vtemp2);
3842- xvfadd_d (dst, src, vtemp2);
3843-
3844- xvldrepl_d (vtemp1, AT, 0 ); // repl 0.49999999999999994d
3845- xvnor_v (fscratch, fscratch, fscratch); // rev mask
3846- xvand_v (vtemp1, fscratch, vtemp1);
3847- xvfadd_d (dst, dst, vtemp1);
3848-
3849- xvftintrm_l_d (dst, dst);
3838+ xvfcmp_cne_d (fscratch, src, vtemp1); // fscratch = per-element mask, set where src != -0.5d
3839+ xvand_v (fscratch, fscratch, src); // fscratch = src with the -0.5d elements cleared to all-zero bits
3840+ xvfadd_d (dst, fscratch, vtemp2); // dst = masked src + magic (cleared elements become just the magic)
3841+ xvftintrm_l_d (dst, dst); // dst = floor(dst), element by element
38503842 block_comment (" } java_round_double_lasx" );
38513843}
38523844
@@ -3922,6 +3914,47 @@ void MacroAssembler::mul_add(Register out, Register in, Register offset,
39223914 bind (L_end);
39233915}
39243916
3917+ // Add two unsigned inputs: dst = src1 + src2, and write the carry-out of that addition to carry.
3918+ void MacroAssembler::cad (Register dst, Register src1, Register src2, Register carry)
3919+ {
3920+ assert_different_registers (dst, carry); // writing carry must not overwrite the sum held in dst
3921+ assert_different_registers (dst, src2); // src2 is read again by sltu after dst has been written
3922+ add_d (dst, src1, src2); // dst = src1 + src2
3923+ sltu (carry, dst, src2); // carry = (dst < src2), i.e. the sum came out smaller than an addend
3924+ }
3925+
3926+ // Add two inputs plus an incoming carry: dst = src1 + src2 + carry. No carry-out is produced.
3927+ void MacroAssembler::adc (Register dst, Register src1, Register src2, Register carry) {
3928+ assert_different_registers (dst, carry); // carry is read after dst has already been written
3929+ add_d (dst, src1, src2); // dst = src1 + src2
3930+ add_d (dst, dst, carry); // dst += carry
3931+ }
3932+
3933+ // Add two unsigned inputs plus an incoming carry: dst = src1 + src2 + carry; carry is then overwritten with the carry-out.
3934+ void MacroAssembler::cadc (Register dst, Register src1, Register src2, Register carry) {
3935+ assert_different_registers (dst, src2); // src2 is read again by sltu after dst has been written
3936+ adc (dst, src1, src2, carry); // dst = src1 + src2 + carry (adc itself asserts dst != carry)
3937+ sltu (carry, dst, src2); // carry = (dst < src2). NOTE(review): assuming 64-bit wrapping adds, this yields 0 when src1 is all ones and the incoming carry is 1 (dst == src2) although a carry occurred -- confirm callers cannot hit that case.
3938+ }
3939+
3940+ // Multiply and multiply-accumulate unsigned 64-bit registers. wide_mul leaves the product of n and m in the register pair prod_hi:prod_lo.
3941+ void MacroAssembler::wide_mul (Register prod_lo, Register prod_hi, Register n, Register m) {
3942+ assert_different_registers (prod_lo, prod_hi);
3943+ // NOTE(review): prod_lo is written before mulh_du re-reads n and m, yet prod_lo aliasing n or m is not asserted against -- confirm callers never alias them.
3944+ mul_d (prod_lo, n, m); // low half of the product
3945+ mulh_du (prod_hi, n, m); // high half of the product
3946+ }
3947+
3948+ void MacroAssembler::wide_madd (Register sum_lo, Register sum_hi, Register n,
3949+ Register m, Register tmp1, Register tmp2) {
3950+ assert_different_registers (sum_lo, sum_hi);
3951+ assert_different_registers (sum_hi, tmp2);
3952+ // Accumulate: sum_hi:sum_lo += n * m. tmp1 and tmp2 are both overwritten; any carry out of sum_hi is not reported.
3953+ wide_mul (tmp1, tmp2, n, m); // tmp1 = low half, tmp2 = high half of n * m
3954+ cad (sum_lo, sum_lo, tmp1, tmp1); // sum_lo += tmp1; tmp1 is reused to receive the carry-out
3955+ adc (sum_hi, sum_hi, tmp2, tmp1); // sum_hi += tmp2 + carry (held in tmp1)
3956+ }
3957+
39253958#ifndef PRODUCT
39263959void MacroAssembler::verify_cross_modify_fence_not_required () {
39273960 if (VerifyCrossModifyFence) {
0 commit comments