Commit 1601fba

Author: Jatin Bhateja (committed)

8252848: Optimize small primitive arrayCopy operations through partial inlining using AVX-512 masked instructions.

1 parent 03a4df0, commit 1601fba
27 files changed: +560, -16 lines
src/hotspot/cpu/x86/assembler_x86.cpp

Lines changed: 39 additions & 0 deletions
@@ -2589,6 +2589,38 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vect
   emit_operand(dst, src);
 }
 
+void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  InstructionMark im(this);
+  bool wide = type == T_SHORT || type == T_LONG || type == T_CHAR;
+  bool bwinstr = type == T_BYTE || type == T_SHORT || type == T_CHAR;
+  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  int prefix = bwinstr ? VEX_SIMD_F2 : VEX_SIMD_F3;
+  vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
+void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  bool wide = type == T_SHORT || type == T_LONG || type == T_CHAR;
+  bool bwinstr = type == T_BYTE || type == T_SHORT || type == T_CHAR;
+  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.reset_is_clear_context();
+  attributes.set_embedded_opmask_register_specifier(mask);
+  attributes.set_is_evex_instruction();
+  int prefix = bwinstr ? VEX_SIMD_F2 : VEX_SIMD_F3;
+  vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x7F);
+  emit_operand(src, dst);
+}
+
 void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);

@@ -7803,6 +7835,13 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) {
   emit_int16((unsigned char)0xF7, (0xC0 | encode));
 }
 
+void Assembler::shrxq(Register dst, Register src1, Register src2) {
+  assert(VM_Version::supports_bmi2(), "");
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
+  emit_int16((unsigned char)0xF7, (0xC0 | encode));
+}
+
 #ifndef _LP64
 
 void Assembler::incl(Register dst) {
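
Note: the wide/bwinstr flags above fully determine which concrete EVEX move is emitted; the mapping below is implied by the prefix and W-bit choices in the code (the 0x6F/0x7F opcode is shared by all four forms):

    // bwinstr selects the F2 prefix, wide sets EVEX.W1:
    //   T_BYTE                  -> F2.W0 => vmovdqu8
    //   T_SHORT, T_CHAR         -> F2.W1 => vmovdqu16
    //   T_LONG                  -> F3.W1 => vmovdqu64
    //   all other element types -> F3.W0 => vmovdqu32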

src/hotspot/cpu/x86/assembler_x86.hpp

Lines changed: 7 additions & 1 deletion
@@ -826,7 +826,6 @@ class Assembler : public AbstractAssembler {
 
   void decl(Register dst);
   void decl(Address dst);
-  void decq(Register dst);
   void decq(Address dst);
 
   void incl(Register dst);

@@ -911,6 +910,7 @@
   void popa_uncached();
 #endif
   void vzeroupper_uncached();
+  void decq(Register dst);
 
   void pusha();
   void popa();

@@ -1519,6 +1519,10 @@
   void evmovdquq(XMMRegister dst, Address src, int vector_len);
   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
 
+  // Generic move instructions.
+  void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type);
+  void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type);
+
   // Move lower 64bit to high 64bit in 128bit register
   void movlhps(XMMRegister dst, XMMRegister src);

@@ -2021,6 +2025,8 @@
 
   void shlxl(Register dst, Register src1, Register src2);
   void shlxq(Register dst, Register src1, Register src2);
+  void shrxq(Register dst, Register src1, Register src2);
+
 
   //====================VECTOR ARITHMETIC=====================================
 
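Note: a hypothetical call sequence pairing these declarations (register choices and offsets are illustrative; the pattern mirrors the x86.ad encodings added later in this commit):

    __ kmovql(k2, rbx);                                                     // 64-bit mask -> opmask k2
    __ evmovdqu(xmm0, k2, Address(rsi, 0), Assembler::AVX_512bit, T_BYTE);  // masked load; disabled lanes zeroed
    __ evmovdqu(Address(rdi, 0), k2, xmm0, Assembler::AVX_512bit, T_BYTE);  // masked store; disabled lanes untouched (reset_is_clear_context)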
src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

Lines changed: 14 additions & 0 deletions
@@ -1218,6 +1218,20 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
   reduce_operation_256(opcode, vtmp2, vtmp2, src2);
   reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
 }
+
+void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
+  if (ArrayCopyPartialInlineSize <= 32) {
+    mov64(dst, 1);
+    shlxq(dst, dst, len);
+    decq(dst);
+  } else {
+    mov64(dst, -1);
+    movq(temp, len);
+    negptr(temp);
+    addptr(temp, 64);
+    shrxq(dst, dst, temp);
+  }
+}
 #endif // _LP64
 
 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
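
Note: a minimal C++ sketch (not HotSpot code) of the arithmetic genmask() emits. For vectors of at most 32 lanes, (1 << len) - 1 sets the low len bits. With 64-byte vectors len can be exactly 64, where the left shift would wrap, so the code computes -1 >> (64 - len) instead; hardware shrx masks its count to 6 bits, which the sketch reproduces with "& 63":

    #include <cstdint>

    // lanes = copy length in elements; wide = ArrayCopyPartialInlineSize > 32
    uint64_t genmask(uint64_t lanes, bool wide) {
      if (!wide) {
        return (uint64_t(1) << lanes) - 1;           // shlxq + decq path; lanes <= 32 here
      } else {
        return ~uint64_t(0) >> ((64 - lanes) & 63);  // shrxq path; lanes == 64 yields all ones
      }
    }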

src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@
   void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
 #ifdef _LP64
   void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
+  void genmask(Register dst, Register len, Register temp);
 #endif // _LP64
 
   // dst = reduce(op, src2) using vtmp as temps

src/hotspot/cpu/x86/vm_version_x86.cpp

Lines changed: 26 additions & 1 deletion
@@ -761,6 +761,8 @@ void VM_Version::get_processor_features() {
   if (is_intel()) { // Intel cpus specific settings
     if (is_knights_family()) {
       _features &= ~CPU_VZEROUPPER;
+      _features &= ~CPU_AVX512BW;
+      _features &= ~CPU_AVX512VL;
     }
   }
 
@@ -1162,7 +1164,7 @@
 #endif // COMPILER2 && ASSERT
 
   if (!FLAG_IS_DEFAULT(AVX3Threshold)) {
-    if (!is_power_of_2(AVX3Threshold)) {
+    if (AVX3Threshold != 0 && !is_power_of_2(AVX3Threshold)) {
       warning("AVX3Threshold must be a power of 2");
       FLAG_SET_DEFAULT(AVX3Threshold, 4096);
     }

@@ -1411,6 +1413,29 @@
     MaxLoopPad = 11;
   }
 #endif // COMPILER2
+
+  if (FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize) ||
+      (!FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize) &&
+       ArrayCopyPartialInlineSize != 0 &&
+       ArrayCopyPartialInlineSize != 32 &&
+       ArrayCopyPartialInlineSize != 64)) {
+    int pi_size = 0;
+    if (MaxVectorSize > 32 && AVX3Threshold == 0) {
+      pi_size = 64;
+    } else if (MaxVectorSize >= 32) {
+      pi_size = 32;
+    }
+    if (!FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize)) {
+      warning("Setting ArrayCopyPartialInlineSize as %d", pi_size);
+    }
+    ArrayCopyPartialInlineSize = pi_size;
+  }
+
+  if (ArrayCopyPartialInlineSize > MaxVectorSize) {
+    ArrayCopyPartialInlineSize = MaxVectorSize;
+    warning("Setting ArrayCopyPartialInlineSize as MaxVectorSize");
+  }
+
   if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
     UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
   }
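
Note: how the flag settles on a few sample configurations (values follow directly from the logic above):

    // MaxVectorSize = 64, AVX3Threshold = 0    -> ArrayCopyPartialInlineSize = 64
    // MaxVectorSize = 64, AVX3Threshold = 4096 -> ArrayCopyPartialInlineSize = 32
    // MaxVectorSize = 32                       -> ArrayCopyPartialInlineSize = 32
    // MaxVectorSize = 16                       -> ArrayCopyPartialInlineSize = 0  (partial inlining off)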

src/hotspot/cpu/x86/x86.ad

Lines changed: 66 additions & 0 deletions
@@ -1405,6 +1405,13 @@ const bool Matcher::match_rule_supported(int opcode) {
         return false;
       }
       break;
+    case Op_VectorMaskGen:
+    case Op_VectorMaskedLoad:
+    case Op_VectorMaskedStore:
+      if (UseAVX < 3) {
+        return false;
+      }
+      break;
 #ifndef _LP64
     case Op_AddReductionVF:
     case Op_AddReductionVD:

@@ -1477,6 +1484,16 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
         return false;
       }
       break;
+    case Op_VectorMaskGen:
+    case Op_VectorMaskedLoad:
+    case Op_VectorMaskedStore:
+      if (!VM_Version::supports_avx512bw()) {
+        return false;
+      }
+      if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
+        return false;
+      }
+      break;
     case Op_CMoveVD:
       if (vlen != 4) {
        return false; // implementation limitation (only vcmov4D_reg is present)

@@ -5444,3 +5461,52 @@ instruct vprorate(vec dst, vec src, vec shift) %{
   ins_pipe( pipe_slow );
 %}
 
+#ifdef _LP64
+// ---------------------------------- Masked Block Copy ------------------------------------
+
+instruct vmasked_load64(vec dst, memory mem, rRegL mask) %{
+  match(Set dst (VectorMaskedLoad mem mask));
+  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
+  ins_encode %{
+    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
+    int vector_len = vector_length_encoding(this);
+    // TODO: KRegister to be made a valid "bound" operand to promote sharing.
+    __ kmovql(k2, $mask$$Register);
+    __ evmovdqu($dst$$XMMRegister, k2, $mem$$Address, vector_len, elmType);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmask_gen(rRegL dst, rRegL len, rRegL tempLen) %{
+  match(Set dst (VectorMaskGen len));
+  effect(TEMP_DEF dst, TEMP tempLen);
+  format %{ "vector_mask_gen $len \t! vector mask generator" %}
+  ins_encode %{
+    __ genmask($dst$$Register, $len$$Register, $tempLen$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmask_gen_imm(rRegL dst, immL len) %{
+  match(Set dst (VectorMaskGen len));
+  format %{ "vector_mask_gen $len \t! vector mask generator" %}
+  ins_encode %{
+    __ mov64($dst$$Register, (1L << ($len$$constant & 63)) - 1);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmasked_store64(memory mem, vec src, rRegL mask) %{
+  match(Set mem (VectorMaskedStore mem (Binary src mask)));
+  format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
+  ins_encode %{
+    const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
+    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
+    int vector_len = vector_length_encoding(src_node);
+    // TODO: KRegister to be made a valid "bound" operand to promote sharing.
+    __ kmovql(k2, $mask$$Register);
+    __ evmovdqu($mem$$Address, k2, $src$$XMMRegister, vector_len, elmType);
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
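
Note: taken together, these patterns let C2 replace a call into the arraycopy stub with a short branch-free sequence. A C++ analogy using AVX-512 intrinsics (requires AVX512BW; this sketches the effect and is not the emitted code) for a byte copy with len in [1, 64]:

    #include <immintrin.h>
    #include <cstdint>

    void masked_copy(uint8_t* dst, const uint8_t* src, unsigned len) {
      __mmask64 m = ~0ULL >> ((64 - len) & 63);      // VectorMaskGen
      __m512i v = _mm512_maskz_loadu_epi8(m, src);   // VectorMaskedLoad  (kmovq + vmovdqu8 {k}{z})
      _mm512_mask_storeu_epi8(dst, m, v);            // VectorMaskedStore (vmovdqu8 {k})
    }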

src/hotspot/share/adlc/forms.cpp

Lines changed: 2 additions & 0 deletions
@@ -268,6 +268,7 @@ Form::DataType Form::is_load_from_memory(const char *opType) const {
   if( strcmp(opType,"LoadRange")==0 ) return Form::idealI;
   if( strcmp(opType,"LoadS")==0 ) return Form::idealS;
   if( strcmp(opType,"LoadVector")==0 ) return Form::idealV;
+  if( strcmp(opType,"VectorMaskedLoad")==0 ) return Form::idealV;
   assert( strcmp(opType,"Load") != 0, "Must type Loads" );
   return Form::none;
 }

@@ -284,6 +285,7 @@ Form::DataType Form::is_store_to_memory(const char *opType) const {
   if( strcmp(opType,"StoreN")==0) return Form::idealN;
   if( strcmp(opType,"StoreNKlass")==0) return Form::idealNKlass;
   if( strcmp(opType,"StoreVector")==0 ) return Form::idealV;
+  if( strcmp(opType,"VectorMaskedStore")==0 ) return Form::idealV;
   assert( strcmp(opType,"Store") != 0, "Must type Stores" );
   return Form::none;
 }

src/hotspot/share/adlc/formssel.cpp

Lines changed: 4 additions & 3 deletions
@@ -779,6 +779,7 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const {
                !strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeP") ||
                !strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeN") ||
 #endif
+               !strcmp(_matrule->_rChild->_opType,"VectorMaskGen") ||
                !strcmp(_matrule->_rChild->_opType,"CompareAndExchangeP") ||
                !strcmp(_matrule->_rChild->_opType,"CompareAndExchangeN"))) return true;
   else if ( is_ideal_load() == Form::idealP ) return true;

@@ -3484,7 +3485,7 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
     "StoreB","StoreC","Store" ,"StoreFP",
     "LoadI", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF" ,
     "LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
-    "StoreVector", "LoadVector",
+    "StoreVector", "LoadVector", "VectorMaskedLoad", "VectorMaskedStore",
     "LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
     "LoadPLocked",
     "StorePConditional", "StoreIConditional", "StoreLConditional",

@@ -4168,8 +4169,8 @@ bool MatchRule::is_vector() const {
     "RShiftVB","RShiftVS","RShiftVI","RShiftVL",
     "URShiftVB","URShiftVS","URShiftVI","URShiftVL",
     "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
-    "RoundDoubleModeV","RotateLeftV" , "RotateRightV", "LoadVector","StoreVector",
-    "FmaVD", "FmaVF","PopCountVI",
+    "RoundDoubleModeV", "RotateLeftV" , "RotateRightV", "LoadVector","StoreVector",
+    "FmaVD", "FmaVF","PopCountVI","VectorMaskedLoad","VectorMaskedStore",
     // Next are not supported currently.
     "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
     "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"

src/hotspot/share/opto/arraycopynode.cpp

Lines changed: 29 additions & 0 deletions
@@ -85,6 +85,7 @@ void ArrayCopyNode::connect_outputs(GraphKit* kit, bool deoptimize_on_exception)
     kit->set_all_memory_call(this);
   }
 
+
 #ifndef PRODUCT
 const char* ArrayCopyNode::_kind_names[] = {"arraycopy", "arraycopy, validated arguments", "clone", "oop array clone", "CopyOf", "CopyOfRange"};
 
@@ -670,13 +671,28 @@ bool ArrayCopyNode::may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTra
   CallNode* call = NULL;
   guarantee(c != NULL, "step_over_gc_barrier failed, there must be something to step to.");
   if (c->is_Region()) {
+    PhiNode* phi = NULL;
     for (uint i = 1; i < c->req(); i++) {
       if (c->in(i) != NULL) {
         Node* n = c->in(i)->in(0);
         if (may_modify_helper(t_oop, n, phase, call)) {
           ac = call->isa_ArrayCopy();
           assert(c == mb->in(0), "only for clone");
           return true;
+        } else if (n != NULL && n->is_Region() &&
+                   (phi = n->as_Region()->has_phi()) &&
+                   phi->in(1)->Opcode() == Op_VectorMaskedStore) {
+          return true;
+        } else {
+          for (DUIterator_Fast imax, i = c->fast_outs(imax); i < imax; i++) {
+            Node* phi = c->fast_out(i);
+            if (phi->is_Phi()) {
+              assert(phi->in(0) == c, "phi region validation");
+              if (phi->in(1) && phi->in(1)->Opcode() == Op_VectorMaskedStore) {
+                return true;
+              }
+            }
+          }
         }
       }
     }

@@ -734,3 +750,16 @@ bool ArrayCopyNode::modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransf
   }
   return false;
 }
+
+// As an optimization, choose the optimum vector size for a copy length known at compile time.
+int ArrayCopyNode::get_partial_inline_vector_lane_count(BasicType type, int con_len) {
+  int lane_count = ArrayCopyPartialInlineSize/type2aelembytes(type);
+  if (con_len > 0) {
+    int size_in_bytes = con_len * type2aelembytes(type);
+    if (size_in_bytes <= 16)
+      lane_count = 16/type2aelembytes(type);
+    else if (size_in_bytes > 16 && size_in_bytes <= 32)
+      lane_count = 32/type2aelembytes(type);
+  }
+  return lane_count;
+}
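
Note: worked examples of the lane-count choice above, assuming 4-byte T_INT elements:

    // con_len = 3  -> 12 bytes -> lane_count = 16/4 = 4  (an XMM-sized copy suffices)
    // con_len = 7  -> 28 bytes -> lane_count = 32/4 = 8  (YMM-sized copy)
    // con_len = 12 -> 48 bytes -> lane_count = ArrayCopyPartialInlineSize/4
    // con_len <= 0 (length not a compile-time constant) -> ArrayCopyPartialInlineSize/4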

src/hotspot/share/opto/arraycopynode.hpp

Lines changed: 3 additions & 0 deletions
@@ -180,6 +180,9 @@
   bool has_negative_length_guard() const { return _has_negative_length_guard; }
 
   static bool may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase, ArrayCopyNode*& ac);
+
+  static int get_partial_inline_vector_lane_count(BasicType type, int con_len);
+
   bool modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransform* phase, bool must_modify) const;
 
 #ifndef PRODUCT
