
Commit 38a8191

Authored and committed by Jatin Bhateja
8290322: Optimize Vector.rearrange over byte vectors for AVX512BW targets.
Reviewed-by: kvn, sviswanathan
1 parent 27af014 commit 38a8191

6 files changed: 191 additions, 5 deletions

src/hotspot/cpu/x86/assembler_x86.cpp (12 additions, 0 deletions)

@@ -5121,6 +5121,18 @@ void Assembler::pshufb(XMMRegister dst, XMMRegister src) {
   emit_int16(0x00, (0xC0 | encode));
 }
 
+void Assembler::evpshufb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
+  assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+  int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int16(0x00, (0xC0 | encode));
+}
+
 void Assembler::vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
   assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
          vector_len == AVX_256bit? VM_Version::supports_avx2() :
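
For context, here is a scalar Java sketch (illustrative only, not part of the change and not a HotSpot API) of what a merge-masked, in-lane byte shuffle such as the new evpshufb computes per destination byte; data, control, mask and merge stand in for the nds, src, opmask and merge arguments:

// Scalar model of a merge-masked byte shuffle; one 128-bit lane = 16 bytes.
static byte[] maskedPshufb(byte[] dst, byte[] data, byte[] control, boolean[] mask, boolean merge) {
    byte[] res = dst.clone();
    for (int i = 0; i < control.length; i++) {
        int laneBase = (i / 16) * 16;                   // the shuffle never crosses a 128-bit lane
        if (mask[i]) {
            res[i] = (control[i] & 0x80) != 0
                    ? 0                                 // a set high bit in the control byte selects zero
                    : data[laneBase + (control[i] & 0x0F)];
        } else {
            res[i] = merge ? dst[i] : 0;                // merge-masking keeps dst, zero-masking clears it
        }
    }
    return res;
}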

src/hotspot/cpu/x86/assembler_x86.hpp (2 additions, 0 deletions)

@@ -1915,6 +1915,8 @@ class Assembler : public AbstractAssembler {
   void pshufb(XMMRegister dst, XMMRegister src);
   void pshufb(XMMRegister dst, Address src);
   void vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  void evpshufb(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);
+
 
   // Shuffle Packed Doublewords
   void pshufd(XMMRegister dst, XMMRegister src, int mode);

src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp (46 additions, 0 deletions)

@@ -5680,3 +5680,49 @@ void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, R
 }
 #endif
 
+void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
+                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
+                                        int vlen_enc) {
+  assert(VM_Version::supports_avx512bw(), "");
+  // Byte shuffles are in-lane operations and indices are determined using
+  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
+  // normalized to the index range 0-15. This ensures that indices differing
+  // by a multiple of 16 select the same relative position within a 128-bit
+  // lane, e.g. shuffle indices 0, 16, 32 and 48 all pick the first element
+  // of whichever 128-bit lane is supplied.
+  movl(rtmp, 16);
+  evpbroadcastb(xtmp1, rtmp, vlen_enc);
+
+  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
+  // Broadcast the first 128-bit lane across the entire vector, shuffle the vector lanes using
+  // the original shuffle indices, and move the shuffled lanes corresponding to the true mask
+  // into the destination vector.
+  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
+  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
+  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
+
+  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32
+  // and broadcast the second 128-bit lane.
+  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
+  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
+  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
+  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
+  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
+
+  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48
+  // and broadcast the third 128-bit lane.
+  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
+  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
+  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
+  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
+  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
+
+  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64
+  // and broadcast the fourth 128-bit lane.
+  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
+  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
+  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
+  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
+  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
+}
+
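To see what the four masked passes compute together, here is a scalar Java sketch (illustrative only, not part of the patch; arrays stand in for the vector registers and the opmask) for a 512-bit vector: pass p owns shuffle indices in [16*p, 16*p + 16), broadcasts 128-bit source lane p, and writes only the destination bytes whose index falls in that window.

// Scalar model of rearrange_bytes for a 512-bit (64-byte) vector; the names are made up.
static byte[] rearrangeBytes64(byte[] src, byte[] shuffle) {
    byte[] dst = new byte[64];
    for (int p = 0; p < 4; p++) {                       // one pass per 128-bit source lane
        for (int i = 0; i < 64; i++) {
            int idx = shuffle[i] & 0x3F;                // valid shuffle indices are 0..63
            if (idx >= 16 * p && idx < 16 * (p + 1)) {  // the opmask built by evpcmpb
                // evshufi64x2 broadcasts lane p; evpshufb then selects (idx & 0x0F) within it,
                // which equals src[idx] for the indices owned by this pass
                dst[i] = src[16 * p + (idx & 0x0F)];
            }                                           // else: merge-masking keeps earlier passes
        }
    }
    return dst;
}

After the four merge-masked passes every destination byte equals src[shuffle[i]], even though each individual evpshufb can only reach within a single 128-bit lane.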

src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp (3 additions, 0 deletions)

@@ -458,4 +458,7 @@
 
   void vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, int vec_enc);
 
+  void rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
+                       XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, int vlen_enc);
+
 #endif // CPU_X86_C2_MACROASSEMBLER_X86_HPP

src/hotspot/cpu/x86/x86.ad (17 additions, 5 deletions)

@@ -1848,10 +1848,6 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
       return false; // Implementation limitation due to how shuffle is loaded
     } else if (size_in_bits == 256 && UseAVX < 2) {
       return false; // Implementation limitation
-    } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi()) {
-      return false; // Implementation limitation
-    } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
-      return false; // Implementation limitation
     }
     break;
   case Op_VectorLoadMask:

@@ -8529,7 +8525,23 @@ instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVe
   ins_pipe( pipe_slow );
 %}
 
-instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
+
+instruct rearrangeB_evex(vec dst, vec src, vec shuffle, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegI rtmp) %{
+  predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
+            Matcher::vector_length(n) > 32 && !VM_Version::supports_avx512_vbmi());
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
+  format %{ "vector_rearrange $dst, $shuffle, $src!\t using $xtmp1, $xtmp2, $xtmp3, $rtmp and $ktmp as TEMP" %}
+  ins_encode %{
+    int vlen_enc = vector_length_encoding(this);
+    __ rearrange_bytes($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister,
+                       $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister,
+                       $rtmp$$Register, $ktmp$$KRegister, vlen_enc);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{
   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
   match(Set dst (VectorRearrange src shuffle));
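
At the Java level, the new rule matches VectorRearrange over byte vectors with more than 32 lanes on AVX512BW machines that lack AVX512_VBMI; such shapes previously failed the match_rule_supported_vector check removed above and stayed on the non-intrinsified fallback. A minimal, hypothetical usage sketch with the incubating Vector API (the benchmark below exercises the same operation more thoroughly):

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorShuffle;

// Reverses 64 bytes with a single 512-bit rearrange; on AVX512BW hardware without
// AVX512_VBMI this should now be matched by rearrangeB_evex once C2 compiles it.
static byte[] reverse64(byte[] input) {
    var species = ByteVector.SPECIES_512;
    VectorShuffle<Byte> rev = VectorShuffle.fromOp(species, i -> species.length() - 1 - i);
    byte[] out = new byte[species.length()];
    ByteVector.fromArray(species, input, 0).rearrange(rev).intoArray(out, 0);
    return out;
}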

test/micro/org/openjdk/bench/jdk/incubator/vector/RearrangeBytesBenchmark.java (new file, 111 additions, 0 deletions)
@@ -0,0 +1,111 @@
/*
 * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
package org.openjdk.bench.jdk.incubator.vector;

import java.util.Random;
import jdk.incubator.vector.*;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Thread)
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
public class RearrangeBytesBenchmark {
    @Param({"256", "512", "1024"})
    int size;

    int[][] shuffles;
    byte[] byteinp;
    byte[] byteres;

    static final VectorSpecies<Byte> bspecies64 = ByteVector.SPECIES_64;
    static final VectorSpecies<Byte> bspecies128 = ByteVector.SPECIES_128;
    static final VectorSpecies<Byte> bspecies256 = ByteVector.SPECIES_256;
    static final VectorSpecies<Byte> bspecies512 = ByteVector.SPECIES_512;

    static final byte[] specialvalsbyte = {0, -0, Byte.MIN_VALUE, Byte.MAX_VALUE};

    @Setup(Level.Trial)
    public void BmSetup() {
        Random r = new Random(1024);
        int[] bits = {64, 128, 256, 512};
        byteinp = new byte[size];
        byteres = new byte[size];

        for (int i = 4; i < size; i++) {
            byteinp[i] = (byte)i;
        }
        for (int i = 0; i < specialvalsbyte.length; i++) {
            byteinp[i] = specialvalsbyte[i];
        }

        shuffles = new int[4][];
        for (int i = 0; i < bits.length; i++) {
            int bytes = bits[i] >> 3;
            shuffles[i] = new int[bytes];
            for (int j = 0; j < bytes; j++) {
                shuffles[i][j] = r.nextInt(bytes - 1);
            }
        }
    }

    @Benchmark
    public void testRearrangeBytes64() {
        VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies512, shuffles[3], 0);
        for (int j = 0; j < bspecies512.loopBound(size); j += bspecies512.length()) {
            ByteVector.fromArray(bspecies512, byteinp, j)
                      .rearrange(shuffle)
                      .intoArray(byteres, j);
        }
    }

    @Benchmark
    public void testRearrangeBytes32() {
        VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies256, shuffles[2], 0);
        for (int j = 0; j < bspecies256.loopBound(size); j += bspecies256.length()) {
            ByteVector.fromArray(bspecies256, byteinp, j)
                      .rearrange(shuffle)
                      .intoArray(byteres, j);
        }
    }

    @Benchmark
    public void testRearrangeBytes16() {
        VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies128, shuffles[1], 0);
        for (int j = 0; j < bspecies128.loopBound(size); j += bspecies128.length()) {
            ByteVector.fromArray(bspecies128, byteinp, j)
                      .rearrange(shuffle)
                      .intoArray(byteres, j);
        }
    }

    @Benchmark
    public void testRearrangeBytes8() {
        VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspecies64, shuffles[0], 0);
        for (int j = 0; j < bspecies64.loopBound(size); j += bspecies64.length()) {
            ByteVector.fromArray(bspecies64, byteinp, j)
                      .rearrange(shuffle)
                      .intoArray(byteres, j);
        }
    }
}
