Closed
Description
Here is the code from the GCC testsuite.
https://godbolt.org/z/jzdcsfxx4
/* Vector types declared with the GCC vector_size attribute.  The attribute
   argument is the total vector size in bytes, so the first four types are
   16-byte vectors and the last two are 8-byte vectors.  The names appear to
   follow GCC's lane-count + machine-mode convention (e.g. v8hi = 8 halfword
   lanes), which holds when short/int/long long are 16/32/64 bits wide.
   v16qi is not used by any function visible in this snippet.  */
typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;
/* Lane-wise rotate of R by 32 bits, written as the OR of a right shift and
   a left shift by the same amount.  Shifts on GCC vector types apply to
   each lane independently.  Assuming unsigned long long is 64 bits wide,
   32 is half the lane width, so this swaps the two 32-bit halves of every
   lane.  */
v2di
G1 (v2di r)
{
return (r >> 32) | (r << 32);
}
/* Lane-wise rotate of R by 16 bits, written as the OR of a right shift and
   a left shift by the same amount.  Assuming unsigned int is 32 bits wide,
   16 is half the lane width, so this swaps the two 16-bit halves of every
   lane of the 16-byte vector.  */
v4si
G2 (v4si r)
{
return (r >> 16) | (r << 16);
}
/* Lane-wise rotate of R by 8 bits, written as the OR of a right shift and
   a left shift by the same amount.  Vector shifts are performed in the lane
   type (no promotion to int), so bits shifted out of a lane are discarded.
   Assuming unsigned short is 16 bits wide, this swaps the two bytes of
   every lane of the 16-byte vector.  */
v8hi
G3 (v8hi r)
{
return (r >> 8) | (r << 8);
}
/* 8-byte-vector counterpart of the 16-bit rotate: each unsigned int lane of
   R is ORed from its right shift and left shift by 16.  Assuming unsigned
   int is 32 bits wide, this swaps the two 16-bit halves of every lane.  */
v2si
G4 (v2si r)
{
return (r >> 16) | (r << 16);
}
/* 8-byte-vector counterpart of the 8-bit rotate: each unsigned short lane
   of R is ORed from its right shift and left shift by 8.  Vector shifts are
   performed in the lane type, so assuming unsigned short is 16 bits wide,
   this swaps the two bytes of every lane.  */
v4hi
G5 (v4hi r)
{
return (r >> 8) | (r << 8);
}
GCC efficiently uses rev32 or rev64 to complete the operation in a single instruction.