Skip to content

Commit

Permalink
[X86] Add reduce_*_ep[i|u]8/16 series intrinsics.
Browse files Browse the repository at this point in the history
Reviewed By: pengfei, skan

Differential Revision: https://reviews.llvm.org/D140531
  • Loading branch information
FreddyLeaf committed Dec 23, 2022
1 parent c89db6a commit 68a8880
Show file tree
Hide file tree
Showing 3 changed files with 774 additions and 0 deletions.
2 changes: 2 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,8 @@ X86 Support in Clang
- ``-march=raptorlake`` and ``-march=meteorlake`` are now supported.
- ``-march=sierraforest``, ``-march=graniterapids`` and ``-march=grandridge`` are now supported.
- Raise the maximum supported bit width of ``_BitInt()`` from 128 to 8388608.
- Support intrinsics of ``_mm(256)_reduce_(add|mul|or|and)_epi8/16``.
- Support intrinsics of ``_mm(256)_reduce_(max|min)_ep[i|u]8/16``.

WebAssembly Support in Clang
----------------------------
Expand Down
352 changes: 352 additions & 0 deletions clang/lib/Headers/avx512vlbwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -2803,6 +2803,358 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
(__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
(__v16hi)_mm256_setzero_si256()))

/// Horizontal sum of the eight signed 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi16(__m128i __W) {
  __v8hi __T = (__v8hi)__W;
  return __builtin_reduce_add(__T);
}

/// Horizontal product of the eight signed 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi16(__m128i __W) {
  __v8hi __T = (__v8hi)__W;
  return __builtin_reduce_mul(__T);
}

/// Bitwise AND across the eight 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi16(__m128i __W) {
  __v8hi __T = (__v8hi)__W;
  return __builtin_reduce_and(__T);
}

/// Bitwise OR across the eight 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi16(__m128i __W) {
  __v8hi __T = (__v8hi)__W;
  return __builtin_reduce_or(__T);
}

/// Sum of the lanes of \a __W selected by \a __M; unselected lanes
/// contribute 0, the additive identity.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi16(__mmask8 __M, __m128i __W) {
  return __builtin_reduce_add((__v8hi)_mm_maskz_mov_epi16(__M, __W));
}

/// Product of the lanes of \a __W selected by \a __M; unselected lanes
/// contribute 1, the multiplicative identity.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi16(__mmask8 __M, __m128i __W) {
  return __builtin_reduce_mul(
      (__v8hi)_mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W));
}

/// Bitwise AND of the lanes of \a __W selected by \a __M; unselected lanes
/// contribute all-ones, the AND identity.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi16(__mmask8 __M, __m128i __W) {
  return __builtin_reduce_and(
      (__v8hi)_mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W));
}

/// Bitwise OR of the lanes of \a __W selected by \a __M; unselected lanes
/// contribute 0, the OR identity.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) {
  return __builtin_reduce_or((__v8hi)_mm_maskz_mov_epi16(__M, __W));
}

/// Horizontal maximum of the eight signed 16-bit lanes of \a __V.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi16(__m128i __V) {
return __builtin_reduce_max((__v8hi)__V);
}

/// Horizontal maximum of the eight unsigned 16-bit lanes of \a __V.
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu16(__m128i __V) {
return __builtin_reduce_max((__v8hu)__V);
}

/// Horizontal minimum of the eight signed 16-bit lanes of \a __V.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi16(__m128i __V) {
return __builtin_reduce_min((__v8hi)__V);
}

/// Horizontal minimum of the eight unsigned 16-bit lanes of \a __V.
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu16(__m128i __V) {
return __builtin_reduce_min((__v8hu)__V);
}

/// Horizontal signed max of the eight 16-bit lanes of \a __V; lanes whose
/// bit in \a __M is clear are replaced with INT16_MIN (-32768), the identity
/// for signed max.
///
/// NOTE(review): mask type corrected from __mmask16 to __mmask8 — a __m128i
/// holds only eight 16-bit lanes, and _mm_mask_mov_epi16 /
/// _mm_maskz_mov_epi16 take __mmask8, matching _mm_mask_reduce_add_epi16
/// above. __mmask16 silently truncated the meaningless upper mask bits.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi16(__mmask8 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V);
  return __builtin_reduce_max((__v8hi)__V);
}

/// Horizontal unsigned max of the eight 16-bit lanes of \a __V; inactive
/// lanes are zeroed (0 is the identity for unsigned max).
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu16(__mmask8 __M, __m128i __V) {
  __V = _mm_maskz_mov_epi16(__M, __V);
  return __builtin_reduce_max((__v8hu)__V);
}

/// Horizontal signed min of the eight 16-bit lanes of \a __V; inactive lanes
/// are replaced with INT16_MAX (32767), the identity for signed min.
static __inline__ short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi16(__mmask8 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V);
  return __builtin_reduce_min((__v8hi)__V);
}

/// Horizontal unsigned min of the eight 16-bit lanes of \a __V; inactive
/// lanes are replaced with all-ones (0xFFFF), the identity for unsigned min.
static __inline__ unsigned short __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu16(__mmask8 __M, __m128i __V) {
  __V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V);
  return __builtin_reduce_min((__v8hu)__V);
}

/// Horizontal sum of the sixteen signed 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi16(__m256i __W) {
return __builtin_reduce_add((__v16hi)__W);
}

/// Horizontal product of the sixteen signed 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi16(__m256i __W) {
return __builtin_reduce_mul((__v16hi)__W);
}

/// Bitwise AND across the sixteen 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi16(__m256i __W) {
return __builtin_reduce_and((__v16hi)__W);
}

/// Bitwise OR across the sixteen 16-bit lanes of \a __W.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi16(__m256i __W) {
return __builtin_reduce_or((__v16hi)__W);
}

/// Masked horizontal sum: lanes of \a __W whose bit in \a __M is clear are
/// zeroed (0 is the additive identity) before summing.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi16( __mmask16 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi16(__M, __W);
return __builtin_reduce_add((__v16hi)__W);
}

/// Masked horizontal product: inactive lanes are replaced with 1, the
/// multiplicative identity.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi16( __mmask16 __M, __m256i __W) {
__W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W);
return __builtin_reduce_mul((__v16hi)__W);
}

/// Masked horizontal AND: inactive lanes are replaced with all-ones, the
/// AND identity.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi16( __mmask16 __M, __m256i __W) {
__W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W);
return __builtin_reduce_and((__v16hi)__W);
}

/// Masked horizontal OR: inactive lanes are zeroed (0 is the OR identity).
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi16(__M, __W);
return __builtin_reduce_or((__v16hi)__W);
}

/// Horizontal maximum of the sixteen signed 16-bit lanes of \a __V.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi16(__m256i __V) {
return __builtin_reduce_max((__v16hi)__V);
}

/// Horizontal maximum of the sixteen unsigned 16-bit lanes of \a __V.
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu16(__m256i __V) {
return __builtin_reduce_max((__v16hu)__V);
}

/// Horizontal minimum of the sixteen signed 16-bit lanes of \a __V.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi16(__m256i __V) {
return __builtin_reduce_min((__v16hi)__V);
}

/// Horizontal minimum of the sixteen unsigned 16-bit lanes of \a __V.
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu16(__m256i __V) {
return __builtin_reduce_min((__v16hu)__V);
}

/// Masked signed max: inactive lanes are replaced with INT16_MIN (-32768),
/// the identity for signed max.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) {
__V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V);
return __builtin_reduce_max((__v16hi)__V);
}

/// Masked unsigned max: inactive lanes are zeroed (0 is the identity for
/// unsigned max).
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) {
__V = _mm256_maskz_mov_epi16(__M, __V);
return __builtin_reduce_max((__v16hu)__V);
}

/// Masked signed min: inactive lanes are replaced with INT16_MAX (32767),
/// the identity for signed min.
static __inline__ short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) {
__V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V);
return __builtin_reduce_min((__v16hi)__V);
}

/// Masked unsigned min: inactive lanes are replaced with all-ones (0xFFFF),
/// the identity for unsigned min.
static __inline__ unsigned short __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) {
__V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V);
return __builtin_reduce_min((__v16hu)__V);
}

/// Horizontal sum of the sixteen signed 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_add_epi8(__m128i __W) {
return __builtin_reduce_add((__v16qs)__W);
}

/// Horizontal product of the sixteen signed 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_mul_epi8(__m128i __W) {
return __builtin_reduce_mul((__v16qs)__W);
}

/// Bitwise AND across the sixteen 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_and_epi8(__m128i __W) {
return __builtin_reduce_and((__v16qs)__W);
}

/// Bitwise OR across the sixteen 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_or_epi8(__m128i __W) {
return __builtin_reduce_or((__v16qs)__W);
}

/// Masked horizontal sum: lanes of \a __W whose bit in \a __M is clear are
/// zeroed (0 is the additive identity) before summing.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_add_epi8( __mmask16 __M, __m128i __W) {
__W = _mm_maskz_mov_epi8(__M, __W);
return __builtin_reduce_add((__v16qs)__W);
}

/// Masked horizontal product: inactive lanes are replaced with 1, the
/// multiplicative identity.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_mul_epi8( __mmask16 __M, __m128i __W) {
__W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W);
return __builtin_reduce_mul((__v16qs)__W);
}

/// Masked horizontal AND: inactive lanes are replaced with all-ones, the
/// AND identity.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_and_epi8( __mmask16 __M, __m128i __W) {
__W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W);
return __builtin_reduce_and((__v16qs)__W);
}

/// Masked horizontal OR: inactive lanes are zeroed (0 is the OR identity).
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) {
__W = _mm_maskz_mov_epi8(__M, __W);
return __builtin_reduce_or((__v16qs)__W);
}

/// Horizontal maximum of the sixteen signed 8-bit lanes of \a __V.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epi8(__m128i __V) {
return __builtin_reduce_max((__v16qs)__V);
}

/// Horizontal maximum of the sixteen unsigned 8-bit lanes of \a __V.
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_max_epu8(__m128i __V) {
return __builtin_reduce_max((__v16qu)__V);
}

/// Horizontal minimum of the sixteen signed 8-bit lanes of \a __V.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epi8(__m128i __V) {
return __builtin_reduce_min((__v16qs)__V);
}

/// Horizontal minimum of the sixteen unsigned 8-bit lanes of \a __V.
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_reduce_min_epu8(__m128i __V) {
return __builtin_reduce_min((__v16qu)__V);
}

/// Masked signed max: inactive lanes are replaced with INT8_MIN (-128), the
/// identity for signed max.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V);
return __builtin_reduce_max((__v16qs)__V);
}

/// Masked unsigned max: inactive lanes are zeroed (0 is the identity for
/// unsigned max).
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) {
__V = _mm_maskz_mov_epi8(__M, __V);
return __builtin_reduce_max((__v16qu)__V);
}

/// Masked signed min: inactive lanes are replaced with INT8_MAX (127), the
/// identity for signed min.
static __inline__ signed char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V);
return __builtin_reduce_min((__v16qs)__V);
}

/// Masked unsigned min: inactive lanes are replaced with all-ones (0xFF),
/// the identity for unsigned min.
static __inline__ unsigned char __DEFAULT_FN_ATTRS128
_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) {
__V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V);
return __builtin_reduce_min((__v16qu)__V);
}

/// Horizontal sum of the thirty-two signed 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_add_epi8(__m256i __W) {
return __builtin_reduce_add((__v32qs)__W);
}

/// Horizontal product of the thirty-two signed 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_epi8(__m256i __W) {
return __builtin_reduce_mul((__v32qs)__W);
}

/// Bitwise AND across the thirty-two 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_and_epi8(__m256i __W) {
return __builtin_reduce_and((__v32qs)__W);
}

/// Bitwise OR across the thirty-two 8-bit lanes of \a __W.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_or_epi8(__m256i __W) {
return __builtin_reduce_or((__v32qs)__W);
}

/// Masked horizontal sum: lanes of \a __W whose bit in \a __M is clear are
/// zeroed (0 is the additive identity) before summing.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_add_epi8( __mmask32 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi8(__M, __W);
return __builtin_reduce_add((__v32qs)__W);
}

/// Masked horizontal product: inactive lanes are replaced with 1, the
/// multiplicative identity.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_mul_epi8( __mmask32 __M, __m256i __W) {
__W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W);
return __builtin_reduce_mul((__v32qs)__W);
}

/// Masked horizontal AND: inactive lanes are replaced with all-ones, the
/// AND identity.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_and_epi8( __mmask32 __M, __m256i __W) {
__W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W);
return __builtin_reduce_and((__v32qs)__W);
}

/// Masked horizontal OR: inactive lanes are zeroed (0 is the OR identity).
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) {
__W = _mm256_maskz_mov_epi8(__M, __W);
return __builtin_reduce_or((__v32qs)__W);
}

/// Horizontal maximum of the thirty-two signed 8-bit lanes of \a __V.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epi8(__m256i __V) {
return __builtin_reduce_max((__v32qs)__V);
}

/// Horizontal maximum of the thirty-two unsigned 8-bit lanes of \a __V.
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_max_epu8(__m256i __V) {
return __builtin_reduce_max((__v32qu)__V);
}

/// Horizontal minimum of the thirty-two signed 8-bit lanes of \a __V.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epi8(__m256i __V) {
return __builtin_reduce_min((__v32qs)__V);
}

/// Horizontal minimum of the thirty-two unsigned 8-bit lanes of \a __V.
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_reduce_min_epu8(__m256i __V) {
return __builtin_reduce_min((__v32qu)__V);
}

/// Masked signed max: inactive lanes are replaced with INT8_MIN (-128), the
/// identity for signed max.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) {
__V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V);
return __builtin_reduce_max((__v32qs)__V);
}

/// Masked unsigned max: inactive lanes are zeroed (0 is the identity for
/// unsigned max).
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) {
__V = _mm256_maskz_mov_epi8(__M, __V);
return __builtin_reduce_max((__v32qu)__V);
}

/// Masked signed min: inactive lanes are replaced with INT8_MAX (127), the
/// identity for signed min.
static __inline__ signed char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) {
__V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V);
return __builtin_reduce_min((__v32qs)__V);
}

/// Masked unsigned min: inactive lanes are replaced with all-ones (0xFF),
/// the identity for unsigned min.
static __inline__ unsigned char __DEFAULT_FN_ATTRS256
_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) {
__V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V);
return __builtin_reduce_min((__v32qu)__V);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

Expand Down
Loading

0 comments on commit 68a8880

Please sign in to comment.