|
15 | 15 | #define __AMXFP8INTRIN_H
|
16 | 16 | #ifdef __x86_64__
|
17 | 17 |
|
18 |
| -/// Peform the dot product of a BF8 value \a a by a BF8 value \a b accumulating |
19 |
| -/// into a Single Precision (FP32) source/dest \a dst. |
| 18 | +#define __DEFAULT_FN_ATTRS_FP8 \ |
| 19 | + __attribute__((__always_inline__, __nodebug__, __target__("amx-fp8"))) |
| 20 | + |
| 21 | +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 |
| 22 | +_tile_dpbf8ps_internal(unsigned short m, unsigned short n, unsigned short k, |
| 23 | + _tile1024i dst, _tile1024i src1, _tile1024i src2) { |
| 24 | + return __builtin_ia32_tdpbf8ps_internal(m, n, k, dst, src1, src2); |
| 25 | +} |
| 26 | + |
| 27 | +/// Perform the dot product of a BF8 value \a src1 by a BF8 value \a src2 |
| 28 | +/// accumulating into a Single Precision (FP32) source/dest \a dst. |
20 | 29 | ///
|
21 | 30 | /// \headerfile <immintrin.h>
|
22 | 31 | ///
|
23 | 32 | /// \code
|
24 |
| -/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b) |
| 33 | +/// void __tile_dpbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2) |
| 34 | +/// \endcode |
| 35 | +/// |
| 36 | +/// \code{.operation} |
| 37 | +/// FOR m := 0 TO dst.rows - 1 |
| 38 | +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 |
| 39 | +/// FOR k := 0 TO src1.colsb / 4 - 1 |
| 40 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 41 | +/// temp1[n] += |
| 42 | +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) |
| 43 | +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) |
| 44 | +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) |
| 45 | +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) |
| 46 | +/// ENDFOR |
| 47 | +/// ENDFOR |
| 48 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 49 | +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) |
| 50 | +/// ENDFOR |
| 51 | +/// write_row_and_zero(dst, m, tmp, dst.colsb) |
| 52 | +/// zero_upper_rows(dst, dst.rows) |
| 53 | +/// zero_tileconfig_start() |
25 | 54 | /// \endcode
|
26 | 55 | ///
|
27 | 56 | /// This intrinsic corresponds to the \c TDPBF8PS instruction.
|
28 | 57 | ///
|
29 | 58 | /// \param dst
|
30 | 59 | /// Pointer to the source/destination tile. Max size is 1024 Bytes.
|
31 |
| -/// \param a |
| 60 | +/// \param src1 |
32 | 61 | /// The 1st source tile. Max size is 1024 Bytes.
|
33 |
| -/// \param b |
| 62 | +/// \param src2 |
34 | 63 | /// The 2nd source tile. Max size is 1024 Bytes.
|
35 |
| -#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b)) |
| 64 | +__DEFAULT_FN_ATTRS_FP8 static void |
| 65 | +__tile_dpbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { |
| 66 | + dst->tile = _tile_dpbf8ps_internal(src1.row, src2.col, src1.col, dst->tile, |
| 67 | + src1.tile, src2.tile); |
| 68 | +} |
36 | 69 |
|
37 |
| -/// Perform the dot product of a BF8 value \a a by an HF8 value \a b |
| 70 | +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 |
| 71 | +_tile_dpbhf8ps_internal(unsigned short m, unsigned short n, unsigned short k, |
| 72 | + _tile1024i dst, _tile1024i src1, _tile1024i src2) { |
| 73 | + return __builtin_ia32_tdpbhf8ps_internal(m, n, k, dst, src1, src2); |
| 74 | +} |
| 75 | + |
| 76 | +/// Perform the dot product of a BF8 value \a src1 by an HF8 value \a src2 |
38 | 77 | /// accumulating into a Single Precision (FP32) source/dest \a dst.
|
39 | 78 | ///
|
40 | 79 | /// \headerfile <immintrin.h>
|
41 | 80 | ///
|
42 | 81 | /// \code
|
43 |
| -/// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b) |
| 82 | +/// void __tile_dpbhf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2) |
| 83 | +/// \endcode |
| 84 | +/// |
| 85 | +/// \code{.operation} |
| 86 | +/// FOR m := 0 TO dst.rows - 1 |
| 87 | +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 |
| 88 | +/// FOR k := 0 TO src1.colsb / 4 - 1 |
| 89 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 90 | +/// temp1[n] += |
| 91 | +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) |
| 92 | +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) |
| 93 | +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) |
| 94 | +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) |
| 95 | +/// ENDFOR |
| 96 | +/// ENDFOR |
| 97 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 98 | +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) |
| 99 | +/// ENDFOR |
| 100 | +/// write_row_and_zero(dst, m, tmp, dst.colsb) |
| 101 | +/// zero_upper_rows(dst, dst.rows) |
| 102 | +/// zero_tileconfig_start() |
44 | 103 | /// \endcode
|
45 | 104 | ///
|
46 | 105 | /// This intrinsic corresponds to the \c TDPBHF8PS instruction.
|
47 | 106 | ///
|
48 | 107 | /// \param dst
|
49 | 108 | /// Pointer to the source/destination tile. Max size is 1024 Bytes.
|
50 |
| -/// \param a |
| 109 | +/// \param src1 |
51 | 110 | /// The 1st source tile. Max size is 1024 Bytes.
|
52 |
| -/// \param b |
| 111 | +/// \param src2 |
53 | 112 | /// The 2nd source tile. Max size is 1024 Bytes.
|
54 |
| -#define _tile_dpbhf8ps(dst, a, b) __builtin_ia32_tdpbhf8ps((dst), (a), (b)) |
| 113 | +__DEFAULT_FN_ATTRS_FP8 static void |
| 114 | +__tile_dpbhf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { |
| 115 | + dst->tile = _tile_dpbhf8ps_internal(src1.row, src2.col, src1.col, dst->tile, |
| 116 | + src1.tile, src2.tile); |
| 117 | +} |
| 118 | + |
| 119 | +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 |
| 120 | +_tile_dphbf8ps_internal(unsigned short m, unsigned short n, unsigned short k, |
| 121 | + _tile1024i dst, _tile1024i src1, _tile1024i src2) { |
| 122 | + return __builtin_ia32_tdphbf8ps_internal(m, n, k, dst, src1, src2); |
| 123 | +} |
55 | 124 |
|
56 |
| -/// Perform the dot product of an HF8 value \a a by a BF8 value \a b |
| 125 | +/// Perform the dot product of an HF8 value \a src1 by a BF8 value \a src2 |
57 | 126 | /// accumulating into a Single Precision (FP32) source/dest \a dst.
|
58 | 127 | ///
|
59 | 128 | /// \headerfile <immintrin.h>
|
60 | 129 | ///
|
61 | 130 | /// \code
|
62 |
| -/// void _tile_dphbf8ps (__tile dst, __tile a, __tile b) |
| 131 | +/// void __tile_dphbf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2) |
| 132 | +/// \endcode |
| 133 | +/// |
| 134 | +/// \code{.operation} |
| 135 | +/// FOR m := 0 TO dst.rows - 1 |
| 136 | +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 |
| 137 | +/// FOR k := 0 TO src1.colsb / 4 - 1 |
| 138 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 139 | +/// temp1[n] += |
| 140 | +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) |
| 141 | +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) |
| 142 | +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) |
| 143 | +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) |
| 144 | +/// ENDFOR |
| 145 | +/// ENDFOR |
| 146 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 147 | +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) |
| 148 | +/// ENDFOR |
| 149 | +/// write_row_and_zero(dst, m, tmp, dst.colsb) |
| 150 | +/// zero_upper_rows(dst, dst.rows) |
| 151 | +/// zero_tileconfig_start() |
63 | 152 | /// \endcode
|
64 | 153 | ///
|
65 | 154 | /// This intrinsic corresponds to the \c TDPHBF8PS instruction.
|
66 | 155 | ///
|
67 | 156 | /// \param dst
|
68 | 157 | /// Pointer to the source/destination tile. Max size is 1024 Bytes.
|
69 |
| -/// \param a |
| 158 | +/// \param src1 |
70 | 159 | /// The 1st source tile. Max size is 1024 Bytes.
|
71 |
| -/// \param b |
| 160 | +/// \param src2 |
72 | 161 | /// The 2nd source tile. Max size is 1024 Bytes.
|
73 |
| -#define _tile_dphbf8ps(dst, a, b) __builtin_ia32_tdphbf8ps((dst), (a), (b)) |
74 | 162 |
|
75 |
| -/// Perform the dot product of an HF8 value \a a by an HF8 value \a b |
| 163 | +__DEFAULT_FN_ATTRS_FP8 static void |
| 164 | +__tile_dphbf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { |
| 165 | + dst->tile = _tile_dphbf8ps_internal(src1.row, src2.col, src1.col, dst->tile, |
| 166 | + src1.tile, src2.tile); |
| 167 | +} |
| 168 | + |
| 169 | +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP8 |
| 170 | +_tile_dphf8ps_internal(unsigned short m, unsigned short n, unsigned short k, |
| 171 | + _tile1024i dst, _tile1024i src1, _tile1024i src2) { |
| 172 | + return __builtin_ia32_tdphf8ps_internal(m, n, k, dst, src1, src2); |
| 173 | +} |
| 174 | + |
| 175 | +/// Perform the dot product of an HF8 value \a src1 by an HF8 value \a src2 |
76 | 176 | /// accumulating into a Single Precision (FP32) source/dest \a dst.
|
77 | 177 | ///
|
78 | 178 | /// \headerfile <immintrin.h>
|
79 | 179 | ///
|
80 | 180 | /// \code
|
81 |
| -/// void _tile_dphf8ps (__tile dst, __tile a, __tile b) |
| 181 | +/// void __tile_dphf8ps (__tile1024i *dst, __tile1024i src1, __tile1024i src2) |
| 182 | +/// \endcode |
| 183 | +/// |
| 184 | +/// \code{.operation} |
| 185 | +/// FOR m := 0 TO dst.rows - 1 |
| 186 | +/// temp1[(dst.colsb / 4 - 1) : 0] = 0 |
| 187 | +/// FOR k := 0 TO src1.colsb / 4 - 1 |
| 188 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 189 | +/// temp1[n] += |
| 190 | +/// INT64(src1.row[m].float8[4*k+0]) * INT64(src2.row[k].float8[4*n+0]) |
| 191 | +/// + INT64(src1.row[m].float8[4*k+1]) * INT64(src2.row[k].float8[4*n+1]) |
| 192 | +/// + INT64(src1.row[m].float8[4*k+2]) * INT64(src2.row[k].float8[4*n+2]) |
| 193 | +/// + INT64(src1.row[m].float8[4*k+3]) * INT64(src2.row[k].float8[4*n+3]) |
| 194 | +/// ENDFOR |
| 195 | +/// ENDFOR |
| 196 | +/// FOR n := 0 TO dst.colsb / 4 - 1 |
| 197 | +/// tmp.row[m].fp32[n] = dst.row[m].fp32[n] + FP32(temp1[n]) |
| 198 | +/// ENDFOR |
| 199 | +/// write_row_and_zero(dst, m, tmp, dst.colsb) |
| 200 | +/// zero_upper_rows(dst, dst.rows) |
| 201 | +/// zero_tileconfig_start() |
82 | 202 | /// \endcode
|
83 | 203 | ///
|
84 | 204 | /// This intrinsic corresponds to the \c TDPHF8PS instruction.
|
85 | 205 | ///
|
86 | 206 | /// \param dst
|
87 | 207 | /// Pointer to the source/destination tile. Max size is 1024 Bytes.
|
88 |
| -/// \param a |
| 208 | +/// \param src1 |
89 | 209 | /// The 1st source tile. Max size is 1024 Bytes.
|
90 |
| -/// \param b |
| 210 | +/// \param src2 |
91 | 211 | /// The 2nd source tile. Max size is 1024 Bytes.
|
92 |
| -#define _tile_dphf8ps(dst, a, b) __builtin_ia32_tdphf8ps((dst), (a), (b)) |
| 212 | +__DEFAULT_FN_ATTRS_FP8 static void |
| 213 | +__tile_dphf8ps(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { |
| 214 | + dst->tile = _tile_dphf8ps_internal(src1.row, src2.col, src1.col, dst->tile, |
| 215 | + src1.tile, src2.tile); |
| 216 | +} |
| 217 | + |
| 218 | +#define _tile_dpbf8ps(dst, src1, src2) \ |
| 219 | + __builtin_ia32_tdpbf8ps((dst), (src1), (src2)) |
| 220 | +#define _tile_dpbhf8ps(dst, src1, src2) \ |
| 221 | + __builtin_ia32_tdpbhf8ps((dst), (src1), (src2)) |
| 222 | +#define _tile_dphbf8ps(dst, src1, src2) \ |
| 223 | + __builtin_ia32_tdphbf8ps((dst), (src1), (src2)) |
| 224 | +#define _tile_dphf8ps(dst, src1, src2) \ |
| 225 | + __builtin_ia32_tdphf8ps((dst), (src1), (src2)) |
| 226 | + |
| 227 | +#undef __DEFAULT_FN_ATTRS_FP8 |
93 | 228 |
|
94 | 229 | #endif /* __x86_64__ */
|
95 | 230 | #endif /* __AMXFP8INTRIN_H */
|
0 commit comments