Skip to content

Commit d3d5254

Browse files
committed
[main] Combine SME slice parameters
Previously the (alpha) SME intrinsics were documented to take two slice parameters: a 32-bit variable index and a 64-bit constant offset. However, it isn't very C-like to split an addition in this way, and as Sander points out, it isn't really consistent with the way that we handle vnum parameters. The patch also removes a specific reference to w12-w15, since some SME2 instructions use w8-w11.
1 parent 8f74df9 commit d3d5254

File tree

1 file changed

+58
-48
lines changed

1 file changed

+58
-48
lines changed

main/acle.md

Lines changed: 58 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9001,9 +9001,18 @@ following it. --><span id="__arm_za_disable"></span>
90019001

90029002
The intrinsics in this section have the following properties in common:
90039003

9004-
* Every argument named `tile`, `slice_offset` or `tile_mask` must
9005-
be an integer constant expression in the range of the underlying
9006-
instruction.
9004+
* Every argument named `tile` or `tile_mask` must be an integer constant
9005+
expression in the range of the underlying instruction.
9006+
9007+
* Some SME instructions index ZA using the sum of a 32-bit general-purpose
9008+
register and a constant offset. Instead of having arguments for the
9009+
two individual fields, the associated intrinsics have a single
9010+
32-bit index called `slice` that holds the sum.
9011+
9012+
* However, load and store intrinsics that take both a `vnum` parameter
9013+
and a `slice` parameter add `vnum` to `slice`. This helps to ensure
9014+
that the load/store address and ZA index remain balanced, and
9015+
increases the chances that an immediate offset can be used.
90079016

90089017
* ZA loads and stores do not use typed pointers, since there is
90099018
no C or C++ type information associated with the contents of ZA.
@@ -9017,74 +9026,85 @@ The intrinsics in this section have the following properties in common:
90179026
``` c
90189027
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90199028
__attribute__((arm_streaming, arm_shared_za))
9020-
void svld1_hor_za8(uint64_t tile, uint32_t slice_base,
9021-
uint64_t slice_offset, svbool_t pg, const void *ptr);
9029+
void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9030+
const void *ptr);
90229031

9023-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9032+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9033+
// address given by ptr.
9034+
//
90249035
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90259036
__attribute__((arm_streaming, arm_shared_za))
9026-
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9027-
uint64_t slice_offset, svbool_t pg,
9037+
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90289038
const void *ptr, int64_t vnum);
90299039

90309040
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90319041
__attribute__((arm_streaming, arm_shared_za))
9032-
void svld1_ver_za8(uint64_t tile, uint32_t slice_base,
9033-
uint64_t slice_offset, svbool_t pg, const void *ptr);
9042+
void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9043+
const void *ptr);
90349044

9035-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9045+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9046+
// address given by ptr.
9047+
//
90369048
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90379049
__attribute__((arm_streaming, arm_shared_za))
9038-
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9039-
uint64_t slice_offset, svbool_t pg,
9050+
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90409051
const void *ptr, int64_t vnum);
90419052
```
90429053

90439054
#### LDR
90449055

90459056
``` c
9046-
// slice_offset fills the role of the usual vnum parameter.
90479057
__attribute__((arm_streaming_compatible, arm_shared_za))
9048-
void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset,
9049-
const void *ptr);
9058+
void svldr_za(uint32_t slice, const void *ptr);
9059+
9060+
// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9061+
// This can be done in a single instruction if vnum is a constant in the
9062+
// range [0, 15]. The intrinsic is synthetic for any other vnum argument.
9063+
__attribute__((arm_streaming_compatible, arm_shared_za))
9064+
void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum);
90509065
```
90519066

90529067
#### ST1B, ST1H, ST1W, ST1D, ST1Q
90539068

90549069
``` c
90559070
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90569071
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9057-
void svst1_hor_za8(uint64_t tile, uint32_t slice_base,
9058-
uint64_t slice_offset, svbool_t pg,
9072+
void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90599073
void *ptr);
90609074

9061-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9075+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9076+
// address given by ptr.
9077+
//
90629078
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90639079
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9064-
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9065-
uint64_t slice_offset, svbool_t pg,
9080+
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90669081
void *ptr, int64_t vnum);
90679082

90689083
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90699084
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9070-
void svst1_ver_za8(uint64_t tile, uint32_t slice_base,
9071-
uint64_t slice_offset, svbool_t pg,
9085+
void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90729086
void *ptr);
90739087

9074-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9088+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9089+
// address given by ptr.
9090+
//
90759091
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90769092
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9077-
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9078-
uint64_t slice_offset, svbool_t pg,
9093+
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90799094
void *ptr, int64_t vnum);
90809095
```
90819096

90829097
#### STR
90839098

90849099
``` c
9085-
// slice_offset fills the role of the usual vnum parameter.
90869100
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9087-
void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr);
9101+
void svstr_za(uint32_t slice, void *ptr);
9102+
9103+
// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9104+
// This can be done in a single instruction if vnum is a constant in the
9105+
// range [0, 15]. The intrinsic is synthetic for any other vnum argument.
9106+
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9107+
void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum);
90889108
```
90899109

90909110
#### MOVA
@@ -9098,32 +9118,27 @@ parameter both have type `svuint8_t`.
90989118
// And similarly for u8.
90999119
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91009120
svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg,
9101-
uint64_t tile, uint32_t slice_base,
9102-
uint64_t slice_offset);
9121+
uint64_t tile, uint32_t slice);
91039122

91049123
// And similarly for u16, bf16 and f16.
91059124
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91069125
svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg,
9107-
uint64_t tile, uint32_t slice_base,
9108-
uint64_t slice_offset);
9126+
uint64_t tile, uint32_t slice);
91099127

91109128
// And similarly for u32 and f32.
91119129
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91129130
svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg,
9113-
uint64_t tile, uint32_t slice_base,
9114-
uint64_t slice_offset);
9131+
uint64_t tile, uint32_t slice);
91159132

91169133
// And similarly for u64 and f64.
91179134
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91189135
svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg,
9119-
uint64_t tile, uint32_t slice_base,
9120-
uint64_t slice_offset);
9136+
uint64_t tile, uint32_t slice);
91219137

91229138
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
91239139
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91249140
svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg,
9125-
uint64_t tile, uint32_t slice_base,
9126-
uint64_t slice_offset);
9141+
uint64_t tile, uint32_t slice);
91279142
```
91289143

91299144
Replacing `_hor` with `_ver` gives the associated vertical forms.
@@ -9135,32 +9150,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`.
91359150
``` c
91369151
// And similarly for u8.
91379152
__attribute__((arm_streaming, arm_shared_za))
9138-
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base,
9139-
uint64_t slice_offset, svbool_t pg,
9153+
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91409154
svint8_t zn);
91419155

91429156
// And similarly for u16, bf16 and f16.
91439157
__attribute__((arm_streaming, arm_shared_za))
9144-
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base,
9145-
uint64_t slice_offset, svbool_t pg,
9158+
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91469159
svint16_t zn);
91479160

91489161
// And similarly for u32 and f32.
91499162
__attribute__((arm_streaming, arm_shared_za))
9150-
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base,
9151-
uint64_t slice_offset, svbool_t pg,
9163+
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91529164
svint32_t zn);
91539165

91549166
// And similarly for u64 and f64.
91559167
__attribute__((arm_streaming, arm_shared_za))
9156-
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base,
9157-
uint64_t slice_offset, svbool_t pg,
9168+
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91589169
svint64_t zn);
91599170

91609171
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
91619172
__attribute__((arm_streaming, arm_shared_za))
9162-
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base,
9163-
uint64_t slice_offset, svbool_t pg,
9173+
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91649174
svint8_t zn);
91659175
```
91669176

0 commit comments

Comments
 (0)