Skip to content

Commit 69c87cc

Browse files
committed
[main] Combine SME slice parameters
Previously the (alpha) SME intrinsics were documented to take two slice parameters: a 32-bit variable index and a 64-bit constant offset. However, it isn't very C-like to split an addition in this way, and as Sander points out, it isn't really consistent with the way that we handle vnum parameters. The patch also removes a specific reference to w12-w15, since some SME2 instructions use w8-w11.
1 parent 8f74df9 commit 69c87cc

File tree

1 file changed

+61
-48
lines changed

1 file changed

+61
-48
lines changed

main/acle.md

Lines changed: 61 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9001,9 +9001,21 @@ following it. --><span id="__arm_za_disable"></span>
90019001

90029002
The intrinsics in this section have the following properties in common:
90039003

9004-
* Every argument named `tile`, `slice_offset` or `tile_mask` must
9005-
be an integer constant expression in the range of the underlying
9006-
instruction.
9004+
* Every argument named `tile` or `tile_mask` must be an integer constant
9005+
expression in the range of the underlying instruction.
9006+
9007+
* Some SME instructions identify a slice of ZA using the sum of a 32-bit
9008+
general-purpose register and an immediate offset. The intrinsics for
9009+
these instructions have a 32-bit argument called `slice`, which is
9010+
interpreted as follows:
9011+
9012+
* If the intrinsic also has a `vnum` argument, the ZA slice number
9013+
is calculated by adding `vnum` to `slice`. Both `slice` and `vnum`
9014+
can be variable.
9015+
9016+
* Otherwise, `slice` specifies the ZA slice number directly; that is,
9017+
it represents the sum of the 32-bit register and the immediate
9018+
offset. `slice` can be variable.
90079019

90089020
* ZA loads and stores do not use typed pointers, since there is
90099021
no C or C++ type information associated with the contents of ZA.
@@ -9017,74 +9029,85 @@ The intrinsics in this section have the following properties in common:
90179029
``` c
90189030
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90199031
__attribute__((arm_streaming, arm_shared_za))
9020-
void svld1_hor_za8(uint64_t tile, uint32_t slice_base,
9021-
uint64_t slice_offset, svbool_t pg, const void *ptr);
9032+
void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9033+
const void *ptr);
90229034

9023-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9035+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9036+
// address given by ptr.
9037+
//
90249038
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90259039
__attribute__((arm_streaming, arm_shared_za))
9026-
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9027-
uint64_t slice_offset, svbool_t pg,
9040+
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90289041
const void *ptr, int64_t vnum);
90299042

90309043
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90319044
__attribute__((arm_streaming, arm_shared_za))
9032-
void svld1_ver_za8(uint64_t tile, uint32_t slice_base,
9033-
uint64_t slice_offset, svbool_t pg, const void *ptr);
9045+
void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9046+
const void *ptr);
90349047

9035-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9048+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9049+
// address given by ptr.
9050+
//
90369051
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90379052
__attribute__((arm_streaming, arm_shared_za))
9038-
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9039-
uint64_t slice_offset, svbool_t pg,
9053+
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90409054
const void *ptr, int64_t vnum);
90419055
```
90429056

90439057
#### LDR
90449058

90459059
``` c
9046-
// slice_offset fills the role of the usual vnum parameter.
90479060
__attribute__((arm_streaming_compatible, arm_shared_za))
9048-
void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset,
9049-
const void *ptr);
9061+
void svldr_za(uint32_t slice, const void *ptr);
9062+
9063+
// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9064+
// This can be done in a single instruction if vnum is a constant in the
9065+
// range [0, 15]. The intrinsic is synthetic for other values of vnum.
9066+
__attribute__((arm_streaming_compatible, arm_shared_za))
9067+
void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum);
90509068
```
90519069

90529070
#### ST1B, ST1H, ST1W, ST1D, ST1Q
90539071

90549072
``` c
90559073
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90569074
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9057-
void svst1_hor_za8(uint64_t tile, uint32_t slice_base,
9058-
uint64_t slice_offset, svbool_t pg,
9075+
void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90599076
void *ptr);
90609077

9061-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9078+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9079+
// address given by ptr.
9080+
//
90629081
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90639082
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9064-
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9065-
uint64_t slice_offset, svbool_t pg,
9083+
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90669084
void *ptr, int64_t vnum);
90679085

90689086
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90699087
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9070-
void svst1_ver_za8(uint64_t tile, uint32_t slice_base,
9071-
uint64_t slice_offset, svbool_t pg,
9088+
void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90729089
void *ptr);
90739090

9074-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9091+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9092+
// address given by ptr.
9093+
//
90759094
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90769095
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9077-
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9078-
uint64_t slice_offset, svbool_t pg,
9096+
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90799097
void *ptr, int64_t vnum);
90809098
```
90819099

90829100
#### STR
90839101

90849102
``` c
9085-
// slice_offset fills the role of the usual vnum parameter.
90869103
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9087-
void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr);
9104+
void svstr_za(uint32_t slice, void *ptr);
9105+
9106+
// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9107+
// This can be done in a single instruction if vnum is a constant in the
9108+
// range [0, 15]. The intrinsic is synthetic for other values of vnum.
9109+
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9110+
void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum);
90889111
```
90899112

90909113
#### MOVA
@@ -9098,32 +9121,27 @@ parameter both have type `svuint8_t`.
90989121
// And similarly for u8.
90999122
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91009123
svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg,
9101-
uint64_t tile, uint32_t slice_base,
9102-
uint64_t slice_offset);
9124+
uint64_t tile, uint32_t slice);
91039125

91049126
// And similarly for u16, bf16 and f16.
91059127
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91069128
svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg,
9107-
uint64_t tile, uint32_t slice_base,
9108-
uint64_t slice_offset);
9129+
uint64_t tile, uint32_t slice);
91099130

91109131
// And similarly for u32 and f32.
91119132
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91129133
svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg,
9113-
uint64_t tile, uint32_t slice_base,
9114-
uint64_t slice_offset);
9134+
uint64_t tile, uint32_t slice);
91159135

91169136
// And similarly for u64 and f64.
91179137
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91189138
svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg,
9119-
uint64_t tile, uint32_t slice_base,
9120-
uint64_t slice_offset);
9139+
uint64_t tile, uint32_t slice);
91219140

91229141
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
91239142
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91249143
svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg,
9125-
uint64_t tile, uint32_t slice_base,
9126-
uint64_t slice_offset);
9144+
uint64_t tile, uint32_t slice);
91279145
```
91289146

91299147
Replacing `_hor` with `_ver` gives the associated vertical forms.
@@ -9135,32 +9153,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`.
91359153
``` c
91369154
// And similarly for u8.
91379155
__attribute__((arm_streaming, arm_shared_za))
9138-
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base,
9139-
uint64_t slice_offset, svbool_t pg,
9156+
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91409157
svint8_t zn);
91419158

91429159
// And similarly for u16, bf16 and f16.
91439160
__attribute__((arm_streaming, arm_shared_za))
9144-
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base,
9145-
uint64_t slice_offset, svbool_t pg,
9161+
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91469162
svint16_t zn);
91479163

91489164
// And similarly for u32 and f32.
91499165
__attribute__((arm_streaming, arm_shared_za))
9150-
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base,
9151-
uint64_t slice_offset, svbool_t pg,
9166+
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91529167
svint32_t zn);
91539168

91549169
// And similarly for u64 and f64.
91559170
__attribute__((arm_streaming, arm_shared_za))
9156-
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base,
9157-
uint64_t slice_offset, svbool_t pg,
9171+
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91589172
svint64_t zn);
91599173

91609174
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
91619175
__attribute__((arm_streaming, arm_shared_za))
9162-
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base,
9163-
uint64_t slice_offset, svbool_t pg,
9176+
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91649177
svint8_t zn);
91659178
```
91669179

0 commit comments

Comments
 (0)