Skip to content

Commit 6410c04

Browse files
authored
[main] Combine SME slice parameters (#225)
Previously the (alpha) SME intrinsics were documented to take two slice parameters: a 32-bit variable index and a 64-bit constant offset. However, it isn't very C-like to split an addition in this way, and as Sander points out, it isn't really consistent with the way that we handle vnum parameters. The patch also removes a specific reference to w12-w15, since some SME2 instructions use w8-w11.
1 parent ce770f6 commit 6410c04

File tree

1 file changed

+61
-48
lines changed

1 file changed

+61
-48
lines changed

main/acle.md

Lines changed: 61 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9057,9 +9057,21 @@ following it. --><span id="__arm_za_disable"></span>
90579057

90589058
The intrinsics in this section have the following properties in common:
90599059

9060-
* Every argument named `tile`, `slice_offset` or `tile_mask` must
9061-
be an integer constant expression in the range of the underlying
9062-
instruction.
9060+
* Every argument named `tile` or `tile_mask` must be an integer constant
9061+
expression in the range of the underlying instruction.
9062+
9063+
* Some SME instructions identify a slice of ZA using the sum of a 32-bit
9064+
general-purpose register and an immediate offset. The intrinsics for
9065+
these instructions have a 32-bit argument called `slice`, which is
9066+
interpreted as follows:
9067+
9068+
* If the intrinsic also has a `vnum` argument, the ZA slice number
9069+
is calculated by adding `vnum` to `slice`. Both `slice` and `vnum`
9070+
can be variable.
9071+
9072+
* Otherwise, `slice` specifies the ZA slice number directly; that is,
9073+
it represents the sum of the 32-bit register and the immediate
9074+
offset. `slice` can be variable.
90639075

90649076
* ZA loads and stores do not use typed pointers, since there is
90659077
no C or C++ type information associated with the contents of ZA.
@@ -9073,74 +9085,85 @@ The intrinsics in this section have the following properties in common:
90739085
``` c
90749086
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90759087
__attribute__((arm_streaming, arm_shared_za))
9076-
void svld1_hor_za8(uint64_t tile, uint32_t slice_base,
9077-
uint64_t slice_offset, svbool_t pg, const void *ptr);
9088+
void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9089+
const void *ptr);
90789090

9079-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9091+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9092+
// address given by ptr.
9093+
//
90809094
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90819095
__attribute__((arm_streaming, arm_shared_za))
9082-
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9083-
uint64_t slice_offset, svbool_t pg,
9096+
void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90849097
const void *ptr, int64_t vnum);
90859098

90869099
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90879100
__attribute__((arm_streaming, arm_shared_za))
9088-
void svld1_ver_za8(uint64_t tile, uint32_t slice_base,
9089-
uint64_t slice_offset, svbool_t pg, const void *ptr);
9101+
void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9102+
const void *ptr);
90909103

9091-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9104+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9105+
// address given by ptr.
9106+
//
90929107
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90939108
__attribute__((arm_streaming, arm_shared_za))
9094-
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9095-
uint64_t slice_offset, svbool_t pg,
9109+
void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90969110
const void *ptr, int64_t vnum);
90979111
```
90989112

90999113
#### LDR
91009114

91019115
``` c
9102-
// slice_offset fills the role of the usual vnum parameter.
91039116
__attribute__((arm_streaming_compatible, arm_shared_za))
9104-
void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset,
9105-
const void *ptr);
9117+
void svldr_za(uint32_t slice, const void *ptr);
9118+
9119+
// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9120+
// This can be done in a single instruction if vnum is a constant in the
9121+
// range [0, 15]. The intrinsic is synthetic for other values of vnum.
9122+
__attribute__((arm_streaming_compatible, arm_shared_za))
9123+
void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum);
91069124
```
91079125

91089126
#### ST1B, ST1H, ST1W, ST1D, ST1Q
91099127

91109128
``` c
91119129
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91129130
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9113-
void svst1_hor_za8(uint64_t tile, uint32_t slice_base,
9114-
uint64_t slice_offset, svbool_t pg,
9131+
void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91159132
void *ptr);
91169133

9117-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9134+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9135+
// address given by ptr.
9136+
//
91189137
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91199138
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9120-
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9121-
uint64_t slice_offset, svbool_t pg,
9139+
void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91229140
void *ptr, int64_t vnum);
91239141

91249142
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91259143
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9126-
void svst1_ver_za8(uint64_t tile, uint32_t slice_base,
9127-
uint64_t slice_offset, svbool_t pg,
9144+
void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91289145
void *ptr);
91299146

9130-
// Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9147+
// Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9148+
// address given by ptr.
9149+
//
91319150
// Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91329151
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9133-
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9134-
uint64_t slice_offset, svbool_t pg,
9152+
void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91359153
void *ptr, int64_t vnum);
91369154
```
91379155

91389156
#### STR
91399157

91409158
``` c
9141-
// slice_offset fills the role of the usual vnum parameter.
91429159
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9143-
void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr);
9160+
void svstr_za(uint32_t slice, void *ptr);
9161+
9162+
// Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9163+
// This can be done in a single instruction if vnum is a constant in the
9164+
// range [0, 15]. The intrinsic is synthetic for other values of vnum.
9165+
__attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9166+
void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum);
91449167
```
91459168

91469169
#### MOVA
@@ -9154,32 +9177,27 @@ parameter both have type `svuint8_t`.
91549177
// And similarly for u8.
91559178
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91569179
svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg,
9157-
uint64_t tile, uint32_t slice_base,
9158-
uint64_t slice_offset);
9180+
uint64_t tile, uint32_t slice);
91599181

91609182
// And similarly for u16, bf16 and f16.
91619183
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91629184
svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg,
9163-
uint64_t tile, uint32_t slice_base,
9164-
uint64_t slice_offset);
9185+
uint64_t tile, uint32_t slice);
91659186

91669187
// And similarly for u32 and f32.
91679188
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91689189
svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg,
9169-
uint64_t tile, uint32_t slice_base,
9170-
uint64_t slice_offset);
9190+
uint64_t tile, uint32_t slice);
91719191

91729192
// And similarly for u64 and f64.
91739193
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91749194
svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg,
9175-
uint64_t tile, uint32_t slice_base,
9176-
uint64_t slice_offset);
9195+
uint64_t tile, uint32_t slice);
91779196

91789197
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
91799198
__attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91809199
svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg,
9181-
uint64_t tile, uint32_t slice_base,
9182-
uint64_t slice_offset);
9200+
uint64_t tile, uint32_t slice);
91839201
```
91849202

91859203
Replacing `_hor` with `_ver` gives the associated vertical forms.
@@ -9191,32 +9209,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`.
91919209
``` c
91929210
// And similarly for u8.
91939211
__attribute__((arm_streaming, arm_shared_za))
9194-
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base,
9195-
uint64_t slice_offset, svbool_t pg,
9212+
void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91969213
svint8_t zn);
91979214

91989215
// And similarly for u16, bf16 and f16.
91999216
__attribute__((arm_streaming, arm_shared_za))
9200-
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base,
9201-
uint64_t slice_offset, svbool_t pg,
9217+
void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92029218
svint16_t zn);
92039219

92049220
// And similarly for u32 and f32.
92059221
__attribute__((arm_streaming, arm_shared_za))
9206-
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base,
9207-
uint64_t slice_offset, svbool_t pg,
9222+
void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92089223
svint32_t zn);
92099224

92109225
// And similarly for u64 and f64.
92119226
__attribute__((arm_streaming, arm_shared_za))
9212-
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base,
9213-
uint64_t slice_offset, svbool_t pg,
9227+
void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92149228
svint64_t zn);
92159229

92169230
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
92179231
__attribute__((arm_streaming, arm_shared_za))
9218-
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base,
9219-
uint64_t slice_offset, svbool_t pg,
9232+
void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92209233
svint8_t zn);
92219234
```
92229235

0 commit comments

Comments
 (0)