@@ -9057,9 +9057,21 @@ following it. --><span id="__arm_za_disable"></span>
90579057
90589058The intrinsics in this section have the following properties in common:
90599059
9060- * Every argument named `tile`, `slice_offset` or `tile_mask` must
9061- be an integer constant expression in the range of the underlying
9062- instruction.
9060+ * Every argument named `tile` or `tile_mask` must be an integer constant
9061+ expression in the range of the underlying instruction.
9062+
9063+ * Some SME instructions identify a slice of ZA using the sum of a 32-bit
9064+ general-purpose register and an immediate offset. The intrinsics for
9065+ these instructions have a 32-bit argument called `slice`, which is
9066+ interpreted as follows:
9067+
9068+ * If the intrinsic also has a `vnum` argument, the ZA slice number
9069+ is calculated by adding `vnum` to `slice`. Both `slice` and `vnum`
9070+ can be variable.
9071+
9072+ * Otherwise, `slice` specifies the ZA slice number directly; that is,
9073+ it represents the sum of the 32-bit register and the immediate
9074+ offset. `slice` can be variable.
90639075
90649076* ZA loads and stores do not use typed pointers, since there is
90659077 no C or C++ type information associated with the contents of ZA.
@@ -9073,74 +9085,85 @@ The intrinsics in this section have the following properties in common:
90739085``` c
90749086 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90759087 __attribute__((arm_streaming, arm_shared_za))
9076- void svld1_hor_za8(uint64_t tile, uint32_t slice_base ,
9077- uint64_t slice_offset, svbool_t pg, const void *ptr);
9088+ void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9089+ const void *ptr);
90789090
9079- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9091+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9092+ // address given by ptr.
9093+ //
90809094 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90819095 __attribute__((arm_streaming, arm_shared_za))
9082- void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9083- uint64_t slice_offset, svbool_t pg,
9096+ void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90849097 const void *ptr, int64_t vnum);
90859098
90869099 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90879100 __attribute__((arm_streaming, arm_shared_za))
9088- void svld1_ver_za8(uint64_t tile, uint32_t slice_base ,
9089- uint64_t slice_offset, svbool_t pg, const void *ptr);
9101+ void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9102+ const void *ptr);
90909103
9091- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9104+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9105+ // address given by ptr.
9106+ //
90929107 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90939108 __attribute__((arm_streaming, arm_shared_za))
9094- void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9095- uint64_t slice_offset, svbool_t pg,
9109+ void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90969110 const void *ptr, int64_t vnum);
90979111```
90989112
90999113#### LDR
91009114
91019115``` c
9102- // slice_offset fills the role of the usual vnum parameter.
91039116 __attribute__((arm_streaming_compatible, arm_shared_za))
9104- void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset,
9105- const void *ptr);
9117+ void svldr_za(uint32_t slice, const void *ptr);
9118+
9119+ // Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9120+ // This can be done in a single instruction if vnum is a constant in the
9121+ // range [0, 15]. The intrinsic is synthetic for other values of vnum.
9122+ __attribute__((arm_streaming_compatible, arm_shared_za))
9123+ void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum);
91069124```
91079125
91089126#### ST1B, ST1H, ST1W, ST1D, ST1Q
91099127
91109128``` c
91119129 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91129130 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9113- void svst1_hor_za8(uint64_t tile, uint32_t slice_base,
9114- uint64_t slice_offset, svbool_t pg,
9131+ void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91159132 void *ptr);
91169133
9117- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9134+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9135+ // address given by ptr.
9136+ //
91189137 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91199138 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9120- void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9121- uint64_t slice_offset, svbool_t pg,
9139+ void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91229140 void *ptr, int64_t vnum);
91239141
91249142 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91259143 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9126- void svst1_ver_za8(uint64_t tile, uint32_t slice_base,
9127- uint64_t slice_offset, svbool_t pg,
9144+ void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91289145 void *ptr);
91299146
9130- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9147+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9148+ // address given by ptr.
9149+ //
91319150 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
91329151 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9133- void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9134- uint64_t slice_offset, svbool_t pg,
9152+ void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
91359153 void *ptr, int64_t vnum);
91369154```
91379155
91389156#### STR
91399157
91409158``` c
9141- // slice_offset fills the role of the usual vnum parameter.
91429159 __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9143- void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr);
9160+ void svstr_za(uint32_t slice, void *ptr);
9161+
9162+ // Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9163+ // This can be done in a single instruction if vnum is a constant in the
9164+ // range [0, 15]. The intrinsic is synthetic for other values of vnum.
9165+ __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9166+ void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum);
91449167```
91459168
91469169#### MOVA
@@ -9154,32 +9177,27 @@ parameter both have type `svuint8_t`.
91549177 // And similarly for u8.
91559178 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91569179 svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg,
9157- uint64_t tile, uint32_t slice_base,
9158- uint64_t slice_offset);
9180+ uint64_t tile, uint32_t slice);
91599181
91609182 // And similarly for u16, bf16 and f16.
91619183 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91629184 svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg,
9163- uint64_t tile, uint32_t slice_base,
9164- uint64_t slice_offset);
9185+ uint64_t tile, uint32_t slice);
91659186
91669187 // And similarly for u32 and f32.
91679188 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91689189 svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg,
9169- uint64_t tile, uint32_t slice_base,
9170- uint64_t slice_offset);
9190+ uint64_t tile, uint32_t slice);
91719191
91729192 // And similarly for u64 and f64.
91739193 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91749194 svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg,
9175- uint64_t tile, uint32_t slice_base,
9176- uint64_t slice_offset);
9195+ uint64_t tile, uint32_t slice);
91779196
91789197 // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
91799198 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91809199 svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg,
9181- uint64_t tile, uint32_t slice_base,
9182- uint64_t slice_offset);
9200+ uint64_t tile, uint32_t slice);
91839201```
91849202
91859203Replacing `_hor` with `_ver` gives the associated vertical forms.
@@ -9191,32 +9209,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`.
91919209``` c
91929210 // And similarly for u8.
91939211 __attribute__((arm_streaming, arm_shared_za))
9194- void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base,
9195- uint64_t slice_offset, svbool_t pg,
9212+ void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91969213 svint8_t zn);
91979214
91989215 // And similarly for u16, bf16 and f16.
91999216 __attribute__((arm_streaming, arm_shared_za))
9200- void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base,
9201- uint64_t slice_offset, svbool_t pg,
9217+ void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92029218 svint16_t zn);
92039219
92049220 // And similarly for u32 and f32.
92059221 __attribute__((arm_streaming, arm_shared_za))
9206- void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base,
9207- uint64_t slice_offset, svbool_t pg,
9222+ void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92089223 svint32_t zn);
92099224
92109225 // And similarly for u64 and f64.
92119226 __attribute__((arm_streaming, arm_shared_za))
9212- void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base,
9213- uint64_t slice_offset, svbool_t pg,
9227+ void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92149228 svint64_t zn);
92159229
92169230 // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
92179231 __attribute__((arm_streaming, arm_shared_za))
9218- void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base,
9219- uint64_t slice_offset, svbool_t pg,
9232+ void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
92209233 svint8_t zn);
92219234```
92229235
0 commit comments