@@ -9001,9 +9001,21 @@ following it. --><span id="__arm_za_disable"></span>
90019001
90029002The intrinsics in this section have the following properties in common:
90039003
9004- * Every argument named `tile`, `slice_offset` or `tile_mask` must
9005- be an integer constant expression in the range of the underlying
9006- instruction.
9004+ * Every argument named `tile` or `tile_mask` must be an integer constant
9005+ expression in the range of the underlying instruction.
9006+
9007+ * Some SME instructions identify a slice of ZA using the sum of a 32-bit
9008+ general-purpose register and an immediate offset. The intrinsics for
9009+ these instructions have a 32-bit argument called `slice`, which is
9010+ interpreted as follows:
9011+
9012+ * If the intrinsic also has a `vnum` argument, the ZA slice number
9013+ is calculated by adding `vnum` to `slice`. Both `slice` and `vnum`
9014+ can be variable.
9015+
9016+ * Otherwise, `slice` specifies the ZA slice number directly; that is,
9017+ it represents the sum of the 32-bit register and the immediate
9018+ offset. `slice` can be variable.
90079019
90089020* ZA loads and stores do not use typed pointers, since there is
90099021 no C or C++ type information associated with the contents of ZA.
@@ -9017,74 +9029,85 @@ The intrinsics in this section have the following properties in common:
90179029``` c
90189030 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90199031 __attribute__((arm_streaming, arm_shared_za))
9020- void svld1_hor_za8(uint64_t tile, uint32_t slice_base ,
9021- uint64_t slice_offset, svbool_t pg, const void *ptr);
9032+ void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9033+ const void *ptr);
90229034
9023- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9035+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9036+ // address given by ptr.
9037+ //
90249038 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90259039 __attribute__((arm_streaming, arm_shared_za))
9026- void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9027- uint64_t slice_offset, svbool_t pg,
9040+ void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90289041 const void *ptr, int64_t vnum);
90299042
90309043 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90319044 __attribute__((arm_streaming, arm_shared_za))
9032- void svld1_ver_za8(uint64_t tile, uint32_t slice_base ,
9033- uint64_t slice_offset, svbool_t pg, const void *ptr);
9045+ void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9046+ const void *ptr);
90349047
9035- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9048+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9049+ // address given by ptr.
9050+ //
90369051 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90379052 __attribute__((arm_streaming, arm_shared_za))
9038- void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9039- uint64_t slice_offset, svbool_t pg,
9053+ void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90409054 const void *ptr, int64_t vnum);
90419055```
90429056
90439057#### LDR
90449058
90459059``` c
9046- // slice_offset fills the role of the usual vnum parameter.
90479060 __attribute__((arm_streaming_compatible, arm_shared_za))
9048- void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset,
9049- const void *ptr);
9061+ void svldr_za(uint32_t slice, const void *ptr);
9062+
9063+ // Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9064+ // This can be done in a single instruction if vnum is a constant in the
9065+ // range [0, 15]. The intrinsic is synthetic for other values of vnum.
9066+ __attribute__((arm_streaming_compatible, arm_shared_za))
9067+ void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum);
90509068```
90519069
90529070#### ST1B, ST1H, ST1W, ST1D, ST1Q
90539071
90549072``` c
90559073 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90569074 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9057- void svst1_hor_za8(uint64_t tile, uint32_t slice_base,
9058- uint64_t slice_offset, svbool_t pg,
9075+ void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90599076 void *ptr);
90609077
9061- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9078+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9079+ // address given by ptr.
9080+ //
90629081 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90639082 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9064- void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9065- uint64_t slice_offset, svbool_t pg,
9083+ void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90669084 void *ptr, int64_t vnum);
90679085
90689086 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90699087 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9070- void svst1_ver_za8(uint64_t tile, uint32_t slice_base,
9071- uint64_t slice_offset, svbool_t pg,
9088+ void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90729089 void *ptr);
90739090
9074- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9091+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9092+ // address given by ptr.
9093+ //
90759094 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90769095 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9077- void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9078- uint64_t slice_offset, svbool_t pg,
9096+ void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90799097 void *ptr, int64_t vnum);
90809098```
90819099
90829100#### STR
90839101
90849102``` c
9085- // slice_offset fills the role of the usual vnum parameter.
90869103 __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9087- void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr);
9104+ void svstr_za(uint32_t slice, void *ptr);
9105+
9106+ // Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9107+ // This can be done in a single instruction if vnum is a constant in the
9108+ // range [0, 15]. The intrinsic is synthetic for other values of vnum.
9109+ __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9110+ void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum);
90889111```
90899112
90909113#### MOVA
@@ -9098,32 +9121,27 @@ parameter both have type `svuint8_t`.
90989121 // And similarly for u8.
90999122 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91009123 svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg,
9101- uint64_t tile, uint32_t slice_base,
9102- uint64_t slice_offset);
9124+ uint64_t tile, uint32_t slice);
91039125
91049126 // And similarly for u16, bf16 and f16.
91059127 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91069128 svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg,
9107- uint64_t tile, uint32_t slice_base,
9108- uint64_t slice_offset);
9129+ uint64_t tile, uint32_t slice);
91099130
91109131 // And similarly for u32 and f32.
91119132 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91129133 svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg,
9113- uint64_t tile, uint32_t slice_base,
9114- uint64_t slice_offset);
9134+ uint64_t tile, uint32_t slice);
91159135
91169136 // And similarly for u64 and f64.
91179137 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91189138 svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg,
9119- uint64_t tile, uint32_t slice_base,
9120- uint64_t slice_offset);
9139+ uint64_t tile, uint32_t slice);
91219140
91229141 // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
91239142 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91249143 svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg,
9125- uint64_t tile, uint32_t slice_base,
9126- uint64_t slice_offset);
9144+ uint64_t tile, uint32_t slice);
91279145```
91289146
91299147Replacing `_hor` with `_ver` gives the associated vertical forms.
@@ -9135,32 +9153,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`.
91359153``` c
91369154 // And similarly for u8.
91379155 __attribute__((arm_streaming, arm_shared_za))
9138- void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base,
9139- uint64_t slice_offset, svbool_t pg,
9156+ void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91409157 svint8_t zn);
91419158
91429159 // And similarly for u16, bf16 and f16.
91439160 __attribute__((arm_streaming, arm_shared_za))
9144- void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base,
9145- uint64_t slice_offset, svbool_t pg,
9161+ void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91469162 svint16_t zn);
91479163
91489164 // And similarly for u32 and f32.
91499165 __attribute__((arm_streaming, arm_shared_za))
9150- void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base,
9151- uint64_t slice_offset, svbool_t pg,
9166+ void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91529167 svint32_t zn);
91539168
91549169 // And similarly for u64 and f64.
91559170 __attribute__((arm_streaming, arm_shared_za))
9156- void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base,
9157- uint64_t slice_offset, svbool_t pg,
9171+ void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91589172 svint64_t zn);
91599173
91609174 // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
91619175 __attribute__((arm_streaming, arm_shared_za))
9162- void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base,
9163- uint64_t slice_offset, svbool_t pg,
9176+ void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91649177 svint8_t zn);
91659178```
91669179
0 commit comments