@@ -9001,9 +9001,18 @@ following it. --><span id="__arm_za_disable"></span>
90019001
90029002The intrinsics in this section have the following properties in common:
90039003
9004- * Every argument named `tile`, `slice_offset` or `tile_mask` must
9005- be an integer constant expression in the range of the underlying
9006- instruction.
9004+ * Every argument named `tile` or `tile_mask` must be an integer constant
9005+ expression in the range of the underlying instruction.
9006+
9007+ * Some SME instructions index ZA using the sum of a 32-bit general-purpose
9008+ register and a constant offset. Instead of having arguments for the
9009+ two individual fields, the associated intrinsics have a single
9010+ 32-bit index called `slice` that holds the sum.
9011+
9012+ * However, load and store intrinsics that take both a `vnum` parameter
9013+ and a `slice` parameter add `vnum` to `slice`. This helps to ensure
9014+ that the load/store address and ZA index remain balanced, and
9015+ increases the chances that an immediate offset can be used.
90079016
90089017* ZA loads and stores do not use typed pointers, since there is
90099018 no C or C++ type information associated with the contents of ZA.
@@ -9017,74 +9026,85 @@ The intrinsics in this section have the following properties in common:
90179026``` c
90189027 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90199028 __attribute__((arm_streaming, arm_shared_za))
9020- void svld1_hor_za8(uint64_t tile, uint32_t slice_base,
9021- uint64_t slice_offset, svbool_t pg, const void *ptr);
9029+ void svld1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9030+ const void *ptr);
90229031
9023- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9032+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9033+ // address given by ptr.
9034+ //
90249035 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90259036 __attribute__((arm_streaming, arm_shared_za))
9026- void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9027- uint64_t slice_offset, svbool_t pg,
9037+ void svld1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90289038 const void *ptr, int64_t vnum);
90299039
90309040 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90319041 __attribute__((arm_streaming, arm_shared_za))
9032- void svld1_ver_za8(uint64_t tile, uint32_t slice_base,
9033- uint64_t slice_offset, svbool_t pg, const void *ptr);
9042+ void svld1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
9043+ const void *ptr);
90349044
9035- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9045+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9046+ // address given by ptr.
9047+ //
90369048 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90379049 __attribute__((arm_streaming, arm_shared_za))
9038- void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9039- uint64_t slice_offset, svbool_t pg,
9050+ void svld1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90409051 const void *ptr, int64_t vnum);
90419052```
90429053
90439054#### LDR
90449055
90459056``` c
9046- // slice_offset fills the role of the usual vnum parameter.
90479057 __attribute__((arm_streaming_compatible, arm_shared_za))
9048- void svldr_vnum_za(uint32_t slice_base, uint64_t slice_offset,
9049- const void *ptr);
9058+ void svldr_za(uint32_t slice, const void *ptr);
9059+
9060+ // Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9061+ // This can be done in a single instruction if vnum is a constant in the
9062+ // range [0, 15]. The intrinsic is synthetic for other values of vnum.
9063+ __attribute__((arm_streaming_compatible, arm_shared_za))
9064+ void svldr_vnum_za(uint32_t slice, const void *ptr, int64_t vnum);
90509065```
90519066
90529067#### ST1B, ST1H, ST1W, ST1D, ST1Q
90539068
90549069``` c
90559070 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90569071 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9057- void svst1_hor_za8(uint64_t tile, uint32_t slice_base,
9058- uint64_t slice_offset, svbool_t pg,
9072+ void svst1_hor_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90599073 void *ptr);
90609074
9061- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9075+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9076+ // address given by ptr.
9077+ //
90629078 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90639079 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9064- void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice_base,
9065- uint64_t slice_offset, svbool_t pg,
9080+ void svst1_hor_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90669081 void *ptr, int64_t vnum);
90679082
90689083 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90699084 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9070- void svst1_ver_za8(uint64_t tile, uint32_t slice_base,
9071- uint64_t slice_offset, svbool_t pg,
9085+ void svst1_ver_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90729086 void *ptr);
90739087
9074- // Synthetic intrinsic: adds vnum * svcntsb() to the address given by ptr.
9088+ // Synthetic intrinsic: adds vnum to slice and vnum * svcntsb() to the
9089+ // address given by ptr.
9090+ //
90759091 // Also for _za16, _za32, _za64 and _za128 (with the same prototype).
90769092 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
9077- void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice_base,
9078- uint64_t slice_offset, svbool_t pg,
9093+ void svst1_ver_vnum_za8(uint64_t tile, uint32_t slice, svbool_t pg,
90799094 void *ptr, int64_t vnum);
90809095```
90819096
90829097#### STR
90839098
90849099``` c
9085- // slice_offset fills the role of the usual vnum parameter.
90869100 __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9087- void svstr_vnum_za(uint32_t slice_base, uint64_t slice_offset, void *ptr);
9101+ void svstr_za(uint32_t slice, void *ptr);
9102+
9103+ // Adds vnum to slice and vnum * svcntsb() to the address given by ptr.
9104+ // This can be done in a single instruction if vnum is a constant in the
9105+ // range [0, 15]. The intrinsic is synthetic for other values of vnum.
9106+ __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za))
9107+ void svstr_vnum_za(uint32_t slice, void *ptr, int64_t vnum);
90889108```
90899109
90909110#### MOVA
@@ -9098,32 +9118,27 @@ parameter both have type `svuint8_t`.
90989118 // And similarly for u8.
90999119 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91009120 svint8_t svread_hor_za8[_s8]_m(svint8_t zd, svbool_t pg,
9101- uint64_t tile, uint32_t slice_base,
9102- uint64_t slice_offset);
9121+ uint64_t tile, uint32_t slice);
91039122
91049123 // And similarly for u16, bf16 and f16.
91059124 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91069125 svint16_t svread_hor_za16[_s16]_m(svint16_t zd, svbool_t pg,
9107- uint64_t tile, uint32_t slice_base,
9108- uint64_t slice_offset);
9126+ uint64_t tile, uint32_t slice);
91099127
91109128 // And similarly for u32 and f32.
91119129 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91129130 svint32_t svread_hor_za32[_s32]_m(svint32_t zd, svbool_t pg,
9113- uint64_t tile, uint32_t slice_base,
9114- uint64_t slice_offset);
9131+ uint64_t tile, uint32_t slice);
91159132
91169133 // And similarly for u64 and f64.
91179134 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91189135 svint64_t svread_hor_za64[_s64]_m(svint64_t zd, svbool_t pg,
9119- uint64_t tile, uint32_t slice_base,
9120- uint64_t slice_offset);
9136+ uint64_t tile, uint32_t slice);
91219137
91229138 // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
91239139 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
91249140 svint8_t svread_hor_za128[_s8]_m(svint8_t zd, svbool_t pg,
9125- uint64_t tile, uint32_t slice_base,
9126- uint64_t slice_offset);
9141+ uint64_t tile, uint32_t slice);
91279142```
91289143
91299144Replacing `_hor` with `_ver` gives the associated vertical forms.
@@ -9135,32 +9150,27 @@ the `zn` parameter to the `_u8` intrinsic has type `svuint8_t`.
91359150``` c
91369151 // And similarly for u8.
91379152 __attribute__((arm_streaming, arm_shared_za))
9138- void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice_base,
9139- uint64_t slice_offset, svbool_t pg,
9153+ void svwrite_hor_za8[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91409154 svint8_t zn);
91419155
91429156 // And similarly for u16, bf16 and f16.
91439157 __attribute__((arm_streaming, arm_shared_za))
9144- void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice_base,
9145- uint64_t slice_offset, svbool_t pg,
9158+ void svwrite_hor_za16[_s16]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91469159 svint16_t zn);
91479160
91489161 // And similarly for u32 and f32.
91499162 __attribute__((arm_streaming, arm_shared_za))
9150- void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice_base,
9151- uint64_t slice_offset, svbool_t pg,
9163+ void svwrite_hor_za32[_s32]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91529164 svint32_t zn);
91539165
91549166 // And similarly for u64 and f64.
91559167 __attribute__((arm_streaming, arm_shared_za))
9156- void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice_base,
9157- uint64_t slice_offset, svbool_t pg,
9168+ void svwrite_hor_za64[_s64]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91589169 svint64_t zn);
91599170
91609171 // And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64.
91619172 __attribute__((arm_streaming, arm_shared_za))
9162- void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice_base,
9163- uint64_t slice_offset, svbool_t pg,
9173+ void svwrite_hor_za128[_s8]_m(uint64_t tile, uint32_t slice, svbool_t pg,
91649174 svint8_t zn);
91659175```
91669176
0 commit comments