@@ -104,106 +104,103 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
104
104
; CHECK-NEXT: rev64 v5.4s, v2.4s
105
105
; CHECK-NEXT: add v16.4s, v0.4s, v7.4s
106
106
; CHECK-NEXT: add v17.4s, v3.4s, v6.4s
107
- ; CHECK-NEXT: add v22.4s, v1.4s, v4.4s
108
- ; CHECK-NEXT: uzp2 v18.4s, v17.4s, v16.4s
109
- ; CHECK-NEXT: uzp2 v19.4s, v16.4s, v17.4s
110
- ; CHECK-NEXT: add v21.4s, v2.4s, v5.4s
107
+ ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
111
108
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
112
- ; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
113
- ; CHECK-NEXT: trn2 v20.4s, v17.4s, v16.4s
109
+ ; CHECK-NEXT: uzp2 v7.4s, v17.4s, v16.4s
110
+ ; CHECK-NEXT: zip2 v18.4s, v0.4s, v3.4s
111
+ ; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
112
+ ; CHECK-NEXT: uzp2 v3.4s, v16.4s, v17.4s
113
+ ; CHECK-NEXT: add v20.4s, v2.4s, v5.4s
114
+ ; CHECK-NEXT: add v21.4s, v1.4s, v4.4s
114
115
; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s
115
116
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
116
- ; CHECK-NEXT: zip1 v4.4s, v22.4s, v21.4s
117
- ; CHECK-NEXT: uzp2 v17.4s, v18.4s, v17.4s
118
- ; CHECK-NEXT: zip2 v18.4s, v22.4s, v21.4s
119
- ; CHECK-NEXT: uzp2 v16.4s, v19.4s, v16.4s
120
- ; CHECK-NEXT: zip1 v5.4s, v1.4s, v2.4s
121
- ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
117
+ ; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
118
+ ; CHECK-NEXT: trn2 v19.4s, v17.4s, v16.4s
119
+ ; CHECK-NEXT: zip1 v4.4s, v21.4s, v20.4s
120
+ ; CHECK-NEXT: uzp2 v5.4s, v7.4s, v17.4s
121
+ ; CHECK-NEXT: zip2 v7.4s, v21.4s, v20.4s
122
+ ; CHECK-NEXT: zip1 v17.4s, v1.4s, v2.4s
123
+ ; CHECK-NEXT: uzp2 v3.4s, v3.4s, v16.4s
122
124
; CHECK-NEXT: mov v6.d[1], v4.d[1]
123
- ; CHECK-NEXT: mov v16 .d[1], v18 .d[1]
124
- ; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s
125
- ; CHECK-NEXT: ext v5.16b, v1.16b, v5.16b, #8
126
- ; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
125
+ ; CHECK-NEXT: mov v5 .d[1], v7 .d[1]
126
+ ; CHECK-NEXT: ext v16.16b, v1.16b, v17.16b, #8
127
+ ; CHECK-NEXT: mov v3.d[1], v7.d[1]
128
+ ; CHECK-NEXT: mov v19.d[1], v4.d[1]
127
129
; CHECK-NEXT: mov v1.s[3], v2.s[2]
128
- ; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s
129
- ; CHECK-NEXT: mov v17.d[1], v18.d[1]
130
- ; CHECK-NEXT: mov v20.d[1], v4.d[1]
131
- ; CHECK-NEXT: rev64 v6.4s, v3.4s
132
- ; CHECK-NEXT: mov v0.d[1], v5.d[1]
133
- ; CHECK-NEXT: mov v7.d[1], v1.d[1]
134
- ; CHECK-NEXT: add v2.4s, v17.4s, v20.4s
135
- ; CHECK-NEXT: add v5.4s, v3.4s, v6.4s
136
- ; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
137
- ; CHECK-NEXT: sub v6.4s, v0.4s, v7.4s
138
- ; CHECK-NEXT: add v0.4s, v7.4s, v0.4s
130
+ ; CHECK-NEXT: mov v0.d[1], v16.d[1]
131
+ ; CHECK-NEXT: sub v2.4s, v6.4s, v3.4s
132
+ ; CHECK-NEXT: add v3.4s, v5.4s, v19.4s
133
+ ; CHECK-NEXT: mov v18.d[1], v1.d[1]
134
+ ; CHECK-NEXT: rev64 v5.4s, v3.4s
139
135
; CHECK-NEXT: rev64 v4.4s, v2.4s
140
- ; CHECK-NEXT: rev64 v7.4s, v6.4s
141
- ; CHECK-NEXT: rev64 v16.4s, v0.4s
136
+ ; CHECK-NEXT: sub v7.4s, v0.4s, v18.4s
137
+ ; CHECK-NEXT: add v0.4s, v18.4s, v0.4s
138
+ ; CHECK-NEXT: add v6.4s, v3.4s, v5.4s
139
+ ; CHECK-NEXT: rev64 v16.4s, v7.4s
140
+ ; CHECK-NEXT: rev64 v17.4s, v0.4s
141
+ ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
142
+ ; CHECK-NEXT: rev64 v5.4s, v6.4s
142
143
; CHECK-NEXT: add v1.4s, v2.4s, v4.4s
144
+ ; CHECK-NEXT: add v18.4s, v7.4s, v16.4s
145
+ ; CHECK-NEXT: add v19.4s, v0.4s, v17.4s
146
+ ; CHECK-NEXT: sub v7.4s, v7.4s, v16.4s
147
+ ; CHECK-NEXT: sub v0.4s, v0.4s, v17.4s
143
148
; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s
144
- ; CHECK-NEXT: ext v4.16b, v3.16b, v5.16b, #12
145
- ; CHECK-NEXT: add v5.4s, v6.4s, v7.4s
146
- ; CHECK-NEXT: add v17.4s, v0.4s, v16.4s
147
- ; CHECK-NEXT: sub v0.4s, v0.4s, v16.4s
148
- ; CHECK-NEXT: sub v6.4s, v6.4s, v7.4s
149
- ; CHECK-NEXT: ext v7.16b, v0.16b, v17.16b, #12
150
- ; CHECK-NEXT: ext v5.16b, v6.16b, v5.16b, #12
151
- ; CHECK-NEXT: rev64 v22.4s, v1.4s
152
- ; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
153
- ; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4
154
- ; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #8
155
- ; CHECK-NEXT: ext v18.16b, v7.16b, v0.16b, #4
156
- ; CHECK-NEXT: ext v19.16b, v7.16b, v7.16b, #8
157
- ; CHECK-NEXT: ext v20.16b, v5.16b, v6.16b, #4
158
- ; CHECK-NEXT: ext v21.16b, v5.16b, v5.16b, #8
149
+ ; CHECK-NEXT: trn2 v4.4s, v5.4s, v3.4s
150
+ ; CHECK-NEXT: ext v5.16b, v2.16b, v1.16b, #12
151
+ ; CHECK-NEXT: ext v16.16b, v0.16b, v19.16b, #12
152
+ ; CHECK-NEXT: ext v17.16b, v7.16b, v18.16b, #12
153
+ ; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4
159
154
; CHECK-NEXT: rev64 v5.4s, v5.4s
160
- ; CHECK-NEXT: rev64 v7.4s, v7.4s
161
- ; CHECK-NEXT: rev64 v4.4s, v4.4s
162
- ; CHECK-NEXT: trn2 v1.4s, v2.4s, v1.4s
163
- ; CHECK-NEXT: ext v16.16b, v16.16b, v17.16b, #12
164
- ; CHECK-NEXT: ext v17.16b, v18.16b, v19.16b, #12
165
- ; CHECK-NEXT: ext v18.16b, v20.16b, v21.16b, #12
166
- ; CHECK-NEXT: trn2 v19.4s, v22.4s, v2.4s
167
- ; CHECK-NEXT: ext v2.16b, v5.16b, v6.16b, #4
168
- ; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #4
169
- ; CHECK-NEXT: ext v3.16b, v4.16b, v3.16b, #4
170
- ; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
171
- ; CHECK-NEXT: add v4.4s, v18.4s, v2.4s
172
- ; CHECK-NEXT: add v5.4s, v17.4s, v0.4s
173
- ; CHECK-NEXT: add v6.4s, v16.4s, v3.4s
174
- ; CHECK-NEXT: add v7.4s, v19.4s, v1.4s
175
- ; CHECK-NEXT: sub v2.4s, v18.4s, v2.4s
176
- ; CHECK-NEXT: sub v0.4s, v17.4s, v0.4s
177
- ; CHECK-NEXT: sub v1.4s, v19.4s, v1.4s
178
- ; CHECK-NEXT: sub v3.4s, v16.4s, v3.4s
179
- ; CHECK-NEXT: mov v7.d[1], v1.d[1]
180
- ; CHECK-NEXT: mov v6.d[1], v3.d[1]
181
- ; CHECK-NEXT: mov v4.d[1], v2.d[1]
182
- ; CHECK-NEXT: mov v5.d[1], v0.d[1]
155
+ ; CHECK-NEXT: rev64 v16.4s, v16.4s
156
+ ; CHECK-NEXT: rev64 v17.4s, v17.4s
157
+ ; CHECK-NEXT: mov v1.s[3], v2.s[3]
158
+ ; CHECK-NEXT: mov v19.s[3], v0.s[3]
159
+ ; CHECK-NEXT: mov v18.s[3], v7.s[3]
160
+ ; CHECK-NEXT: ext v16.16b, v16.16b, v0.16b, #4
161
+ ; CHECK-NEXT: ext v17.16b, v17.16b, v7.16b, #4
162
+ ; CHECK-NEXT: ext v5.16b, v5.16b, v2.16b, #4
163
+ ; CHECK-NEXT: trn2 v3.4s, v3.4s, v6.4s
164
+ ; CHECK-NEXT: sub v20.4s, v19.4s, v16.4s
165
+ ; CHECK-NEXT: sub v21.4s, v18.4s, v17.4s
166
+ ; CHECK-NEXT: sub v6.4s, v1.4s, v5.4s
167
+ ; CHECK-NEXT: mov v18.s[0], v7.s[0]
168
+ ; CHECK-NEXT: mov v19.s[0], v0.s[0]
169
+ ; CHECK-NEXT: ext v0.16b, v3.16b, v3.16b, #4
170
+ ; CHECK-NEXT: mov v1.s[0], v2.s[0]
171
+ ; CHECK-NEXT: add v2.4s, v18.4s, v17.4s
172
+ ; CHECK-NEXT: add v3.4s, v19.4s, v16.4s
173
+ ; CHECK-NEXT: add v7.4s, v4.4s, v0.4s
174
+ ; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s
175
+ ; CHECK-NEXT: add v1.4s, v1.4s, v5.4s
176
+ ; CHECK-NEXT: mov v7.d[1], v0.d[1]
177
+ ; CHECK-NEXT: mov v1.d[1], v6.d[1]
178
+ ; CHECK-NEXT: mov v2.d[1], v21.d[1]
179
+ ; CHECK-NEXT: mov v3.d[1], v20.d[1]
183
180
; CHECK-NEXT: movi v0.8h, #1
184
181
; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff
185
- ; CHECK-NEXT: ushr v1 .4s, v4 .4s, #15
186
- ; CHECK-NEXT: ushr v2 .4s, v7.4s, #15
187
- ; CHECK-NEXT: ushr v3 .4s, v5 .4s, #15
188
- ; CHECK-NEXT: ushr v16.4s, v6 .4s, #15
189
- ; CHECK-NEXT: and v2 .16b, v2 .16b, v0.16b
182
+ ; CHECK-NEXT: ushr v4 .4s, v2 .4s, #15
183
+ ; CHECK-NEXT: ushr v5 .4s, v7.4s, #15
184
+ ; CHECK-NEXT: ushr v6 .4s, v3 .4s, #15
185
+ ; CHECK-NEXT: ushr v16.4s, v1 .4s, #15
186
+ ; CHECK-NEXT: and v5 .16b, v5 .16b, v0.16b
190
187
; CHECK-NEXT: and v16.16b, v16.16b, v0.16b
191
- ; CHECK-NEXT: and v3 .16b, v3 .16b, v0.16b
192
- ; CHECK-NEXT: and v0.16b, v1 .16b, v0.16b
193
- ; CHECK-NEXT: mul v1 .4s, v2 .4s, v17.4s
194
- ; CHECK-NEXT: mul v2 .4s, v16.4s, v17.4s
188
+ ; CHECK-NEXT: and v6 .16b, v6 .16b, v0.16b
189
+ ; CHECK-NEXT: and v0.16b, v4 .16b, v0.16b
190
+ ; CHECK-NEXT: mul v4 .4s, v5 .4s, v17.4s
191
+ ; CHECK-NEXT: mul v5 .4s, v16.4s, v17.4s
195
192
; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s
196
- ; CHECK-NEXT: mul v3 .4s, v3 .4s, v17.4s
197
- ; CHECK-NEXT: add v7.4s, v1 .4s, v7.4s
198
- ; CHECK-NEXT: add v6 .4s, v2 .4s, v6 .4s
199
- ; CHECK-NEXT: add v4 .4s, v0.4s, v4 .4s
200
- ; CHECK-NEXT: add v5 .4s, v3 .4s, v5 .4s
201
- ; CHECK-NEXT: eor v0.16b, v4 .16b, v0.16b
202
- ; CHECK-NEXT: eor v3 .16b, v5 .16b, v3 .16b
203
- ; CHECK-NEXT: eor v2 .16b, v6 .16b, v2 .16b
204
- ; CHECK-NEXT: eor v1 .16b, v7.16b, v1 .16b
205
- ; CHECK-NEXT: add v1.4s, v1 .4s, v2 .4s
206
- ; CHECK-NEXT: add v0.4s, v3 .4s, v0.4s
193
+ ; CHECK-NEXT: mul v6 .4s, v6 .4s, v17.4s
194
+ ; CHECK-NEXT: add v7.4s, v4 .4s, v7.4s
195
+ ; CHECK-NEXT: add v1 .4s, v5 .4s, v1 .4s
196
+ ; CHECK-NEXT: add v2 .4s, v0.4s, v2 .4s
197
+ ; CHECK-NEXT: add v3 .4s, v6 .4s, v3 .4s
198
+ ; CHECK-NEXT: eor v0.16b, v2 .16b, v0.16b
199
+ ; CHECK-NEXT: eor v2 .16b, v3 .16b, v6 .16b
200
+ ; CHECK-NEXT: eor v1 .16b, v1 .16b, v5 .16b
201
+ ; CHECK-NEXT: eor v3 .16b, v7.16b, v4 .16b
202
+ ; CHECK-NEXT: add v1.4s, v3 .4s, v1 .4s
203
+ ; CHECK-NEXT: add v0.4s, v2 .4s, v0.4s
207
204
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
208
205
; CHECK-NEXT: addv s0, v0.4s
209
206
; CHECK-NEXT: fmov w8, s0
0 commit comments