@@ -288,13 +288,11 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
288
288
define float @nested_fadd_f32 (<4 x float > %a , <4 x float > %b , float %c , float %d ) {
289
289
; CHECK-LABEL: nested_fadd_f32:
290
290
; CHECK: // %bb.0:
291
- ; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
291
+ ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
292
+ ; CHECK-NEXT: fadd s2, s2, s3
292
293
; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
293
- ; CHECK-NEXT: faddp s1, v1.2s
294
294
; CHECK-NEXT: faddp s0, v0.2s
295
- ; CHECK-NEXT: fadd s1, s1, s3
296
295
; CHECK-NEXT: fadd s0, s0, s2
297
- ; CHECK-NEXT: fadd s0, s0, s1
298
296
; CHECK-NEXT: ret
299
297
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32 (float -0 .0 , <4 x float > %a )
300
298
%a1 = fadd fast float %r1 , %c
@@ -332,15 +330,12 @@ define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, flo
332
330
define float @nested_mul_f32 (<4 x float > %a , <4 x float > %b , float %c , float %d ) {
333
331
; CHECK-LABEL: nested_mul_f32:
334
332
; CHECK: // %bb.0:
335
- ; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
336
- ; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8
337
- ; CHECK-NEXT: fmul v1.2s, v1.2s, v4.2s
338
- ; CHECK-NEXT: fmul v0.2s, v0.2s, v5.2s
339
- ; CHECK-NEXT: fmul s1, s1, v1.s[1]
333
+ ; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
334
+ ; CHECK-NEXT: fmul s2, s2, s3
335
+ ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
336
+ ; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
340
337
; CHECK-NEXT: fmul s0, s0, v0.s[1]
341
- ; CHECK-NEXT: fmul s1, s1, s3
342
338
; CHECK-NEXT: fmul s0, s0, s2
343
- ; CHECK-NEXT: fmul s0, s0, s1
344
339
; CHECK-NEXT: ret
345
340
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32 (float 1 .0 , <4 x float > %a )
346
341
%a1 = fmul fast float %r1 , %c
@@ -353,12 +348,10 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
353
348
define i32 @nested_add_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
354
349
; CHECK-LABEL: nested_add_i32:
355
350
; CHECK: // %bb.0:
356
- ; CHECK-NEXT: addv s1, v1.4s
351
+ ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
352
+ ; CHECK-NEXT: add w8, w0, w1
357
353
; CHECK-NEXT: addv s0, v0.4s
358
- ; CHECK-NEXT: fmov w8, s1
359
354
; CHECK-NEXT: fmov w9, s0
360
- ; CHECK-NEXT: add w9, w9, w0
361
- ; CHECK-NEXT: add w8, w8, w1
362
355
; CHECK-NEXT: add w0, w9, w8
363
356
; CHECK-NEXT: ret
364
357
%r1 = call i32 @llvm.vector.reduce.add.v4i32 (<4 x i32 > %a )
@@ -372,12 +365,10 @@ define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
372
365
define i32 @nested_add_c1_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
373
366
; CHECK-LABEL: nested_add_c1_i32:
374
367
; CHECK: // %bb.0:
375
- ; CHECK-NEXT: addv s1, v1.4s
368
+ ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
369
+ ; CHECK-NEXT: add w8, w0, w1
376
370
; CHECK-NEXT: addv s0, v0.4s
377
- ; CHECK-NEXT: fmov w8, s1
378
371
; CHECK-NEXT: fmov w9, s0
379
- ; CHECK-NEXT: add w9, w0, w9
380
- ; CHECK-NEXT: add w8, w8, w1
381
372
; CHECK-NEXT: add w0, w9, w8
382
373
; CHECK-NEXT: ret
383
374
%r1 = call i32 @llvm.vector.reduce.add.v4i32 (<4 x i32 > %a )
@@ -391,12 +382,10 @@ define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
391
382
define i32 @nested_add_c2_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
392
383
; CHECK-LABEL: nested_add_c2_i32:
393
384
; CHECK: // %bb.0:
394
- ; CHECK-NEXT: addv s1, v1.4s
385
+ ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
386
+ ; CHECK-NEXT: add w8, w0, w1
395
387
; CHECK-NEXT: addv s0, v0.4s
396
- ; CHECK-NEXT: fmov w8, s1
397
388
; CHECK-NEXT: fmov w9, s0
398
- ; CHECK-NEXT: add w9, w9, w0
399
- ; CHECK-NEXT: add w8, w1, w8
400
389
; CHECK-NEXT: add w0, w9, w8
401
390
; CHECK-NEXT: ret
402
391
%r1 = call i32 @llvm.vector.reduce.add.v4i32 (<4 x i32 > %a )
@@ -429,19 +418,14 @@ define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c,
429
418
define i32 @nested_mul_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
430
419
; CHECK-LABEL: nested_mul_i32:
431
420
; CHECK: // %bb.0:
432
- ; CHECK-NEXT: ext v3.16b, v0.16b , v0.16b, #8
433
- ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
434
- ; CHECK-NEXT: mul v0.2s , v0.2s, v3.2s
435
- ; CHECK-NEXT: mul v1 .2s, v1 .2s, v2 .2s
436
- ; CHECK-NEXT: mov w8 , v0.s[1]
421
+ ; CHECK-NEXT: mul v0.4s , v0.4s, v1.4s
422
+ ; CHECK-NEXT: mul w8, w0, w1
423
+ ; CHECK-NEXT: ext v1.16b , v0.16b, v0.16b, #8
424
+ ; CHECK-NEXT: mul v0 .2s, v0 .2s, v1 .2s
425
+ ; CHECK-NEXT: mov w9 , v0.s[1]
437
426
; CHECK-NEXT: fmov w10, s0
438
- ; CHECK-NEXT: mov w9, v1.s[1]
439
- ; CHECK-NEXT: mul w8, w10, w8
440
- ; CHECK-NEXT: fmov w10, s1
441
427
; CHECK-NEXT: mul w9, w10, w9
442
- ; CHECK-NEXT: mul w8, w8, w0
443
- ; CHECK-NEXT: mul w9, w9, w1
444
- ; CHECK-NEXT: mul w0, w8, w9
428
+ ; CHECK-NEXT: mul w0, w9, w8
445
429
; CHECK-NEXT: ret
446
430
%r1 = call i32 @llvm.vector.reduce.mul.v4i32 (<4 x i32 > %a )
447
431
%a1 = mul i32 %r1 , %c
@@ -454,19 +438,14 @@ define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
454
438
define i32 @nested_and_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
455
439
; CHECK-LABEL: nested_and_i32:
456
440
; CHECK: // %bb.0:
457
- ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
458
- ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
459
- ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
460
- ; CHECK-NEXT: and v0.8b, v0.8b, v3.8b
461
- ; CHECK-NEXT: fmov x8, d1
441
+ ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
442
+ ; CHECK-NEXT: and w8, w0, w1
443
+ ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
444
+ ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
462
445
; CHECK-NEXT: fmov x9, d0
463
446
; CHECK-NEXT: lsr x10, x9, #32
464
- ; CHECK-NEXT: lsr x11, x8, #32
465
- ; CHECK-NEXT: and w9, w9, w0
466
- ; CHECK-NEXT: and w8, w8, w1
467
- ; CHECK-NEXT: and w9, w9, w10
468
- ; CHECK-NEXT: and w8, w8, w11
469
- ; CHECK-NEXT: and w0, w9, w8
447
+ ; CHECK-NEXT: and w8, w9, w8
448
+ ; CHECK-NEXT: and w0, w8, w10
470
449
; CHECK-NEXT: ret
471
450
%r1 = call i32 @llvm.vector.reduce.and.v4i32 (<4 x i32 > %a )
472
451
%a1 = and i32 %r1 , %c
@@ -479,19 +458,14 @@ define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
479
458
define i32 @nested_or_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
480
459
; CHECK-LABEL: nested_or_i32:
481
460
; CHECK: // %bb.0:
482
- ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
483
- ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
484
- ; CHECK-NEXT: orr v1.8b, v1.8b, v2.8b
485
- ; CHECK-NEXT: orr v0.8b, v0.8b, v3.8b
486
- ; CHECK-NEXT: fmov x8, d1
461
+ ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
462
+ ; CHECK-NEXT: orr w8, w0, w1
463
+ ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
464
+ ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
487
465
; CHECK-NEXT: fmov x9, d0
488
466
; CHECK-NEXT: lsr x10, x9, #32
489
- ; CHECK-NEXT: lsr x11, x8, #32
490
- ; CHECK-NEXT: orr w9, w9, w0
491
- ; CHECK-NEXT: orr w8, w8, w1
492
- ; CHECK-NEXT: orr w9, w9, w10
493
- ; CHECK-NEXT: orr w8, w8, w11
494
- ; CHECK-NEXT: orr w0, w9, w8
467
+ ; CHECK-NEXT: orr w8, w9, w8
468
+ ; CHECK-NEXT: orr w0, w8, w10
495
469
; CHECK-NEXT: ret
496
470
%r1 = call i32 @llvm.vector.reduce.or.v4i32 (<4 x i32 > %a )
497
471
%a1 = or i32 %r1 , %c
@@ -504,19 +478,14 @@ define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
504
478
define i32 @nested_xor_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
505
479
; CHECK-LABEL: nested_xor_i32:
506
480
; CHECK: // %bb.0:
507
- ; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
508
- ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
509
- ; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b
510
- ; CHECK-NEXT: eor v0.8b, v0.8b, v3.8b
511
- ; CHECK-NEXT: fmov x8, d1
481
+ ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
482
+ ; CHECK-NEXT: eor w8, w0, w1
483
+ ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
484
+ ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
512
485
; CHECK-NEXT: fmov x9, d0
513
486
; CHECK-NEXT: lsr x10, x9, #32
514
- ; CHECK-NEXT: lsr x11, x8, #32
515
- ; CHECK-NEXT: eor w9, w9, w0
516
- ; CHECK-NEXT: eor w8, w8, w1
517
- ; CHECK-NEXT: eor w9, w9, w10
518
- ; CHECK-NEXT: eor w8, w8, w11
519
- ; CHECK-NEXT: eor w0, w9, w8
487
+ ; CHECK-NEXT: eor w8, w9, w8
488
+ ; CHECK-NEXT: eor w0, w8, w10
520
489
; CHECK-NEXT: ret
521
490
%r1 = call i32 @llvm.vector.reduce.xor.v4i32 (<4 x i32 > %a )
522
491
%a1 = xor i32 %r1 , %c
@@ -529,14 +498,11 @@ define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
529
498
define i32 @nested_smin_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
530
499
; CHECK-LABEL: nested_smin_i32:
531
500
; CHECK: // %bb.0:
501
+ ; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
502
+ ; CHECK-NEXT: cmp w0, w1
503
+ ; CHECK-NEXT: csel w8, w0, w1, lt
532
504
; CHECK-NEXT: sminv s0, v0.4s
533
- ; CHECK-NEXT: sminv s1, v1.4s
534
505
; CHECK-NEXT: fmov w9, s0
535
- ; CHECK-NEXT: fmov w8, s1
536
- ; CHECK-NEXT: cmp w9, w0
537
- ; CHECK-NEXT: csel w9, w9, w0, lt
538
- ; CHECK-NEXT: cmp w8, w1
539
- ; CHECK-NEXT: csel w8, w8, w1, lt
540
506
; CHECK-NEXT: cmp w9, w8
541
507
; CHECK-NEXT: csel w0, w9, w8, lt
542
508
; CHECK-NEXT: ret
@@ -551,14 +517,11 @@ define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
551
517
define i32 @nested_smax_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
552
518
; CHECK-LABEL: nested_smax_i32:
553
519
; CHECK: // %bb.0:
520
+ ; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
521
+ ; CHECK-NEXT: cmp w0, w1
522
+ ; CHECK-NEXT: csel w8, w0, w1, gt
554
523
; CHECK-NEXT: smaxv s0, v0.4s
555
- ; CHECK-NEXT: smaxv s1, v1.4s
556
524
; CHECK-NEXT: fmov w9, s0
557
- ; CHECK-NEXT: fmov w8, s1
558
- ; CHECK-NEXT: cmp w9, w0
559
- ; CHECK-NEXT: csel w9, w9, w0, gt
560
- ; CHECK-NEXT: cmp w8, w1
561
- ; CHECK-NEXT: csel w8, w8, w1, gt
562
525
; CHECK-NEXT: cmp w9, w8
563
526
; CHECK-NEXT: csel w0, w9, w8, gt
564
527
; CHECK-NEXT: ret
@@ -573,14 +536,11 @@ define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
573
536
define i32 @nested_umin_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
574
537
; CHECK-LABEL: nested_umin_i32:
575
538
; CHECK: // %bb.0:
539
+ ; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
540
+ ; CHECK-NEXT: cmp w0, w1
541
+ ; CHECK-NEXT: csel w8, w0, w1, lo
576
542
; CHECK-NEXT: uminv s0, v0.4s
577
- ; CHECK-NEXT: uminv s1, v1.4s
578
543
; CHECK-NEXT: fmov w9, s0
579
- ; CHECK-NEXT: fmov w8, s1
580
- ; CHECK-NEXT: cmp w9, w0
581
- ; CHECK-NEXT: csel w9, w9, w0, lo
582
- ; CHECK-NEXT: cmp w8, w1
583
- ; CHECK-NEXT: csel w8, w8, w1, lo
584
544
; CHECK-NEXT: cmp w9, w8
585
545
; CHECK-NEXT: csel w0, w9, w8, lo
586
546
; CHECK-NEXT: ret
@@ -595,14 +555,11 @@ define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
595
555
define i32 @nested_umax_i32 (<4 x i32 > %a , <4 x i32 > %b , i32 %c , i32 %d ) {
596
556
; CHECK-LABEL: nested_umax_i32:
597
557
; CHECK: // %bb.0:
558
+ ; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
559
+ ; CHECK-NEXT: cmp w0, w1
560
+ ; CHECK-NEXT: csel w8, w0, w1, hi
598
561
; CHECK-NEXT: umaxv s0, v0.4s
599
- ; CHECK-NEXT: umaxv s1, v1.4s
600
562
; CHECK-NEXT: fmov w9, s0
601
- ; CHECK-NEXT: fmov w8, s1
602
- ; CHECK-NEXT: cmp w9, w0
603
- ; CHECK-NEXT: csel w9, w9, w0, hi
604
- ; CHECK-NEXT: cmp w8, w1
605
- ; CHECK-NEXT: csel w8, w8, w1, hi
606
563
; CHECK-NEXT: cmp w9, w8
607
564
; CHECK-NEXT: csel w0, w9, w8, hi
608
565
; CHECK-NEXT: ret
@@ -617,11 +574,10 @@ define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
617
574
define float @nested_fmin_float (<4 x float > %a , <4 x float > %b , float %c , float %d ) {
618
575
; CHECK-LABEL: nested_fmin_float:
619
576
; CHECK: // %bb.0:
620
- ; CHECK-NEXT: fminnmv s1, v1.4s
577
+ ; CHECK-NEXT: fminnm v0.4s, v0.4s, v1.4s
578
+ ; CHECK-NEXT: fminnm s2, s2, s3
621
579
; CHECK-NEXT: fminnmv s0, v0.4s
622
- ; CHECK-NEXT: fminnm s1, s1, s3
623
580
; CHECK-NEXT: fminnm s0, s0, s2
624
- ; CHECK-NEXT: fminnm s0, s0, s1
625
581
; CHECK-NEXT: ret
626
582
%r1 = call float @llvm.vector.reduce.fmin.v4f32 (<4 x float > %a )
627
583
%a1 = call float @llvm.minnum.f32 (float %r1 , float %c )
@@ -634,11 +590,10 @@ define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float
634
590
define float @nested_fmax_float (<4 x float > %a , <4 x float > %b , float %c , float %d ) {
635
591
; CHECK-LABEL: nested_fmax_float:
636
592
; CHECK: // %bb.0:
637
- ; CHECK-NEXT: fmaxnmv s1, v1.4s
593
+ ; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
594
+ ; CHECK-NEXT: fmaxnm s2, s2, s3
638
595
; CHECK-NEXT: fmaxnmv s0, v0.4s
639
- ; CHECK-NEXT: fmaxnm s1, s1, s3
640
596
; CHECK-NEXT: fmaxnm s0, s0, s2
641
- ; CHECK-NEXT: fmaxnm s0, s0, s1
642
597
; CHECK-NEXT: ret
643
598
%r1 = call float @llvm.vector.reduce.fmax.v4f32 (<4 x float > %a )
644
599
%a1 = call float @llvm.maxnum.f32 (float %r1 , float %c )
0 commit comments