From 50c743fa713002fe4e0c76d23043e6c1f9e9fe6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Thu, 13 Aug 2020 12:45:14 +0200 Subject: [PATCH 01/23] [BPI] Improve static heuristics for integer comparisons Similarly as for pointers, even for integers a == b is usually false. GCC also uses this heuristic. Reviewed By: ebrevnov Differential Revision: https://reviews.llvm.org/D85781 --- .../test/profile/Linux/counter_promo_for.c | 16 +- .../test/profile/Linux/counter_promo_while.c | 14 +- .../llvm/Analysis/BranchProbabilityInfo.h | 2 +- llvm/lib/Analysis/BranchProbabilityInfo.cpp | 27 +- ...ro_heuristics.ll => integer_heuristics.ll} | 51 + .../Analysis/BranchProbabilityInfo/loop.ll | 16 +- llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll | 1 - .../CodeGen/AArch64/branch-relax-alignment.ll | 27 +- .../AArch64/combine-comparisons-by-cse.ll | 441 +++++++-- llvm/test/CodeGen/AArch64/cond-br-tuning.ll | 139 ++- .../CodeGen/AArch64/fast-isel-cmp-branch.ll | 347 +++++-- .../CodeGen/ARM/2011-12-14-machine-sink.ll | 7 +- llvm/test/CodeGen/ARM/cmpxchg-weak.ll | 53 +- llvm/test/CodeGen/ARM/lsr-unfolded-offset.ll | 8 +- llvm/test/CodeGen/ARM/machine-cse-cmp.ll | 2 +- llvm/test/CodeGen/Hexagon/newvaluejump2.ll | 30 +- llvm/test/CodeGen/Mips/brcongt.ll | 25 +- llvm/test/CodeGen/Mips/brconlt.ll | 45 +- llvm/test/CodeGen/Mips/brconne.ll | 26 +- .../Mips/compactbranches/no-beqzc-bnezc.ll | 377 +++++++- llvm/test/CodeGen/Mips/lcb2.ll | 300 +++++- llvm/test/CodeGen/Mips/lcb5.ll | 352 ++++++- .../compact-branches-long-branch.ll | 233 ++++- llvm/test/CodeGen/Mips/seleq.ll | 84 +- llvm/test/CodeGen/Mips/selle.ll | 82 +- llvm/test/CodeGen/PowerPC/brcond.ll | 583 +++++++++--- .../memCmpUsedInZeroEqualityComparison.ll | 18 +- .../PowerPC/redundant-copy-after-tail-dup.ll | 28 +- llvm/test/CodeGen/RISCV/branch.ll | 45 +- .../RISCV/rv64m-w-insts-legalization.ll | 10 +- llvm/test/CodeGen/SystemZ/int-cmp-37.ll | 78 +- llvm/test/CodeGen/SystemZ/int-cmp-40.ll | 78 +- 
.../LowOverheadLoops/mve-float-loops.ll | 3 +- .../varying-outer-2d-reduction.ll | 21 +- .../CodeGen/Thumb2/mve-postinc-distribute.ll | 3 +- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll | 139 ++- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 113 ++- llvm/test/CodeGen/Thumb2/thumb2-branch.ll | 4 +- llvm/test/CodeGen/X86/3addr-16bit.ll | 80 +- llvm/test/CodeGen/X86/absolute-cmp.ll | 48 +- llvm/test/CodeGen/X86/atomic-flags.ll | 97 +- llvm/test/CodeGen/X86/atomic-unordered.ll | 20 +- llvm/test/CodeGen/X86/bmi.ll | 34 +- llvm/test/CodeGen/X86/bt.ll | 56 +- .../CodeGen/X86/conditional-tailcall-pgso.ll | 48 +- llvm/test/CodeGen/X86/conditional-tailcall.ll | 48 +- .../test/CodeGen/X86/fast-isel-cmp-branch2.ll | 317 +++++-- llvm/test/CodeGen/X86/funnel-shift.ll | 18 +- .../X86/indirect-branch-tracking-eh2.ll | 220 ++++- llvm/test/CodeGen/X86/jump_sign.ll | 14 +- llvm/test/CodeGen/X86/lsr-negative-stride.ll | 9 +- llvm/test/CodeGen/X86/machine-cse.ll | 7 +- .../CodeGen/X86/memcmp-more-load-pairs.ll | 875 +++++++++--------- llvm/test/CodeGen/X86/memcmp-optsize.ll | 158 ++-- llvm/test/CodeGen/X86/memcmp-pgso.ll | 166 ++-- llvm/test/CodeGen/X86/memcmp.ll | 433 ++++----- llvm/test/CodeGen/X86/neg_cmp.ll | 16 +- llvm/test/CodeGen/X86/nobt.ll | 14 +- llvm/test/CodeGen/X86/pr29170.ll | 10 +- llvm/test/CodeGen/X86/wide-integer-cmp.ll | 20 +- llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll | 23 +- .../PGOProfile/counter_promo_mexits.ll | 127 ++- llvm/test/Transforms/PGOProfile/landingpad.ll | 3 +- 63 files changed, 4757 insertions(+), 1932 deletions(-) rename llvm/test/Analysis/BranchProbabilityInfo/{zero_heuristics.ll => integer_heuristics.ll} (67%) diff --git a/compiler-rt/test/profile/Linux/counter_promo_for.c b/compiler-rt/test/profile/Linux/counter_promo_for.c index 0efebdc95271db..7cab70b08773bc 100644 --- a/compiler-rt/test/profile/Linux/counter_promo_for.c +++ b/compiler-rt/test/profile/Linux/counter_promo_for.c @@ -2,7 +2,7 @@ // RUN: rm -fr %t.nopromo.prof // RUN: 
%clang_pgogen=%t.promo.prof/ -o %t.promo.gen -O2 %s // RUN: %clang_pgogen=%t.promo.prof/ -o %t.promo.gen.ll -emit-llvm -S -O2 %s -// RUN: cat %t.promo.gen.ll | FileCheck --check-prefix=PROMO %s +// RUN: cp %t.promo.gen.ll /tmp/d.txt ; cat %t.promo.gen.ll | FileCheck --check-prefix=PROMO %s // RUN: %run %t.promo.gen // RUN: llvm-profdata merge -o %t.promo.profdata %t.promo.prof/ // RUN: llvm-profdata show --counts --all-functions %t.promo.profdata > %t.promo.dump @@ -22,23 +22,23 @@ __attribute__((noinline)) void foo(int n, int N) { // PROMO: load{{.*}}@__profc_foo{{.*}} 3){{.*}} // PROMO-NEXT: add // PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 3){{.*}} -// PROMO: load{{.*}}@__profc_foo{{.*}} 0){{.*}} -// PROMO-NEXT: add -// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} -// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 1){{.*}} +// PROMO: load{{.*}}@__profc_foo{{.*}} 1){{.*}} // PROMO-NEXT: add // PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}} +// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 0){{.*}} +// PROMO-NEXT: add +// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} // PROMO: load{{.*}}@__profc_foo{{.*}} 2){{.*}} // PROMO-NEXT: add // PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}} // // NOPROMO-LABEL: @foo -// NOPROMO: load{{.*}}@__profc_foo{{.*}} 0){{.*}} -// NOPROMO-NEXT: add -// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} // NOPROMO: load{{.*}}@__profc_foo{{.*}} 1){{.*}} // NOPROMO-NEXT: add // NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}} +// NOPROMO: load{{.*}}@__profc_foo{{.*}} 0){{.*}} +// NOPROMO-NEXT: add +// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} // NOPROMO: load{{.*}}@__profc_foo{{.*}} 2){{.*}} // NOPROMO-NEXT: add // NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}} diff --git a/compiler-rt/test/profile/Linux/counter_promo_while.c b/compiler-rt/test/profile/Linux/counter_promo_while.c index 183ef8543affb5..fdd77982c2bf69 100644 --- a/compiler-rt/test/profile/Linux/counter_promo_while.c +++ 
b/compiler-rt/test/profile/Linux/counter_promo_while.c @@ -17,23 +17,23 @@ int g; __attribute__((noinline)) void bar(int i) { g += i; } __attribute__((noinline)) void foo(int n, int N) { // PROMO-LABEL: @foo -// PROMO: load{{.*}}@__profc_foo{{.*}} 0){{.*}} -// PROMO-NEXT: add -// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} -// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 1){{.*}} +// PROMO: load{{.*}}@__profc_foo{{.*}} 1){{.*}} // PROMO-NEXT: add // PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}} +// PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 0){{.*}} +// PROMO-NEXT: add +// PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} // PROMO-NEXT: load{{.*}}@__profc_foo{{.*}} 2){{.*}} // PROMO-NEXT: add // PROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}} // // NOPROMO-LABEL: @foo -// NOPROMO: load{{.*}}@__profc_foo{{.*}} 0){{.*}} -// NOPROMO-NEXT: add -// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} // NOPROMO: load{{.*}}@__profc_foo{{.*}} 1){{.*}} // NOPROMO-NEXT: add // NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 1){{.*}} +// NOPROMO: load{{.*}}@__profc_foo{{.*}} 0){{.*}} +// NOPROMO-NEXT: add +// NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 0){{.*}} // NOPROMO: load{{.*}}@__profc_foo{{.*}} 2){{.*}} // NOPROMO-NEXT: add // NOPROMO-NEXT: store{{.*}}@__profc_foo{{.*}} 2){{.*}} diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index 447f14501cb65d..2c736fe9c1f07a 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -315,7 +315,7 @@ class BranchProbabilityInfo { bool calcColdCallHeuristics(const BasicBlock *BB); bool calcPointerHeuristics(const BasicBlock *BB); bool calcLoopBranchHeuristics(const BasicBlock *BB, const LoopInfo &LI); - bool calcZeroHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI); + bool calcIntegerHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI); bool 
calcFloatingPointHeuristics(const BasicBlock *BB); bool calcInvokeHeuristics(const BasicBlock *BB); }; diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp index eae2c4ea9da887..6df90e66b84b8c 100644 --- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -122,8 +122,8 @@ static const uint32_t CC_NONTAKEN_WEIGHT = 64; static const uint32_t PH_TAKEN_WEIGHT = 20; static const uint32_t PH_NONTAKEN_WEIGHT = 12; -static const uint32_t ZH_TAKEN_WEIGHT = 20; -static const uint32_t ZH_NONTAKEN_WEIGHT = 12; +static const uint32_t INTH_TAKEN_WEIGHT = 20; +static const uint32_t INTH_NONTAKEN_WEIGHT = 12; static const uint32_t FPH_TAKEN_WEIGHT = 20; static const uint32_t FPH_NONTAKEN_WEIGHT = 12; @@ -856,7 +856,7 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB, return true; } -bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB, +bool BranchProbabilityInfo::calcIntegerHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI) { const BranchInst *BI = dyn_cast(BB->getTerminator()); if (!BI || !BI->isConditional()) @@ -873,10 +873,21 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB, return dyn_cast(V); }; + BranchProbability TakenProb(INTH_TAKEN_WEIGHT, + INTH_TAKEN_WEIGHT + INTH_NONTAKEN_WEIGHT); + BranchProbability UntakenProb(INTH_NONTAKEN_WEIGHT, + INTH_TAKEN_WEIGHT + INTH_NONTAKEN_WEIGHT); Value *RHS = CI->getOperand(1); ConstantInt *CV = GetConstantInt(RHS); - if (!CV) - return false; + if (!CV) { + // X == Y -> Unlikely + // Otherwise -> Likely + if (CI->isTrueWhenEqual()) + std::swap(TakenProb, UntakenProb); + setEdgeProbability( + BB, SmallVector({TakenProb, UntakenProb})); + return true; + } // If the LHS is the result of AND'ing a value with a single bit bitmask, // we don't have information about probabilities. 
@@ -964,10 +975,6 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB, return false; } - BranchProbability TakenProb(ZH_TAKEN_WEIGHT, - ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); - BranchProbability UntakenProb(ZH_NONTAKEN_WEIGHT, - ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT); if (!isProb) std::swap(TakenProb, UntakenProb); @@ -1221,7 +1228,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, continue; if (calcPointerHeuristics(BB)) continue; - if (calcZeroHeuristics(BB, TLI)) + if (calcIntegerHeuristics(BB, TLI)) continue; if (calcFloatingPointHeuristics(BB)) continue; diff --git a/llvm/test/Analysis/BranchProbabilityInfo/zero_heuristics.ll b/llvm/test/Analysis/BranchProbabilityInfo/integer_heuristics.ll similarity index 67% rename from llvm/test/Analysis/BranchProbabilityInfo/zero_heuristics.ll rename to llvm/test/Analysis/BranchProbabilityInfo/integer_heuristics.ll index c6e1cb8c265a3b..c1d894a712fcfa 100644 --- a/llvm/test/Analysis/BranchProbabilityInfo/zero_heuristics.ll +++ b/llvm/test/Analysis/BranchProbabilityInfo/integer_heuristics.ll @@ -101,3 +101,54 @@ for.inc: exit: ret void } + +declare void @foo() + +; CHECK-LABEL: foo1 +define i32 @foo1(i32 %x, i32 %y, i8 signext %z, i8 signext %w) { +entry: + %c = icmp eq i32 %x, %y + br i1 %c, label %then, label %else +; CHECK: edge entry -> then probability is 0x30000000 / 0x80000000 = 37.50% +; CHECK: edge entry -> else probability is 0x50000000 / 0x80000000 = 62.50% +then: + tail call void @foo() + br label %else +; CHECK: edge then -> else probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] +else: + %v = phi i8 [ %z, %then ], [ %w, %entry ] + %r = sext i8 %v to i32 + ret i32 %r +} + +; CHECK-LABEL: foo2 +define i32 @foo2(i32 %x, i32 %y, i8 signext %z, i8 signext %w) { +entry: + %c = icmp ne i32 %x, %y + br i1 %c, label %then, label %else +; CHECK: edge entry -> then probability is 0x50000000 / 0x80000000 = 62.50% +; CHECK: edge entry -> else probability is 
0x30000000 / 0x80000000 = 37.50% +then: + br label %else +; CHECK: edge then -> else probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] +else: + %v = phi i8 [ %z, %then ], [ %w, %entry ] + %r = sext i8 %v to i32 + ret i32 %r +} + +; CHECK-LABEL: foo3 +define i32 @foo3(i32 %x, i32 %y, i8 signext %z, i8 signext %w) { +entry: + %c = icmp ult i32 %x, %y + br i1 %c, label %then, label %else +; CHECK: edge entry -> then probability is 0x50000000 / 0x80000000 = 62.50% +; CHECK: edge entry -> else probability is 0x30000000 / 0x80000000 = 37.50% +then: + br label %else +; CHECK: edge then -> else probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] +else: + %v = phi i8 [ %z, %then ], [ %w, %entry ] + %r = sext i8 %v to i32 + ret i32 %r +} diff --git a/llvm/test/Analysis/BranchProbabilityInfo/loop.ll b/llvm/test/Analysis/BranchProbabilityInfo/loop.ll index 63377e3ba955d2..fde631a273c57d 100644 --- a/llvm/test/Analysis/BranchProbabilityInfo/loop.ll +++ b/llvm/test/Analysis/BranchProbabilityInfo/loop.ll @@ -263,8 +263,8 @@ for.body: %0 = load i32, i32* %c, align 4 %cmp1 = icmp eq i32 %0, %i.011 br i1 %cmp1, label %for.inc5, label %if.end -; CHECK: edge for.body -> for.inc5 probability is 0x40000000 / 0x80000000 = 50.00% -; CHECK: edge for.body -> if.end probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge for.body -> for.inc5 probability is 0x30000000 / 0x80000000 = 37.50% +; CHECK: edge for.body -> if.end probability is 0x50000000 / 0x80000000 = 62.50% if.end: call void @g1() @@ -324,22 +324,22 @@ for.body3: %0 = load i32, i32* %c, align 4 %cmp4 = icmp eq i32 %0, %j.017 br i1 %cmp4, label %for.inc, label %if.end -; CHECK: edge for.body3 -> for.inc probability is 0x40000000 / 0x80000000 = 50.00% -; CHECK: edge for.body3 -> if.end probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge for.body3 -> for.inc probability is 0x30000000 / 0x80000000 = 37.50% +; CHECK: edge for.body3 -> if.end probability is 0x50000000 / 0x80000000 = 62.50% if.end: %1 
= load i32, i32* %arrayidx5, align 4 %cmp6 = icmp eq i32 %1, %j.017 br i1 %cmp6, label %for.inc, label %if.end8 -; CHECK: edge if.end -> for.inc probability is 0x40000000 / 0x80000000 = 50.00% -; CHECK: edge if.end -> if.end8 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge if.end -> for.inc probability is 0x30000000 / 0x80000000 = 37.50% +; CHECK: edge if.end -> if.end8 probability is 0x50000000 / 0x80000000 = 62.50% if.end8: %2 = load i32, i32* %arrayidx9, align 4 %cmp10 = icmp eq i32 %2, %j.017 br i1 %cmp10, label %for.inc, label %if.end12 -; CHECK: edge if.end8 -> for.inc probability is 0x40000000 / 0x80000000 = 50.00% -; CHECK: edge if.end8 -> if.end12 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge if.end8 -> for.inc probability is 0x30000000 / 0x80000000 = 37.50% +; CHECK: edge if.end8 -> if.end12 probability is 0x50000000 / 0x80000000 = 62.50% if.end12: call void @g2() diff --git a/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll b/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll index df9534ffde0973..b74873c61748b5 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-gep-opt.ll @@ -88,7 +88,6 @@ exit: ; CHECK-LABEL: test_GEP_across_BB: ; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #528] ; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #532] -; CHECK-NOT: add ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #532] ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #528] diff --git a/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll b/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll index 7135dff7f5732b..308917be00152d 100644 --- a/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll +++ b/llvm/test/CodeGen/AArch64/branch-relax-alignment.ll @@ -1,19 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-bcc-offset-bits=4 -align-all-nofallthru-blocks=4 < %s | FileCheck %s ; Long branch is assumed because the block has a higher alignment ; 
requirement than the function. -; CHECK-LABEL: invert_bcc_block_align_higher_func: -; CHECK: b.eq [[JUMP_BB1:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: b [[JUMP_BB2:LBB[0-9]+_[0-9]+]] - -; CHECK: [[JUMP_BB1]]: -; CHECK: ret -; CHECK: .p2align 4 - -; CHECK: [[JUMP_BB2]]: -; CHECK: ret define i32 @invert_bcc_block_align_higher_func(i32 %x, i32 %y) align 4 #0 { +; CHECK-LABEL: invert_bcc_block_align_higher_func: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.ne LBB0_1 +; CHECK-NEXT: b LBB0_2 +; CHECK-NEXT: LBB0_1: ; %bb2 +; CHECK-NEXT: mov w8, #9 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: str w8, [x8] +; CHECK-NEXT: ret +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: LBB0_2: ; %bb1 +; CHECK-NEXT: mov w8, #42 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: str w8, [x8] +; CHECK-NEXT: ret %1 = icmp eq i32 %x, %y br i1 %1, label %bb1, label %bb2 diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll index e7c6e3b5ef7b16..f8aab08da1cdcc 100644 --- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -debugify-and-strip-all-safe < %s -mtriple=aarch64-linux-gnu | FileCheck %s ; marked as external to prevent possible optimizations @@ -8,12 +9,34 @@ ; (a > 10 && b == c) || (a >= 10 && b == d) define i32 @combine_gt_ge_10() #0 { -; CHECK-LABEL: combine_gt_ge_10 -; CHECK: cmp -; CHECK: b.le -; CHECK: ret -; CHECK-NOT: cmp -; CHECK: b.lt +; CHECK-LABEL: combine_gt_ge_10: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmp w8, #10 // =10 +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: b.le .LBB0_2 +; CHECK-NEXT: // %bb.1: // %land.lhs.true +; CHECK-NEXT: adrp x10, c +; CHECK-NEXT: ldr w9, [x8, :lo12:b] +; CHECK-NEXT: ldr w10, [x10, :lo12:c] +; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: b.ne .LBB0_3 
+; CHECK-NEXT: b .LBB0_5 +; CHECK-NEXT: .LBB0_2: // %lor.lhs.false +; CHECK-NEXT: b.lt .LBB0_4 +; CHECK-NEXT: .LBB0_3: // %land.lhs.true3 +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB0_5 +; CHECK-NEXT: .LBB0_4: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp = icmp sgt i32 %0, 10 @@ -45,12 +68,35 @@ return: ; preds = %if.end, %land.lhs.t ; (a > 5 && b == c) || (a < 5 && b == d) define i32 @combine_gt_lt_5() #0 { -; CHECK-LABEL: combine_gt_lt_5 -; CHECK: cmp -; CHECK: b.le -; CHECK: ret -; CHECK-NOT: cmp -; CHECK: b.ge +; CHECK-LABEL: combine_gt_lt_5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmp w8, #5 // =5 +; CHECK-NEXT: b.le .LBB1_2 +; CHECK-NEXT: // %bb.1: // %land.lhs.true +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, c +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:c] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.ne .LBB1_4 +; CHECK-NEXT: b .LBB1_5 +; CHECK-NEXT: .LBB1_2: // %lor.lhs.false +; CHECK-NEXT: b.ge .LBB1_4 +; CHECK-NEXT: // %bb.3: // %land.lhs.true3 +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB1_5 +; CHECK-NEXT: .LBB1_4: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_5: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp = icmp sgt i32 %0, 5 @@ -82,12 +128,34 @@ return: ; preds = %if.end, %land.lhs.t ; (a < 5 && b == c) || (a <= 5 && b == d) define i32 @combine_lt_ge_5() #0 { -; CHECK-LABEL: combine_lt_ge_5 -; CHECK: cmp -; CHECK: b.ge -; CHECK: ret -; CHECK-NOT: cmp -; CHECK: b.gt +; CHECK-LABEL: combine_lt_ge_5: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmp w8, #5 // =5 +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: b.ge .LBB2_2 +; CHECK-NEXT: // %bb.1: // %land.lhs.true +; CHECK-NEXT: adrp x10, c +; CHECK-NEXT: ldr w9, [x8, :lo12:b] +; CHECK-NEXT: ldr w10, [x10, :lo12:c] +; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: b.ne .LBB2_3 +; CHECK-NEXT: b .LBB2_5 +; CHECK-NEXT: .LBB2_2: // %lor.lhs.false +; CHECK-NEXT: b.gt .LBB2_4 +; CHECK-NEXT: .LBB2_3: // %land.lhs.true3 +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB2_5 +; CHECK-NEXT: .LBB2_4: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_5: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp = icmp slt i32 %0, 5 @@ -119,12 +187,35 @@ return: ; preds = %if.end, %land.lhs.t ; (a < 5 && b == c) || (a > 5 && b == d) define i32 @combine_lt_gt_5() #0 { -; CHECK-LABEL: combine_lt_gt_5 -; CHECK: cmp -; CHECK: b.ge -; CHECK: ret -; CHECK-NOT: cmp -; CHECK: b.le +; CHECK-LABEL: combine_lt_gt_5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmp w8, #5 // =5 +; CHECK-NEXT: b.ge .LBB3_2 +; CHECK-NEXT: // %bb.1: // %land.lhs.true +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, c +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:c] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.ne .LBB3_4 +; CHECK-NEXT: b .LBB3_5 +; CHECK-NEXT: .LBB3_2: // %lor.lhs.false +; CHECK-NEXT: b.le .LBB3_4 +; CHECK-NEXT: // %bb.3: // %land.lhs.true3 +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB3_5 +; CHECK-NEXT: .LBB3_4: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_5: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret entry: %0 = 
load i32, i32* @a, align 4 %cmp = icmp slt i32 %0, 5 @@ -156,12 +247,35 @@ return: ; preds = %if.end, %land.lhs.t ; (a > -5 && b == c) || (a < -5 && b == d) define i32 @combine_gt_lt_n5() #0 { -; CHECK-LABEL: combine_gt_lt_n5 -; CHECK: cmn -; CHECK: b.le -; CHECK: ret -; CHECK-NOT: cmn -; CHECK: b.ge +; CHECK-LABEL: combine_gt_lt_n5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmn w8, #5 // =5 +; CHECK-NEXT: b.le .LBB4_2 +; CHECK-NEXT: // %bb.1: // %land.lhs.true +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, c +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:c] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.ne .LBB4_4 +; CHECK-NEXT: b .LBB4_5 +; CHECK-NEXT: .LBB4_2: // %lor.lhs.false +; CHECK-NEXT: b.ge .LBB4_4 +; CHECK-NEXT: // %bb.3: // %land.lhs.true3 +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB4_5 +; CHECK-NEXT: .LBB4_4: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_5: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp = icmp sgt i32 %0, -5 @@ -193,12 +307,35 @@ return: ; preds = %if.end, %land.lhs.t ; (a < -5 && b == c) || (a > -5 && b == d) define i32 @combine_lt_gt_n5() #0 { -; CHECK-LABEL: combine_lt_gt_n5 -; CHECK: cmn -; CHECK: b.ge -; CHECK: ret -; CHECK-NOT: cmn -; CHECK: b.le +; CHECK-LABEL: combine_lt_gt_n5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmn w8, #5 // =5 +; CHECK-NEXT: b.ge .LBB5_2 +; CHECK-NEXT: // %bb.1: // %land.lhs.true +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, c +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:c] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.ne .LBB5_4 +; CHECK-NEXT: b .LBB5_5 +; CHECK-NEXT: .LBB5_2: // %lor.lhs.false +; CHECK-NEXT: b.le 
.LBB5_4 +; CHECK-NEXT: // %bb.3: // %land.lhs.true3 +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB5_5 +; CHECK-NEXT: .LBB5_4: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_5: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp = icmp slt i32 %0, -5 @@ -236,6 +373,38 @@ declare %struct.Struct* @Update(%struct.Struct*) #1 ; no checks for this case, it just should be processed without errors define void @combine_non_adjacent_cmp_br(%struct.Struct* nocapture readonly %hdCall) #0 { +; CHECK-LABEL: combine_non_adjacent_cmp_br: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w30, -48 +; CHECK-NEXT: ldr x19, [x0] +; CHECK-NEXT: mov w20, #24 +; CHECK-NEXT: adrp x22, glob +; CHECK-NEXT: add x21, x19, #2 // =2 +; CHECK-NEXT: .LBB6_1: // %land.rhs +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x8, [x20] +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: b.lt .LBB6_3 +; CHECK-NEXT: // %bb.2: // %while.body +; CHECK-NEXT: // in Loop: Header=BB6_1 Depth=1 +; CHECK-NEXT: ldr x0, [x22, :lo12:glob] +; CHECK-NEXT: bl Update +; CHECK-NEXT: sub x21, x21, #2 // =2 +; CHECK-NEXT: cmp x19, x21 +; CHECK-NEXT: b.lt .LBB6_1 +; CHECK-NEXT: .LBB6_3: // %while.end +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %size = getelementptr inbounds 
%struct.Struct, %struct.Struct* %hdCall, i64 0, i32 0 %0 = load i64, i64* %size, align 8 @@ -262,11 +431,49 @@ while.end: declare void @do_something() #1 define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { -; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ -; CHECK: cmn -; CHECK: b.gt -; CHECK: cmp -; CHECK: b.gt +; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: adrp x19, a +; CHECK-NEXT: ldr w8, [x19, :lo12:a] +; CHECK-NEXT: cmn w8, #2 // =2 +; CHECK-NEXT: b.le .LBB7_2 +; CHECK-NEXT: // %bb.1: // %while.end +; CHECK-NEXT: cmp w8, #1 // =1 +; CHECK-NEXT: b.le .LBB7_5 +; CHECK-NEXT: b .LBB7_6 +; CHECK-NEXT: .LBB7_2: // %while.body.preheader +; CHECK-NEXT: sub w20, w8, #1 // =1 +; CHECK-NEXT: .LBB7_3: // %while.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: bl do_something +; CHECK-NEXT: adds w20, w20, #1 // =1 +; CHECK-NEXT: b.mi .LBB7_3 +; CHECK-NEXT: // %bb.4: // %while.cond.while.end_crit_edge +; CHECK-NEXT: ldr w8, [x19, :lo12:a] +; CHECK-NEXT: cmp w8, #1 // =1 +; CHECK-NEXT: b.gt .LBB7_6 +; CHECK-NEXT: .LBB7_5: // %land.lhs.true +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB7_7 +; CHECK-NEXT: .LBB7_6: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: b .LBB7_8 +; CHECK-NEXT: .LBB7_7: +; CHECK-NEXT: mov w0, #123 +; CHECK-NEXT: .LBB7_8: // %return +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp4 = icmp slt i32 %0, -1 @@ -306,11 
+513,43 @@ return: ; preds = %if.end, %land.lhs.t } define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 { -; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other -; CHECK: cmp -; CHECK: b.gt -; CHECK: cmn -; CHECK: b.lt +; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: b.gt .LBB8_3 +; CHECK-NEXT: // %bb.1: // %while.body.preheader +; CHECK-NEXT: sub w19, w8, #1 // =1 +; CHECK-NEXT: .LBB8_2: // %while.body +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: bl do_something +; CHECK-NEXT: adds w19, w19, #1 // =1 +; CHECK-NEXT: b.mi .LBB8_2 +; CHECK-NEXT: .LBB8_3: // %while.end +; CHECK-NEXT: adrp x8, c +; CHECK-NEXT: ldr w8, [x8, :lo12:c] +; CHECK-NEXT: cmn w8, #2 // =2 +; CHECK-NEXT: b.lt .LBB8_5 +; CHECK-NEXT: // %bb.4: // %land.lhs.true +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB8_6 +; CHECK-NEXT: .LBB8_5: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_6: +; CHECK-NEXT: mov w0, #123 +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp4 = icmp slt i32 %0, 1 @@ -356,19 +595,44 @@ return: ; preds = %if.end, %land.lhs.t ; b.gt .LBB0_5 define i32 @fcmpri(i32 %argc, i8** nocapture readonly %argv) { - ; CHECK-LABEL: fcmpri: -; CHECK: cmp w0, #2 -; CHECK: b.lt .LBB9_3 -; CHECK-NOT: cmp w0, #1 -; CHECK-NOT: b.le .LBB9_3 - -; CHECK-LABEL-DAG: .LBB9_3 -; CHECK: cmp w19, #0 -; CHECK: fcmp d8, 
#0.0 -; CHECK-NOT: cmp w19, #1 -; CHECK-NOT: b.ge .LBB9_5 - +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_offset b8, -32 +; CHECK-NEXT: cmp w0, #2 // =2 +; CHECK-NEXT: b.lt .LBB9_3 +; CHECK-NEXT: // %bb.1: // %land.lhs.true +; CHECK-NEXT: ldr x8, [x1, #8] +; CHECK-NEXT: cbz x8, .LBB9_3 +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov w0, #3 +; CHECK-NEXT: b .LBB9_4 +; CHECK-NEXT: .LBB9_3: // %if.end +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: bl zoo +; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: mov w0, #-1 +; CHECK-NEXT: bl yoo +; CHECK-NEXT: cmp w19, #0 // =0 +; CHECK-NEXT: cinc w0, w19, gt +; CHECK-NEXT: mov w1, #2 +; CHECK-NEXT: mov v8.16b, v0.16b +; CHECK-NEXT: bl xoo +; CHECK-NEXT: fmov d0, #-1.00000000 +; CHECK-NEXT: fadd d0, d8, d0 +; CHECK-NEXT: fcmp d8, #0.0 +; CHECK-NEXT: fcsel d0, d8, d0, gt +; CHECK-NEXT: fmov d1, #-2.00000000 +; CHECK-NEXT: bl woo +; CHECK-NEXT: mov w0, #4 +; CHECK-NEXT: .LBB9_4: // %return +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %cmp = icmp sgt i32 %argc, 1 br i1 %cmp, label %land.lhs.true, label %if.end @@ -405,10 +669,27 @@ return: ; preds = %land.lhs.true, %con define void @cmp_shifted(i32 %in, i32 %lhs, i32 %rhs) { ; CHECK-LABEL: cmp_shifted: -; CHECK: cmp w0, #2, lsl #12 -; [...] -; CHECK: cmp w0, #1 - +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: cmp w0, #2, lsl #12 // =8192 +; CHECK-NEXT: b.lt .LBB10_2 +; CHECK-NEXT: // %bb.1: // %true +; CHECK-NEXT: mov w0, #128 +; CHECK-NEXT: b .LBB10_5 +; CHECK-NEXT: .LBB10_2: // %false +; CHECK-NEXT: cmp w0, #1 // =1 +; CHECK-NEXT: b.lt .LBB10_4 +; CHECK-NEXT: // %bb.3: // %truer +; CHECK-NEXT: mov w0, #42 +; CHECK-NEXT: b .LBB10_5 +; CHECK-NEXT: .LBB10_4: // %falser +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: .LBB10_5: // %true +; CHECK-NEXT: bl zoo +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret %tst_low = icmp sgt i32 %in, 8191 br i1 %tst_low, label %true, label %false @@ -430,10 +711,38 @@ falser: } define i32 @combine_gt_ge_sel(i64 %v, i64* %p) #0 { -; CHECK-LABEL: combine_gt_ge_sel -; CHECK: ldr [[reg1:w[0-9]*]], -; CHECK: cmp [[reg1]], #0 -; CHECK: csel {{.*}}, gt +; CHECK-LABEL: combine_gt_ge_sel: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, a +; CHECK-NEXT: ldr w8, [x8, :lo12:a] +; CHECK-NEXT: cmp w8, #0 // =0 +; CHECK-NEXT: csel x9, x0, xzr, gt +; CHECK-NEXT: str x9, [x1] +; CHECK-NEXT: b.le .LBB11_2 +; CHECK-NEXT: // %bb.1: // %lor.lhs.false +; CHECK-NEXT: cmp w8, #2 // =2 +; CHECK-NEXT: b.ge .LBB11_3 +; CHECK-NEXT: b .LBB11_4 +; CHECK-NEXT: .LBB11_2: // %land.lhs.true +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, c +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:c] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB11_5 +; CHECK-NEXT: .LBB11_3: // %land.lhs.true3 +; CHECK-NEXT: adrp x8, b +; CHECK-NEXT: adrp x9, d +; CHECK-NEXT: ldr w8, [x8, :lo12:b] +; CHECK-NEXT: ldr w9, [x9, :lo12:d] +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.eq .LBB11_5 +; CHECK-NEXT: .LBB11_4: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_5: +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret entry: %0 = load i32, i32* @a, align 4 %cmp = icmp sgt i32 %0, 0 diff --git 
a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll index e0b6a2f0503732..e98d4110fd2760 100644 --- a/llvm/test/CodeGen/AArch64/cond-br-tuning.ll +++ b/llvm/test/CodeGen/AArch64/cond-br-tuning.ll @@ -1,14 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -debugify-and-strip-all-safe < %s -O3 -mtriple=aarch64-eabi -verify-machineinstrs | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-linaro-linux-gnueabi" ; CMN is an alias of ADDS. -; CHECK-LABEL: test_add_cbz: -; CHECK: cmn w0, w1 -; CHECK: b.eq -; CHECK: ret define void @test_add_cbz(i32 %a, i32 %b, i32* %ptr) { +; CHECK-LABEL: test_add_cbz: +; CHECK: // %bb.0: +; CHECK-NEXT: cmn w0, w1 +; CHECK-NEXT: b.eq .LBB0_2 +; CHECK-NEXT: // %bb.1: // %L1 +; CHECK-NEXT: str wzr, [x2] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %L2 +; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret %c = add nsw i32 %a, %b %d = icmp ne i32 %c, 0 br i1 %d, label %L1, label %L2 @@ -20,11 +28,17 @@ L2: ret void } -; CHECK-LABEL: test_add_cbz_multiple_use: -; CHECK: adds -; CHECK: b.eq -; CHECK: ret define void @test_add_cbz_multiple_use(i32 %a, i32 %b, i32* %ptr) { +; CHECK-LABEL: test_add_cbz_multiple_use: +; CHECK: // %bb.0: +; CHECK-NEXT: adds w8, w0, w1 +; CHECK-NEXT: b.eq .LBB1_2 +; CHECK-NEXT: // %bb.1: // %L1 +; CHECK-NEXT: str wzr, [x2] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %L2 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret %c = add nsw i32 %a, %b %d = icmp ne i32 %c, 0 br i1 %d, label %L1, label %L2 @@ -36,10 +50,18 @@ L2: ret void } -; CHECK-LABEL: test_add_cbz_64: -; CHECK: cmn x0, x1 -; CHECK: b.eq define void @test_add_cbz_64(i64 %a, i64 %b, i64* %ptr) { +; CHECK-LABEL: test_add_cbz_64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmn x0, x1 +; CHECK-NEXT: b.eq .LBB2_2 +; CHECK-NEXT: // %bb.1: // %L1 +; CHECK-NEXT: str xzr, [x2] +; CHECK-NEXT: ret +; 
CHECK-NEXT: .LBB2_2: // %L2 +; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: str x8, [x2] +; CHECK-NEXT: ret %c = add nsw i64 %a, %b %d = icmp ne i64 %c, 0 br i1 %d, label %L1, label %L2 @@ -51,10 +73,18 @@ L2: ret void } -; CHECK-LABEL: test_and_cbz: -; CHECK: tst w0, #0x6 -; CHECK: b.eq define void @test_and_cbz(i32 %a, i32* %ptr) { +; CHECK-LABEL: test_and_cbz: +; CHECK: // %bb.0: +; CHECK-NEXT: tst w0, #0x6 +; CHECK-NEXT: b.eq .LBB3_2 +; CHECK-NEXT: // %bb.1: // %L1 +; CHECK-NEXT: str wzr, [x1] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_2: // %L2 +; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret %c = and i32 %a, 6 %d = icmp ne i32 %c, 0 br i1 %d, label %L1, label %L2 @@ -66,10 +96,18 @@ L2: ret void } -; CHECK-LABEL: test_bic_cbnz: -; CHECK: bics wzr, w1, w0 -; CHECK: b.ne define void @test_bic_cbnz(i32 %a, i32 %b, i32* %ptr) { +; CHECK-LABEL: test_bic_cbnz: +; CHECK: // %bb.0: +; CHECK-NEXT: bics wzr, w1, w0 +; CHECK-NEXT: b.eq .LBB4_2 +; CHECK-NEXT: // %bb.1: // %L2 +; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_2: // %L1 +; CHECK-NEXT: str wzr, [x2] +; CHECK-NEXT: ret %c = and i32 %a, %b %d = icmp eq i32 %c, %b br i1 %d, label %L1, label %L2 @@ -81,11 +119,15 @@ L2: ret void } -; CHECK-LABEL: test_add_tbz: -; CHECK: adds -; CHECK: b.pl -; CHECK: ret define void @test_add_tbz(i32 %a, i32 %b, i32* %ptr) { +; CHECK-LABEL: test_add_tbz: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adds w8, w0, w1 +; CHECK-NEXT: b.pl .LBB5_2 +; CHECK-NEXT: // %bb.1: // %L1 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: .LBB5_2: // %L2 +; CHECK-NEXT: ret entry: %add = add nsw i32 %a, %b %cmp36 = icmp sge i32 %add, 0 @@ -97,11 +139,15 @@ L2: ret void } -; CHECK-LABEL: test_subs_tbz: -; CHECK: subs -; CHECK: b.pl -; CHECK: ret define void @test_subs_tbz(i32 %a, i32 %b, i32* %ptr) { +; CHECK-LABEL: test_subs_tbz: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: b.pl .LBB6_2 +; CHECK-NEXT: // %bb.1: // 
%L1 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: .LBB6_2: // %L2 +; CHECK-NEXT: ret entry: %sub = sub nsw i32 %a, %b %cmp36 = icmp sge i32 %sub, 0 @@ -113,11 +159,15 @@ L2: ret void } -; CHECK-LABEL: test_add_tbnz -; CHECK: adds -; CHECK: b.mi -; CHECK: ret define void @test_add_tbnz(i32 %a, i32 %b, i32* %ptr) { +; CHECK-LABEL: test_add_tbnz: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adds w8, w0, w1 +; CHECK-NEXT: b.mi .LBB7_2 +; CHECK-NEXT: // %bb.1: // %L1 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: .LBB7_2: // %L2 +; CHECK-NEXT: ret entry: %add = add nsw i32 %a, %b %cmp36 = icmp slt i32 %add, 0 @@ -129,11 +179,15 @@ L2: ret void } -; CHECK-LABEL: test_subs_tbnz -; CHECK: subs -; CHECK: b.mi -; CHECK: ret define void @test_subs_tbnz(i32 %a, i32 %b, i32* %ptr) { +; CHECK-LABEL: test_subs_tbnz: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs w8, w0, w1 +; CHECK-NEXT: b.mi .LBB8_2 +; CHECK-NEXT: // %bb.1: // %L1 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: .LBB8_2: // %L2 +; CHECK-NEXT: ret entry: %sub = sub nsw i32 %a, %b %cmp36 = icmp slt i32 %sub, 0 @@ -149,11 +203,22 @@ declare void @foo() declare void @bar(i32) ; Don't transform since the call will clobber the NZCV bits. -; CHECK-LABEL: test_call_clobber: -; CHECK: and w[[DST:[0-9]+]], w1, #0x6 -; CHECK: bl bar -; CHECK: cbnz w[[DST]] define void @test_call_clobber(i32 %unused, i32 %a) { +; CHECK-LABEL: test_call_clobber: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: and w19, w1, #0x6 +; CHECK-NEXT: mov w0, w19 +; CHECK-NEXT: bl bar +; CHECK-NEXT: cbnz w19, .LBB9_2 +; CHECK-NEXT: // %bb.1: // %if.end +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_2: // %if.then +; CHECK-NEXT: bl foo entry: %c = and i32 %a, 6 call void @bar(i32 %c) diff --git a/llvm/test/CodeGen/AArch64/fast-isel-cmp-branch.ll b/llvm/test/CodeGen/AArch64/fast-isel-cmp-branch.ll index ce47bc42453c8c..d1c762585a9e84 100644 --- a/llvm/test/CodeGen/AArch64/fast-isel-cmp-branch.ll +++ b/llvm/test/CodeGen/AArch64/fast-isel-cmp-branch.ll @@ -1,10 +1,18 @@ -; RUN: llc -aarch64-enable-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s -; RUN: llc -fast-isel -fast-isel-abort=1 -aarch64-enable-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -aarch64-enable-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s --check-prefixes=CHECK,NOFASTISEL +; RUN: llc -fast-isel -fast-isel-abort=1 -aarch64-enable-atomic-cfg-tidy=0 -mtriple=aarch64-apple-darwin < %s | FileCheck %s --check-prefixes=CHECK,FASTISEL define i32 @fcmp_oeq(float %x, float %y) { -; CHECK-LABEL: fcmp_oeq -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.ne {{LBB.+_2}} +; CHECK-LABEL: fcmp_oeq: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.ne LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB0_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp oeq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -14,9 +22,16 @@ bb1: } define i32 @fcmp_ogt(float %x, float %y) { -; CHECK-LABEL: fcmp_ogt -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.le {{LBB.+_2}} +; CHECK-LABEL: fcmp_ogt: +; CHECK: ; %bb.0: +; CHECK-NEXT: 
fcmp s0, s1 +; CHECK-NEXT: b.le LBB1_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB1_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp ogt float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -26,9 +41,16 @@ bb1: } define i32 @fcmp_oge(float %x, float %y) { -; CHECK-LABEL: fcmp_oge -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.lt {{LBB.+_2}} +; CHECK-LABEL: fcmp_oge: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.lt LBB2_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB2_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp oge float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -38,9 +60,16 @@ bb1: } define i32 @fcmp_olt(float %x, float %y) { -; CHECK-LABEL: fcmp_olt -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.pl {{LBB.+_2}} +; CHECK-LABEL: fcmp_olt: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.pl LBB3_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB3_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp olt float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -50,9 +79,16 @@ bb1: } define i32 @fcmp_ole(float %x, float %y) { -; CHECK-LABEL: fcmp_ole -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.hi {{LBB.+_2}} +; CHECK-LABEL: fcmp_ole: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.hi LBB4_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB4_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp ole float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -62,10 +98,30 @@ bb1: } define i32 @fcmp_one(float %x, float %y) { -; CHECK-LABEL: fcmp_one -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.mi -; CHECK-NEXT: b.gt +; NOFASTISEL-LABEL: fcmp_one: +; NOFASTISEL: ; %bb.0: +; NOFASTISEL-NEXT: fcmp s0, s1 +; NOFASTISEL-NEXT: b.mi LBB5_1 +; NOFASTISEL-NEXT: b.gt LBB5_1 +; NOFASTISEL-NEXT: b LBB5_2 +; 
NOFASTISEL-NEXT: LBB5_1: ; %bb1 +; NOFASTISEL-NEXT: mov w0, wzr +; NOFASTISEL-NEXT: ret +; NOFASTISEL-NEXT: LBB5_2: ; %bb2 +; NOFASTISEL-NEXT: mov w0, #1 +; NOFASTISEL-NEXT: ret +; +; FASTISEL-LABEL: fcmp_one: +; FASTISEL: ; %bb.0: +; FASTISEL-NEXT: fcmp s0, s1 +; FASTISEL-NEXT: b.mi LBB5_2 +; FASTISEL-NEXT: b.gt LBB5_2 +; FASTISEL-NEXT: ; %bb.1: ; %bb2 +; FASTISEL-NEXT: mov w0, #1 +; FASTISEL-NEXT: ret +; FASTISEL-NEXT: LBB5_2: ; %bb1 +; FASTISEL-NEXT: mov w0, wzr +; FASTISEL-NEXT: ret %1 = fcmp one float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -75,9 +131,16 @@ bb1: } define i32 @fcmp_ord(float %x, float %y) { -; CHECK-LABEL: fcmp_ord -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.vs {{LBB.+_2}} +; CHECK-LABEL: fcmp_ord: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.vs LBB6_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB6_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp ord float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -87,9 +150,16 @@ bb1: } define i32 @fcmp_uno(float %x, float %y) { -; CHECK-LABEL: fcmp_uno -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.vs {{LBB.+_2}} +; CHECK-LABEL: fcmp_uno: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.vs LBB7_2 +; CHECK-NEXT: ; %bb.1: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret +; CHECK-NEXT: LBB7_2: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret %1 = fcmp uno float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -99,10 +169,30 @@ bb1: } define i32 @fcmp_ueq(float %x, float %y) { -; CHECK-LABEL: fcmp_ueq -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.eq {{LBB.+_2}} -; CHECK-NEXT: b.vs {{LBB.+_2}} +; NOFASTISEL-LABEL: fcmp_ueq: +; NOFASTISEL: ; %bb.0: +; NOFASTISEL-NEXT: fcmp s0, s1 +; NOFASTISEL-NEXT: b.eq LBB8_2 +; NOFASTISEL-NEXT: b.vs LBB8_2 +; NOFASTISEL-NEXT: b LBB8_1 +; NOFASTISEL-NEXT: LBB8_1: ; %bb2 +; NOFASTISEL-NEXT: mov w0, #1 +; NOFASTISEL-NEXT: ret +; NOFASTISEL-NEXT: LBB8_2: ; %bb1 +; NOFASTISEL-NEXT: mov 
w0, wzr +; NOFASTISEL-NEXT: ret +; +; FASTISEL-LABEL: fcmp_ueq: +; FASTISEL: ; %bb.0: +; FASTISEL-NEXT: fcmp s0, s1 +; FASTISEL-NEXT: b.eq LBB8_2 +; FASTISEL-NEXT: b.vs LBB8_2 +; FASTISEL-NEXT: ; %bb.1: ; %bb2 +; FASTISEL-NEXT: mov w0, #1 +; FASTISEL-NEXT: ret +; FASTISEL-NEXT: LBB8_2: ; %bb1 +; FASTISEL-NEXT: mov w0, wzr +; FASTISEL-NEXT: ret %1 = fcmp ueq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -112,9 +202,16 @@ bb1: } define i32 @fcmp_ugt(float %x, float %y) { -; CHECK-LABEL: fcmp_ugt -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.ls {{LBB.+_2}} +; CHECK-LABEL: fcmp_ugt: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.ls LBB9_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB9_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp ugt float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -124,9 +221,16 @@ bb1: } define i32 @fcmp_uge(float %x, float %y) { -; CHECK-LABEL: fcmp_uge -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.mi {{LBB.+_2}} +; CHECK-LABEL: fcmp_uge: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.mi LBB10_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB10_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp uge float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -136,9 +240,16 @@ bb1: } define i32 @fcmp_ult(float %x, float %y) { -; CHECK-LABEL: fcmp_ult -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.ge {{LBB.+_2}} +; CHECK-LABEL: fcmp_ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.ge LBB11_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB11_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp ult float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -148,9 +259,16 @@ bb1: } define i32 @fcmp_ule(float %x, float %y) { -; CHECK-LABEL: fcmp_ule -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.gt {{LBB.+_2}} +; CHECK-LABEL: fcmp_ule: +; CHECK: 
; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.gt LBB12_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB12_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp ule float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -160,9 +278,16 @@ bb1: } define i32 @fcmp_une(float %x, float %y) { -; CHECK-LABEL: fcmp_une -; CHECK: fcmp s0, s1 -; CHECK-NEXT: b.eq {{LBB.+_2}} +; CHECK-LABEL: fcmp_une: +; CHECK: ; %bb.0: +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: b.eq LBB13_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB13_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = fcmp une float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -172,9 +297,16 @@ bb1: } define i32 @icmp_eq(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_eq -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.ne {{LBB.+_2}} +; CHECK-LABEL: icmp_eq: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.eq LBB14_2 +; CHECK-NEXT: ; %bb.1: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret +; CHECK-NEXT: LBB14_2: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret %1 = icmp eq i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -184,9 +316,16 @@ bb1: } define i32 @icmp_ne(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ne -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.eq {{LBB.+_2}} +; CHECK-LABEL: icmp_ne: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.eq LBB15_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB15_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = icmp ne i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -196,9 +335,16 @@ bb1: } define i32 @icmp_ugt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ugt -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.ls {{LBB.+_2}} +; CHECK-LABEL: icmp_ugt: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.ls LBB16_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: 
LBB16_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = icmp ugt i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -208,9 +354,16 @@ bb1: } define i32 @icmp_uge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_uge -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.lo {{LBB.+_2}} +; CHECK-LABEL: icmp_uge: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.hs LBB17_2 +; CHECK-NEXT: ; %bb.1: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret +; CHECK-NEXT: LBB17_2: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret %1 = icmp uge i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -220,9 +373,16 @@ bb1: } define i32 @icmp_ult(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ult -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.hs {{LBB.+_2}} +; CHECK-LABEL: icmp_ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.hs LBB18_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB18_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = icmp ult i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -232,9 +392,16 @@ bb1: } define i32 @icmp_ule(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ule -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.hi {{LBB.+_2}} +; CHECK-LABEL: icmp_ule: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.ls LBB19_2 +; CHECK-NEXT: ; %bb.1: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret +; CHECK-NEXT: LBB19_2: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret %1 = icmp ule i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -244,9 +411,16 @@ bb1: } define i32 @icmp_sgt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sgt -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.le {{LBB.+_2}} +; CHECK-LABEL: icmp_sgt: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.le LBB20_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB20_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = icmp sgt i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -256,9 +430,16 @@ bb1: } define 
i32 @icmp_sge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sge -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.lt {{LBB.+_2}} +; CHECK-LABEL: icmp_sge: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.ge LBB21_2 +; CHECK-NEXT: ; %bb.1: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret +; CHECK-NEXT: LBB21_2: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret %1 = icmp sge i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -268,9 +449,16 @@ bb1: } define i32 @icmp_slt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_slt -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.ge {{LBB.+_2}} +; CHECK-LABEL: icmp_slt: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.ge LBB22_2 +; CHECK-NEXT: ; %bb.1: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: LBB22_2: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret %1 = icmp slt i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -280,9 +468,16 @@ bb1: } define i32 @icmp_sle(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sle -; CHECK: cmp w0, w1 -; CHECK-NEXT: b.gt {{LBB.+_2}} +; CHECK-LABEL: icmp_sle: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: b.le LBB23_2 +; CHECK-NEXT: ; %bb.1: ; %bb2 +; CHECK-NEXT: mov w0, #1 +; CHECK-NEXT: ret +; CHECK-NEXT: LBB23_2: ; %bb1 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret %1 = icmp sle i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: diff --git a/llvm/test/CodeGen/ARM/2011-12-14-machine-sink.ll b/llvm/test/CodeGen/ARM/2011-12-14-machine-sink.ll index 88019f450e3663..5716d7dbc40574 100644 --- a/llvm/test/CodeGen/ARM/2011-12-14-machine-sink.ll +++ b/llvm/test/CodeGen/ARM/2011-12-14-machine-sink.ll @@ -1,9 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; REQUIRES: asserts ; RUN: llc < %s -o /dev/null -stats 2>&1 | FileCheck %s -check-prefix=STATS ; Radar 10266272 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = 
"thumbv7-apple-ios4.0.0" -; STATS-NOT: machine-sink + +; STATS: 1 machine-cse - Number of common subexpression eliminated +; STATS: 1 machine-sink - Number of critical edges split +; STATS: 1 machine-sink - Number of machine instructions sunk + define i32 @foo(i32 %h, i32 %arg1) nounwind readonly ssp { entry: diff --git a/llvm/test/CodeGen/ARM/cmpxchg-weak.ll b/llvm/test/CodeGen/ARM/cmpxchg-weak.ll index 5ee07828526c56..17f2ed74ecc05b 100644 --- a/llvm/test/CodeGen/ARM/cmpxchg-weak.ll +++ b/llvm/test/CodeGen/ARM/cmpxchg-weak.ll @@ -5,27 +5,27 @@ define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) { %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic %oldval = extractvalue { i32, i1 } %pair, 0 -; CHECK-NEXT: %bb.0: -; CHECK-NEXT: ldrex [[LOADED:r[0-9]+]], [r0] -; CHECK-NEXT: cmp [[LOADED]], r1 -; CHECK-NEXT: bne [[LDFAILBB:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: %bb.1: -; CHECK-NEXT: dmb ish -; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r2, [r0] -; CHECK-NEXT: cmp [[SUCCESS]], #0 -; CHECK-NEXT: beq [[SUCCESSBB:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: %bb.2: -; CHECK-NEXT: str r3, [r0] -; CHECK-NEXT: bx lr -; CHECK-NEXT: [[LDFAILBB]]: +; CHECK-NEXT: @ %bb.0: @ %cmpxchg.start +; CHECK-NEXT: ldrex r3, [r0] +; CHECK-NEXT: cmp r3, r1 +; CHECK-NEXT: beq LBB0_2 +; CHECK-NEXT: @ %bb.1: @ %cmpxchg.nostore ; CHECK-NEXT: clrex +; CHECK-NEXT: b LBB0_3 +; CHECK-NEXT: LBB0_2: @ %cmpxchg.fencedstore +; CHECK-NEXT: dmb ish +; CHECK-NEXT: strex r1, r2, [r0] +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: beq LBB0_4 +; CHECK-NEXT: LBB0_3: @ %cmpxchg.end ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr -; CHECK-NEXT: [[SUCCESSBB]]: +; CHECK-NEXT: LBB0_4: @ %cmpxchg.success ; CHECK-NEXT: dmb ish ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: bx lr + store i32 %oldval, i32* %addr ret void } @@ -37,23 +37,24 @@ define i1 @test_cmpxchg_weak_to_bool(i32, i32 *%addr, i32 %desired, i32 %new) { %pair = cmpxchg weak i32* %addr, i32 %desired, i32 %new seq_cst monotonic %success = 
extractvalue { i32, i1 } %pair, 1 -; CHECK-NEXT: %bb.0: -; CHECK-NEXT: ldrex [[LOADED:r[0-9]+]], [r1] -; CHECK-NEXT: cmp [[LOADED]], r2 -; CHECK-NEXT: bne [[LDFAILBB:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: %bb.1: -; CHECK-NEXT: dmb ish +; CHECK-NEXT: @ %bb.0: @ %cmpxchg.start +; CHECK-NEXT: ldrex r0, [r1] +; CHECK-NEXT: cmp r0, r2 +; CHECK-NEXT: beq LBB1_2 +; CHECK-NEXT: @ %bb.1: @ %cmpxchg.nostore ; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r3, [r1] -; CHECK-NEXT: cmp [[SUCCESS]], #0 +; CHECK-NEXT: clrex +; CHECK-NEXT: bx lr +; CHECK-NEXT: LBB1_2: @ %cmpxchg.fencedstore +; CHECK-NEXT: dmb ish +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: strex r2, r3, [r1] +; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: bxne lr ; CHECK-NEXT: mov r0, #1 ; CHECK-NEXT: dmb ish ; CHECK-NEXT: bx lr -; CHECK-NEXT: [[LDFAILBB]]: -; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: clrex -; CHECK-NEXT: bx lr + ret i1 %success } diff --git a/llvm/test/CodeGen/ARM/lsr-unfolded-offset.ll b/llvm/test/CodeGen/ARM/lsr-unfolded-offset.ll index a9a353cad57586..8656c858be5a08 100644 --- a/llvm/test/CodeGen/ARM/lsr-unfolded-offset.ll +++ b/llvm/test/CodeGen/ARM/lsr-unfolded-offset.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -regalloc=greedy -arm-atomic-cfg-tidy=0 < %s | FileCheck %s ; LSR shouldn't introduce more induction variables than needed, increasing @@ -7,8 +8,11 @@ ; CHECK: sub sp, #{{40|36|32|28|24}} ; CHECK: %for.inc -; CHECK-NOT: ldr -; CHECK: add +; CHECK: adds r6, #1 +; CHECK: adds r4, #24 +; CHECK: cmp r1, r6 +; CHECK: bne LBB0_3 + target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" target triple = "thumbv7-apple-ios" diff --git a/llvm/test/CodeGen/ARM/machine-cse-cmp.ll b/llvm/test/CodeGen/ARM/machine-cse-cmp.ll index 49dbb03135f5a4..e079f9fd965768 100644 --- a/llvm/test/CodeGen/ARM/machine-cse-cmp.ll +++ 
b/llvm/test/CodeGen/ARM/machine-cse-cmp.ll @@ -52,7 +52,7 @@ entry: ; CHECK-LABEL: f3: ; CHECK-NOT: sub ; CHECK: cmp -; CHECK: blt +; CHECK: bge %0 = load i32, i32* %offset, align 4 %cmp = icmp slt i32 %0, %size %s = sub nsw i32 %0, %size diff --git a/llvm/test/CodeGen/Hexagon/newvaluejump2.ll b/llvm/test/CodeGen/Hexagon/newvaluejump2.ll index 99c9d1a60af7c4..f03c7b40703988 100644 --- a/llvm/test/CodeGen/Hexagon/newvaluejump2.ll +++ b/llvm/test/CodeGen/Hexagon/newvaluejump2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hexagon-misched < %s \ ; RUN: | FileCheck %s ; Check that we generate new value jump, both registers, with one @@ -5,8 +6,35 @@ @Reg = common global i32 0, align 4 define i32 @main() nounwind { +; CHECK-LABEL: main: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: { +; CHECK-NEXT: r1 = memw(gp+#Reg) +; CHECK-NEXT: allocframe(r29,#8):raw +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = memw(r29+#4) +; CHECK-NEXT: if (!cmp.gt(r0.new,r1)) jump:nt .LBB0_1 +; CHECK-NEXT: } +; CHECK-NEXT: // %bb.2: // %if.else +; CHECK-NEXT: { +; CHECK-NEXT: call baz +; CHECK-NEXT: r1:0 = combine(#20,#10) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #0 +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } +; CHECK-NEXT: .LBB0_1: // %if.then +; CHECK-NEXT: { +; CHECK-NEXT: call bar +; CHECK-NEXT: r1:0 = combine(#2,#1) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #0 +; CHECK-NEXT: dealloc_return +; CHECK-NEXT: } entry: -; CHECK: if (cmp.gt(r{{[0-9]+}}.new,r{{[0-9]+}})) jump:{{[t|nt]}} .LBB{{[0-9]+}}_{{[0-9]+}} %Reg2 = alloca i32, align 4 %0 = load i32, i32* %Reg2, align 4 %1 = load i32, i32* @Reg, align 4 diff --git a/llvm/test/CodeGen/Mips/brcongt.ll b/llvm/test/CodeGen/Mips/brcongt.ll index 7dffdb4112118b..223245bc622de0 100644 --- a/llvm/test/CodeGen/Mips/brcongt.ll +++ b/llvm/test/CodeGen/Mips/brcongt.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 @i = global i32 5, align 4 @@ -6,14 +7,32 @@ @result = global i32 0, align 4 define void @test() nounwind { +; 16-LABEL: test: +; 16: # %bb.0: # %entry +; 16-NEXT: lui $2, %hi(_gp_disp) +; 16-NEXT: addiu $2, $2, %lo(_gp_disp) +; 16-NEXT: li $2, %hi(_gp_disp) +; 16-NEXT: addiu $3, $pc, %lo(_gp_disp) +; 16-NEXT: sll $2, $2, 16 +; 16-NEXT: addu $2, $3, $2 +; 16-NEXT: lw $3, %got(i)($2) +; 16-NEXT: lw $4, %got(j)($2) +; 16-NEXT: lw $3, 0($3) +; 16-NEXT: lw $4, 0($4) +; 16-NEXT: slt $4, $3 +; 16-NEXT: bteqz $BB0_2 # 16 bit inst +; 16-NEXT: # %bb.1: # %if.end +; 16-NEXT: jrc $ra +; 16-NEXT: $BB0_2: # %if.then +; 16-NEXT: lw $2, %got(result)($2) +; 16-NEXT: li $3, 1 +; 16-NEXT: sw $3, 0($2) +; 16-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4 %1 = load i32, i32* @j, align 4 %cmp = icmp sgt i32 %0, %1 br i1 %cmp, label %if.end, label %if.then -; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] -; 16: $[[LABEL]]: if.then: ; preds = %entry store i32 1, i32* @result, align 4 br label %if.end diff --git a/llvm/test/CodeGen/Mips/brconlt.ll b/llvm/test/CodeGen/Mips/brconlt.ll index 65f6c347b6710d..3b4ea80d020558 100644 --- a/llvm/test/CodeGen/Mips/brconlt.ll +++ b/llvm/test/CodeGen/Mips/brconlt.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 ; RUN: llc -march=mips -mattr=micromips -mcpu=mips32r6 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=MM32R6 @@ -7,16 +8,52 @@ @result = global i32 0, align 4 define void @test() nounwind { +; 16-LABEL: test: +; 16: # %bb.0: # %entry +; 16-NEXT: lui $2, %hi(_gp_disp) +; 16-NEXT: addiu $2, $2, %lo(_gp_disp) +; 16-NEXT: li $2, %hi(_gp_disp) +; 16-NEXT: addiu $3, $pc, %lo(_gp_disp) +; 16-NEXT: sll $2, $2, 
16 +; 16-NEXT: addu $2, $3, $2 +; 16-NEXT: lw $3, %got(i)($2) +; 16-NEXT: lw $4, %got(j)($2) +; 16-NEXT: lw $3, 0($3) +; 16-NEXT: lw $4, 0($4) +; 16-NEXT: slt $4, $3 +; 16-NEXT: bteqz $BB0_2 # 16 bit inst +; 16-NEXT: # %bb.1: # %if.end +; 16-NEXT: jrc $ra +; 16-NEXT: $BB0_2: # %if.then +; 16-NEXT: lw $2, %got(result)($2) +; 16-NEXT: li $3, 1 +; 16-NEXT: sw $3, 0($2) +; 16-NEXT: jrc $ra +; +; MM32R6-LABEL: test: +; MM32R6: # %bb.0: # %entry +; MM32R6-NEXT: lui $2, %hi(_gp_disp) +; MM32R6-NEXT: addiu $2, $2, %lo(_gp_disp) +; MM32R6-NEXT: addu $2, $2, $25 +; MM32R6-NEXT: lw $3, %got(i)($2) +; MM32R6-NEXT: lw $4, %got(j)($2) +; MM32R6-NEXT: lw16 $3, 0($3) +; MM32R6-NEXT: lw16 $4, 0($4) +; MM32R6-NEXT: slt $1, $4, $3 +; MM32R6-NEXT: beqzc $1, $BB0_2 +; MM32R6-NEXT: # %bb.1: # %if.end +; MM32R6-NEXT: jrc $ra +; MM32R6-NEXT: $BB0_2: # %if.then +; MM32R6-NEXT: lw $2, %got(result)($2) +; MM32R6-NEXT: li16 $3, 1 +; MM32R6-NEXT: sw16 $3, 0($2) +; MM32R6-NEXT: jrc $ra entry: %0 = load i32, i32* @j, align 4 %1 = load i32, i32* @i, align 4 %cmp = icmp slt i32 %0, %1 br i1 %cmp, label %if.end, label %if.then -; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; MM32R6: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] -; 16: $[[LABEL]]: if.then: ; preds = %entry store i32 1, i32* @result, align 4 diff --git a/llvm/test/CodeGen/Mips/brconne.ll b/llvm/test/CodeGen/Mips/brconne.ll index e0cbe378fe3c6a..da11e842ada77c 100644 --- a/llvm/test/CodeGen/Mips/brconne.ll +++ b/llvm/test/CodeGen/Mips/brconne.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 @i = global i32 5, align 4 @@ -5,15 +6,32 @@ @result = global i32 0, align 4 define void @test() nounwind { +; 16-LABEL: test: +; 16: # %bb.0: # %entry +; 16-NEXT: lui $2, %hi(_gp_disp) +; 16-NEXT: addiu $2, $2, %lo(_gp_disp) +; 16-NEXT: li $2, %hi(_gp_disp) +; 16-NEXT: addiu $3, $pc, 
%lo(_gp_disp) +; 16-NEXT: sll $2, $2, 16 +; 16-NEXT: addu $2, $3, $2 +; 16-NEXT: lw $3, %got(i)($2) +; 16-NEXT: lw $4, %got(j)($2) +; 16-NEXT: lw $3, 0($3) +; 16-NEXT: lw $4, 0($4) +; 16-NEXT: cmp $4, $3 +; 16-NEXT: bteqz $BB0_2 # 16 bit inst +; 16-NEXT: # %bb.1: # %if.end +; 16-NEXT: jrc $ra +; 16-NEXT: $BB0_2: # %if.then +; 16-NEXT: lw $2, %got(result)($2) +; 16-NEXT: li $3, 1 +; 16-NEXT: sw $3, 0($2) +; 16-NEXT: jrc $ra entry: %0 = load i32, i32* @j, align 4 %1 = load i32, i32* @i, align 4 %cmp = icmp eq i32 %0, %1 br i1 %cmp, label %if.then, label %if.end -; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]] -; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}}) -; 16: $[[LABEL]]: if.then: ; preds = %entry store i32 1, i32* @result, align 4 diff --git a/llvm/test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll b/llvm/test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll index 5e607c2639f3b4..27194ef77f7c24 100644 --- a/llvm/test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll +++ b/llvm/test/CodeGen/Mips/compactbranches/no-beqzc-bnezc.ll @@ -1,16 +1,57 @@ -; RUN: llc -march=mipsel -mcpu=mips32r6 -disable-mips-delay-filler < %s | FileCheck %s -; RUN: llc -march=mips -mcpu=mips32r6 -disable-mips-delay-filler < %s -filetype=obj \ -; RUN: -o - | llvm-objdump -d - | FileCheck %s --check-prefix=ENCODING -; RUN: llc -march=mipsel -mcpu=mips64r6 -disable-mips-delay-filler -target-abi=n64 < %s | FileCheck %s -; RUN: llc -march=mips -mcpu=mips64r6 -disable-mips-delay-filler -target-abi=n64 < %s -filetype=obj \ -; RUN: -o - | llvm-objdump -d - | FileCheck %s --check-prefix=ENCODING +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=mipsel -mcpu=mips32r6 -disable-mips-delay-filler < %s | FileCheck %s --check-prefixes=ENCODING,MIPSEL32 +; RUN: llc -march=mips -mcpu=mips32r6 -disable-mips-delay-filler < %s | FileCheck %s --check-prefixes=ENCODING,MIPS32 +; RUN: llc -march=mipsel -mcpu=mips64r6 
-disable-mips-delay-filler -target-abi=n64 < %s | FileCheck %s --check-prefixes=ENCODING,MIPSEL64 +; RUN: llc -march=mips -mcpu=mips64r6 -disable-mips-delay-filler -target-abi=n64 < %s | FileCheck %s --check-prefixes=ENCODING,MIPS64 ; bnezc and beqzc have restriction that $rt != 0 define i32 @f() { ; CHECK-LABEL: f: ; CHECK-NOT: bnezc $0 - +; MIPSEL32-LABEL: f: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: bnez $zero, $BB0_2 +; MIPSEL32-NEXT: nop +; MIPSEL32-NEXT: # %bb.1: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB0_2: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; +; MIPS32-LABEL: f: +; MIPS32: # %bb.0: +; MIPS32-NEXT: bnez $zero, $BB0_2 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 1 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB0_2: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: bnez $zero, .LBB0_2 +; MIPSEL64-NEXT: nop +; MIPSEL64-NEXT: # %bb.1: # %if.then +; MIPSEL64-NEXT: addiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB0_2: # %if.end +; MIPSEL64-NEXT: addiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f: +; MIPS64: # %bb.0: +; MIPS64-NEXT: bnez $zero, .LBB0_2 +; MIPS64-NEXT: nop +; MIPS64-NEXT: # %bb.1: # %if.then +; MIPS64-NEXT: addiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB0_2: # %if.end +; MIPS64-NEXT: addiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i32 1, 1 br i1 %cmp, label %if.then, label %if.end @@ -24,7 +65,49 @@ define i32 @f() { define i32 @f1() { ; CHECK-LABEL: f1: ; CHECK-NOT: beqzc $0 - +; MIPSEL32-LABEL: f1: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: b $BB1_2 +; MIPSEL32-NEXT: nop +; MIPSEL32-NEXT: # %bb.1: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB1_2: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: jrc $ra +; +; 
MIPS32-LABEL: f1: +; MIPS32: # %bb.0: +; MIPS32-NEXT: b $BB1_2 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB1_2: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 1 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f1: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: b .LBB1_2 +; MIPSEL64-NEXT: nop +; MIPSEL64-NEXT: # %bb.1: # %if.end +; MIPSEL64-NEXT: addiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB1_2: # %if.then +; MIPSEL64-NEXT: addiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f1: +; MIPS64: # %bb.0: +; MIPS64-NEXT: b .LBB1_2 +; MIPS64-NEXT: nop +; MIPS64-NEXT: # %bb.1: # %if.end +; MIPS64-NEXT: addiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB1_2: # %if.then +; MIPS64-NEXT: addiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i32 0, 0 br i1 %cmp, label %if.then, label %if.end @@ -40,9 +123,49 @@ define i32 @f1() { ; beqc and bnec have the restriction that $rs < $rt. 
define i32 @f2(i32 %a, i32 %b) { -; ENCODING-LABEL: : -; ENCODING-NOT: beqc $5, $4 -; ENCODING-NOT: bnec $5, $4 +; MIPSEL32-LABEL: f2: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: beqc $5, $4, $BB2_2 +; MIPSEL32-NEXT: # %bb.1: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB2_2: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: jrc $ra +; +; MIPS32-LABEL: f2: +; MIPS32: # %bb.0: +; MIPS32-NEXT: beqc $5, $4, $BB2_2 +; MIPS32-NEXT: # %bb.1: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB2_2: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 1 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f2: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: sll $1, $4, 0 +; MIPSEL64-NEXT: sll $2, $5, 0 +; MIPSEL64-NEXT: beqc $2, $1, .LBB2_2 +; MIPSEL64-NEXT: # %bb.1: # %if.end +; MIPSEL64-NEXT: addiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB2_2: # %if.then +; MIPSEL64-NEXT: addiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f2: +; MIPS64: # %bb.0: +; MIPS64-NEXT: sll $1, $4, 0 +; MIPS64-NEXT: sll $2, $5, 0 +; MIPS64-NEXT: beqc $2, $1, .LBB2_2 +; MIPS64-NEXT: # %bb.1: # %if.end +; MIPS64-NEXT: addiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB2_2: # %if.then +; MIPS64-NEXT: addiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i32 %b, %a br i1 %cmp, label %if.then, label %if.end @@ -57,7 +180,53 @@ define i32 @f2(i32 %a, i32 %b) { define i64 @f3() { ; CHECK-LABEL: f3: ; CHECK-NOT: bnezc $0 - +; MIPSEL32-LABEL: f3: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: bnez $zero, $BB3_2 +; MIPSEL32-NEXT: nop +; MIPSEL32-NEXT: # %bb.1: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: addiu $3, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB3_2: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: addiu $3, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; +; MIPS32-LABEL: f3: +; MIPS32: # %bb.0: +; MIPS32-NEXT: bnez $zero, $BB3_2 +; 
MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: addiu $3, $zero, 1 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB3_2: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: addiu $3, $zero, 0 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f3: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: bnez $zero, .LBB3_2 +; MIPSEL64-NEXT: nop +; MIPSEL64-NEXT: # %bb.1: # %if.then +; MIPSEL64-NEXT: daddiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB3_2: # %if.end +; MIPSEL64-NEXT: daddiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f3: +; MIPS64: # %bb.0: +; MIPS64-NEXT: bnez $zero, .LBB3_2 +; MIPS64-NEXT: nop +; MIPS64-NEXT: # %bb.1: # %if.then +; MIPS64-NEXT: daddiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB3_2: # %if.end +; MIPS64-NEXT: daddiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i64 1, 1 br i1 %cmp, label %if.then, label %if.end @@ -71,7 +240,53 @@ define i64 @f3() { define i64 @f4() { ; CHECK-LABEL: f4: ; CHECK-NOT: beqzc $0 - +; MIPSEL32-LABEL: f4: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: b $BB4_2 +; MIPSEL32-NEXT: nop +; MIPSEL32-NEXT: # %bb.1: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: addiu $3, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB4_2: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: addiu $3, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; +; MIPS32-LABEL: f4: +; MIPS32: # %bb.0: +; MIPS32-NEXT: b $BB4_2 +; MIPS32-NEXT: nop +; MIPS32-NEXT: # %bb.1: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: addiu $3, $zero, 0 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB4_2: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: addiu $3, $zero, 1 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f4: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: b .LBB4_2 +; MIPSEL64-NEXT: nop +; MIPSEL64-NEXT: # %bb.1: # %if.end +; MIPSEL64-NEXT: daddiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB4_2: # %if.then +; 
MIPSEL64-NEXT: daddiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f4: +; MIPS64: # %bb.0: +; MIPS64-NEXT: b .LBB4_2 +; MIPS64-NEXT: nop +; MIPS64-NEXT: # %bb.1: # %if.end +; MIPS64-NEXT: daddiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB4_2: # %if.then +; MIPS64-NEXT: daddiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i64 0, 0 br i1 %cmp, label %if.then, label %if.end @@ -87,9 +302,55 @@ define i64 @f4() { ; beqc and bnec have the restriction that $rs < $rt. define i64 @f5(i64 %a, i64 %b) { -; ENCODING-LABEL: : -; ENCODING-NOT: beqc $5, $4 -; ENCODING-NOT: bnec $5, $4 +; MIPSEL32-LABEL: f5: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: xor $1, $7, $5 +; MIPSEL32-NEXT: xor $2, $6, $4 +; MIPSEL32-NEXT: or $1, $2, $1 +; MIPSEL32-NEXT: beqzc $1, $BB5_2 +; MIPSEL32-NEXT: # %bb.1: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: addiu $3, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB5_2: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: addiu $3, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; +; MIPS32-LABEL: f5: +; MIPS32: # %bb.0: +; MIPS32-NEXT: xor $1, $6, $4 +; MIPS32-NEXT: xor $2, $7, $5 +; MIPS32-NEXT: or $1, $2, $1 +; MIPS32-NEXT: beqzc $1, $BB5_2 +; MIPS32-NEXT: # %bb.1: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: addiu $3, $zero, 0 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB5_2: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: addiu $3, $zero, 1 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f5: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: beqc $5, $4, .LBB5_2 +; MIPSEL64-NEXT: # %bb.1: # %if.end +; MIPSEL64-NEXT: daddiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB5_2: # %if.then +; MIPSEL64-NEXT: daddiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f5: +; MIPS64: # %bb.0: +; MIPS64-NEXT: beqc $5, $4, .LBB5_2 +; MIPS64-NEXT: # %bb.1: # %if.end +; MIPS64-NEXT: daddiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB5_2: # %if.then +; 
MIPS64-NEXT: daddiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i64 %b, %a br i1 %cmp, label %if.then, label %if.end @@ -104,7 +365,47 @@ define i64 @f5(i64 %a, i64 %b) { define i32 @f6(i32 %a) { ; CHECK-LABEL: f6: ; CHECK: beqzc ${{[0-9]+}}, {{((\$)|(\.L))}}BB - +; MIPSEL32-LABEL: f6: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: beqzc $4, $BB6_2 +; MIPSEL32-NEXT: # %bb.1: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB6_2: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: jrc $ra +; +; MIPS32-LABEL: f6: +; MIPS32: # %bb.0: +; MIPS32-NEXT: beqzc $4, $BB6_2 +; MIPS32-NEXT: # %bb.1: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB6_2: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 1 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f6: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: sll $1, $4, 0 +; MIPSEL64-NEXT: beqzc $1, .LBB6_2 +; MIPSEL64-NEXT: # %bb.1: # %if.end +; MIPSEL64-NEXT: addiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB6_2: # %if.then +; MIPSEL64-NEXT: addiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f6: +; MIPS64: # %bb.0: +; MIPS64-NEXT: sll $1, $4, 0 +; MIPS64-NEXT: beqzc $1, .LBB6_2 +; MIPS64-NEXT: # %bb.1: # %if.end +; MIPS64-NEXT: addiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB6_2: # %if.then +; MIPS64-NEXT: addiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.end @@ -118,7 +419,47 @@ define i32 @f6(i32 %a) { define i32 @f7(i32 %a) { ; CHECK-LABEL: f7: ; CHECK: bnezc ${{[0-9]+}}, {{((\$)|(\.L))}}BB - +; MIPSEL32-LABEL: f7: +; MIPSEL32: # %bb.0: +; MIPSEL32-NEXT: beqzc $4, $BB7_2 +; MIPSEL32-NEXT: # %bb.1: # %if.end +; MIPSEL32-NEXT: addiu $2, $zero, 0 +; MIPSEL32-NEXT: jrc $ra +; MIPSEL32-NEXT: $BB7_2: # %if.then +; MIPSEL32-NEXT: addiu $2, $zero, 1 +; MIPSEL32-NEXT: jrc $ra +; +; MIPS32-LABEL: f7: +; MIPS32: # %bb.0: +; MIPS32-NEXT: beqzc $4, $BB7_2 +; 
MIPS32-NEXT: # %bb.1: # %if.end +; MIPS32-NEXT: addiu $2, $zero, 0 +; MIPS32-NEXT: jrc $ra +; MIPS32-NEXT: $BB7_2: # %if.then +; MIPS32-NEXT: addiu $2, $zero, 1 +; MIPS32-NEXT: jrc $ra +; +; MIPSEL64-LABEL: f7: +; MIPSEL64: # %bb.0: +; MIPSEL64-NEXT: sll $1, $4, 0 +; MIPSEL64-NEXT: beqzc $1, .LBB7_2 +; MIPSEL64-NEXT: # %bb.1: # %if.end +; MIPSEL64-NEXT: addiu $2, $zero, 0 +; MIPSEL64-NEXT: jrc $ra +; MIPSEL64-NEXT: .LBB7_2: # %if.then +; MIPSEL64-NEXT: addiu $2, $zero, 1 +; MIPSEL64-NEXT: jrc $ra +; +; MIPS64-LABEL: f7: +; MIPS64: # %bb.0: +; MIPS64-NEXT: sll $1, $4, 0 +; MIPS64-NEXT: beqzc $1, .LBB7_2 +; MIPS64-NEXT: # %bb.1: # %if.end +; MIPS64-NEXT: addiu $2, $zero, 0 +; MIPS64-NEXT: jrc $ra +; MIPS64-NEXT: .LBB7_2: # %if.then +; MIPS64-NEXT: addiu $2, $zero, 1 +; MIPS64-NEXT: jrc $ra %cmp = icmp eq i32 0, %a br i1 %cmp, label %if.then, label %if.end diff --git a/llvm/test/CodeGen/Mips/lcb2.ll b/llvm/test/CodeGen/Mips/lcb2.ll index 4987c606e3300d..6a0be713c47f87 100644 --- a/llvm/test/CodeGen/Mips/lcb2.ll +++ b/llvm/test/CodeGen/Mips/lcb2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 -mattr=+soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands=true < %s | FileCheck %s -check-prefix=lcb ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 -mattr=+soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands=true < %s | FileCheck %s -check-prefix=lcbn @@ -8,6 +9,51 @@ ; Function Attrs: nounwind optsize define i32 @bnez() #0 { +; lcb-LABEL: bnez: +; lcb: # %bb.0: # %entry +; lcb-NEXT: li $2, %hi(i) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: lw $3, %lo(i)($2) +; lcb-NEXT: bnez $3, $BB0_2 +; lcb-NEXT: # %bb.1: # %if.then +; lcb-NEXT: li $3, 0 +; lcb-NEXT: #APP +; lcb-NEXT: .set push +; lcb-NEXT: .set at +; lcb-NEXT: .set macro +; lcb-NEXT: .set reorder +; lcb-EMPTY: +; lcb-NEXT: .space 10000 +; 
lcb-EMPTY: +; lcb-NEXT: .set pop +; lcb-NEXT: #NO_APP +; lcb-NEXT: sw $3, %lo(i)($2) +; lcb-NEXT: $BB0_2: # %if.end +; lcb-NEXT: li $2, 0 +; lcb-NEXT: jrc $ra +; +; lcbn-LABEL: bnez: +; lcbn: # %bb.0: # %entry +; lcbn-NEXT: li $2, %hi(i) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: lw $3, %lo(i)($2) +; lcbn-NEXT: bnez $3, $BB0_2 +; lcbn-NEXT: # %bb.1: # %if.then +; lcbn-NEXT: li $3, 0 +; lcbn-NEXT: #APP +; lcbn-NEXT: .set push +; lcbn-NEXT: .set at +; lcbn-NEXT: .set macro +; lcbn-NEXT: .set reorder +; lcbn-EMPTY: +; lcbn-NEXT: .space 10000 +; lcbn-EMPTY: +; lcbn-NEXT: .set pop +; lcbn-NEXT: #NO_APP +; lcbn-NEXT: sw $3, %lo(i)($2) +; lcbn-NEXT: $BB0_2: # %if.end +; lcbn-NEXT: li $2, 0 +; lcbn-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %cmp = icmp eq i32 %0, 0 @@ -21,15 +67,90 @@ if.then: ; preds = %entry if.end: ; preds = %if.then, %entry ret i32 0 } -; lcb: .ent bnez -; lcbn: .ent bnez -; lcb: bnez ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} -; lcbn-NOT: bnez ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst -; lcb: .end bnez -; lcbn: .end bnez ; Function Attrs: nounwind optsize define i32 @beqz() #0 { +; lcb-LABEL: beqz: +; lcb: # %bb.0: # %entry +; lcb-NEXT: li $2, %hi(i) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: lw $2, %lo(i)($2) +; lcb-NEXT: beqz $2, $BB1_2 +; lcb-NEXT: # %bb.1: # %if.else +; lcb-NEXT: li $2, %hi(j) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: li $3, 55 +; lcb-NEXT: sw $3, %lo(j)($2) +; lcb-NEXT: #APP +; lcb-NEXT: .set push +; lcb-NEXT: .set at +; lcb-NEXT: .set macro +; lcb-NEXT: .set reorder +; lcb-EMPTY: +; lcb-NEXT: .space 10000 +; lcb-EMPTY: +; lcb-NEXT: .set pop +; lcb-NEXT: #NO_APP +; lcb-NEXT: b $BB1_3 +; lcb-NEXT: $BB1_2: # %if.then +; lcb-NEXT: li $2, %hi(j) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: li $3, 10 +; lcb-NEXT: sw $3, %lo(j)($2) +; lcb-NEXT: #APP +; lcb-NEXT: .set push +; lcb-NEXT: .set at +; lcb-NEXT: .set macro +; lcb-NEXT: .set reorder +; lcb-EMPTY: +; lcb-NEXT: .space 10000 +; lcb-EMPTY: +; lcb-NEXT: .set pop 
+; lcb-NEXT: #NO_APP +; lcb-NEXT: $BB1_3: # %if.end +; lcb-NEXT: li $2, 0 +; lcb-NEXT: jrc $ra +; +; lcbn-LABEL: beqz: +; lcbn: # %bb.0: # %entry +; lcbn-NEXT: li $2, %hi(i) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: lw $2, %lo(i)($2) +; lcbn-NEXT: beqz $2, $BB1_2 +; lcbn-NEXT: # %bb.1: # %if.else +; lcbn-NEXT: li $2, %hi(j) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: li $3, 55 +; lcbn-NEXT: sw $3, %lo(j)($2) +; lcbn-NEXT: #APP +; lcbn-NEXT: .set push +; lcbn-NEXT: .set at +; lcbn-NEXT: .set macro +; lcbn-NEXT: .set reorder +; lcbn-EMPTY: +; lcbn-NEXT: .space 10000 +; lcbn-EMPTY: +; lcbn-NEXT: .set pop +; lcbn-NEXT: #NO_APP +; lcbn-NEXT: b $BB1_3 +; lcbn-NEXT: $BB1_2: # %if.then +; lcbn-NEXT: li $2, %hi(j) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: li $3, 10 +; lcbn-NEXT: sw $3, %lo(j)($2) +; lcbn-NEXT: #APP +; lcbn-NEXT: .set push +; lcbn-NEXT: .set at +; lcbn-NEXT: .set macro +; lcbn-NEXT: .set reorder +; lcbn-EMPTY: +; lcbn-NEXT: .space 10000 +; lcbn-EMPTY: +; lcbn-NEXT: .set pop +; lcbn-NEXT: #NO_APP +; lcbn-NEXT: $BB1_3: # %if.end +; lcbn-NEXT: li $2, 0 +; lcbn-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %cmp = icmp eq i32 %0, 0 @@ -49,16 +170,93 @@ if.end: ; preds = %if.else, %if.then ret i32 0 } -; lcb: .ent beqz -; lcbn: .ent beqz -; lcb: beqz ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} -; lcbn-NOT: beqz ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst -; lcb: .end beqz -; lcbn: .end beqz - - ; Function Attrs: nounwind optsize define void @bteqz() #0 { +; lcb-LABEL: bteqz: +; lcb: # %bb.0: # %entry +; lcb-NEXT: li $2, %hi(j) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: lw $2, %lo(j)($2) +; lcb-NEXT: li $3, %hi(i) +; lcb-NEXT: sll $3, $3, 16 +; lcb-NEXT: lw $3, %lo(i)($3) +; lcb-NEXT: cmp $3, $2 +; lcb-NEXT: bteqz $BB2_2 +; lcb-NEXT: # %bb.1: # %if.else +; lcb-NEXT: li $2, %hi(k) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: #APP +; lcb-NEXT: .set push +; lcb-NEXT: .set at +; lcb-NEXT: .set macro +; lcb-NEXT: .set reorder +; lcb-EMPTY: +; lcb-NEXT: 
.space 1000 +; lcb-EMPTY: +; lcb-NEXT: .set pop +; lcb-NEXT: #NO_APP +; lcb-NEXT: li $3, 2 +; lcb-NEXT: sw $3, %lo(k)($2) +; lcb-NEXT: jrc $ra +; lcb-NEXT: $BB2_2: # %if.then +; lcb-NEXT: li $2, %hi(k) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: li $3, 1 +; lcb-NEXT: sw $3, %lo(k)($2) +; lcb-NEXT: #APP +; lcb-NEXT: .set push +; lcb-NEXT: .set at +; lcb-NEXT: .set macro +; lcb-NEXT: .set reorder +; lcb-EMPTY: +; lcb-NEXT: .space 1000 +; lcb-EMPTY: +; lcb-NEXT: .set pop +; lcb-NEXT: #NO_APP +; lcb-NEXT: jrc $ra +; +; lcbn-LABEL: bteqz: +; lcbn: # %bb.0: # %entry +; lcbn-NEXT: li $2, %hi(j) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: lw $2, %lo(j)($2) +; lcbn-NEXT: li $3, %hi(i) +; lcbn-NEXT: sll $3, $3, 16 +; lcbn-NEXT: lw $3, %lo(i)($3) +; lcbn-NEXT: cmp $3, $2 +; lcbn-NEXT: bteqz $BB2_2 +; lcbn-NEXT: # %bb.1: # %if.else +; lcbn-NEXT: li $2, %hi(k) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: #APP +; lcbn-NEXT: .set push +; lcbn-NEXT: .set at +; lcbn-NEXT: .set macro +; lcbn-NEXT: .set reorder +; lcbn-EMPTY: +; lcbn-NEXT: .space 1000 +; lcbn-EMPTY: +; lcbn-NEXT: .set pop +; lcbn-NEXT: #NO_APP +; lcbn-NEXT: li $3, 2 +; lcbn-NEXT: sw $3, %lo(k)($2) +; lcbn-NEXT: jrc $ra +; lcbn-NEXT: $BB2_2: # %if.then +; lcbn-NEXT: li $2, %hi(k) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: li $3, 1 +; lcbn-NEXT: sw $3, %lo(k)($2) +; lcbn-NEXT: #APP +; lcbn-NEXT: .set push +; lcbn-NEXT: .set at +; lcbn-NEXT: .set macro +; lcbn-NEXT: .set reorder +; lcbn-EMPTY: +; lcbn-NEXT: .space 1000 +; lcbn-EMPTY: +; lcbn-NEXT: .set pop +; lcbn-NEXT: #NO_APP +; lcbn-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %1 = load i32, i32* @j, align 4, !tbaa !1 @@ -79,16 +277,65 @@ if.end: ; preds = %if.else, %if.then ret void } -; lcb: .ent bteqz -; lcbn: .ent bteqz -; lcb: btnez $BB{{[0-9]+}}_{{[0-9]+}} -; lcbn-NOT: btnez $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst -; lcb: .end bteqz -; lcbn: .end bteqz - - ; Function Attrs: nounwind optsize define void @btz() #0 { +; lcb-LABEL: btz: +; lcb: # %bb.0: 
# %entry +; lcb-NEXT: li $2, %hi(i) +; lcb-NEXT: sll $2, $2, 16 +; lcb-NEXT: lw $4, %lo(i)($2) +; lcb-NEXT: li $3, %hi(j) +; lcb-NEXT: sll $3, $3, 16 +; lcb-NEXT: lw $5, %lo(j)($3) +; lcb-NEXT: slt $5, $4 +; lcb-NEXT: bteqz $BB3_2 +; lcb-NEXT: $BB3_1: # %if.then +; lcb-NEXT: # =>This Inner Loop Header: Depth=1 +; lcb-NEXT: #APP +; lcb-NEXT: .set push +; lcb-NEXT: .set at +; lcb-NEXT: .set macro +; lcb-NEXT: .set reorder +; lcb-EMPTY: +; lcb-NEXT: .space 60000 +; lcb-EMPTY: +; lcb-NEXT: .set pop +; lcb-NEXT: #NO_APP +; lcb-NEXT: lw $4, %lo(i)($2) +; lcb-NEXT: lw $5, %lo(j)($3) +; lcb-NEXT: slt $5, $4 +; lcb-NEXT: btnez $BB3_1 +; lcb-NEXT: $BB3_2: # %if.end +; lcb-NEXT: jrc $ra +; +; lcbn-LABEL: btz: +; lcbn: # %bb.0: # %entry +; lcbn-NEXT: li $2, %hi(i) +; lcbn-NEXT: sll $2, $2, 16 +; lcbn-NEXT: lw $4, %lo(i)($2) +; lcbn-NEXT: li $3, %hi(j) +; lcbn-NEXT: sll $3, $3, 16 +; lcbn-NEXT: lw $5, %lo(j)($3) +; lcbn-NEXT: slt $5, $4 +; lcbn-NEXT: bteqz $BB3_2 +; lcbn-NEXT: $BB3_1: # %if.then +; lcbn-NEXT: # =>This Inner Loop Header: Depth=1 +; lcbn-NEXT: #APP +; lcbn-NEXT: .set push +; lcbn-NEXT: .set at +; lcbn-NEXT: .set macro +; lcbn-NEXT: .set reorder +; lcbn-EMPTY: +; lcbn-NEXT: .space 60000 +; lcbn-EMPTY: +; lcbn-NEXT: .set pop +; lcbn-NEXT: #NO_APP +; lcbn-NEXT: lw $4, %lo(i)($2) +; lcbn-NEXT: lw $5, %lo(j)($3) +; lcbn-NEXT: slt $5, $4 +; lcbn-NEXT: btnez $BB3_1 +; lcbn-NEXT: $BB3_2: # %if.end +; lcbn-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %1 = load i32, i32* @j, align 4, !tbaa !1 @@ -106,15 +353,6 @@ if.end: ; preds = %if.then, %entry ret void } -; lcb: .ent btz -; lcbn: .ent btz -; lcb: bteqz $BB{{[0-9]+}}_{{[0-9]+}} -; lcbn-NOT: bteqz $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst -; lcb: btnez $BB{{[0-9]+}}_{{[0-9]+}} -; lcbn-NOT: btnez $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst -; lcb: .end btz -; lcbn: .end btz - attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" 
"stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/Mips/lcb5.ll b/llvm/test/CodeGen/Mips/lcb5.ll index 96e924a44f725c..6b8ebfd07e6ffc 100644 --- a/llvm/test/CodeGen/Mips/lcb5.ll +++ b/llvm/test/CodeGen/Mips/lcb5.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mattr=mips16 -mattr=+soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=ci @i = global i32 0, align 4 @@ -6,6 +7,41 @@ ; Function Attrs: nounwind optsize define i32 @x0() #0 { +; ci-LABEL: x0: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(i) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $3, %lo(i)($2) +; ci-NEXT: beqz $3, $BB0_2 +; ci-NEXT: # %bb.1: # %if.else +; ci-NEXT: li $3, 1 +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1004 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: b $BB0_3 # 16 bit inst +; ci-NEXT: $BB0_2: # %if.then +; ci-NEXT: li $3, 0 +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: $BB0_3: # %if.end +; ci-NEXT: sw $3, %lo(i)($2) +; ci-NEXT: li $2, 0 +; ci-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %cmp = icmp eq i32 %0, 0 @@ -25,13 +61,48 @@ if.end: ; preds = %if.else, %if.then ret i32 0 } -; ci: .ent x0 -; ci: beqz $3, $BB0_2 -; ci: $BB0_2: -; ci: .end x0 - ; Function Attrs: nounwind optsize define i32 @x1() #0 { +; ci-LABEL: x1: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(i) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $3, %lo(i)($2) +; ci-NEXT: bnez $3, $BB1_1 # 16 bit inst +; ci-NEXT: jal $BB1_2 # branch +; ci-NEXT: nop +; ci-NEXT: $BB1_1: # %if.else +; ci-NEXT: li $3, 1 
+; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1000004 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: jal $BB1_3 # branch +; ci-NEXT: nop +; ci-NEXT: .p2align 2 +; ci-NEXT: $BB1_2: # %if.then +; ci-NEXT: li $3, 0 +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1000000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: .p2align 2 +; ci-NEXT: $BB1_3: # %if.end +; ci-NEXT: sw $3, %lo(i)($2) +; ci-NEXT: li $2, 0 +; ci-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %cmp = icmp eq i32 %0, 0 @@ -51,15 +122,49 @@ if.end: ; preds = %if.else, %if.then ret i32 0 } -; ci: .ent x1 -; ci: bnez $3, $BB1_1 # 16 bit inst -; ci: jal $BB1_2 # branch -; ci: nop -; ci: $BB1_1: -; ci: .end x1 ; Function Attrs: nounwind optsize define i32 @y0() #0 { +; ci-LABEL: y0: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(i) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $2, %lo(i)($2) +; ci-NEXT: beqz $2, $BB2_2 +; ci-NEXT: # %bb.1: # %if.else +; ci-NEXT: li $2, %hi(j) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: li $3, 55 +; ci-NEXT: sw $3, %lo(j)($2) +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1004 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: b $BB2_3 # 16 bit inst +; ci-NEXT: $BB2_2: # %if.then +; ci-NEXT: li $2, %hi(j) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: li $3, 10 +; ci-NEXT: sw $3, %lo(j)($2) +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: $BB2_3: # %if.end +; ci-NEXT: li $2, 0 +; ci-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %cmp = icmp eq i32 %0, 0 @@ -79,12 +184,53 @@ 
if.end: ; preds = %if.else, %if.then ret i32 0 } -; ci: .ent y0 -; ci: beqz $2, $BB2_2 -; ci: .end y0 - ; Function Attrs: nounwind optsize define i32 @y1() #0 { +; ci-LABEL: y1: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(i) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $2, %lo(i)($2) +; ci-NEXT: bnez $2, $BB3_1 # 16 bit inst +; ci-NEXT: jal $BB3_2 # branch +; ci-NEXT: nop +; ci-NEXT: $BB3_1: # %if.else +; ci-NEXT: li $2, %hi(j) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: li $3, 55 +; ci-NEXT: sw $3, %lo(j)($2) +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1000004 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: jal $BB3_3 # branch +; ci-NEXT: nop +; ci-NEXT: .p2align 2 +; ci-NEXT: $BB3_2: # %if.then +; ci-NEXT: li $2, %hi(j) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: li $3, 10 +; ci-NEXT: sw $3, %lo(j)($2) +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 1000000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: .p2align 2 +; ci-NEXT: $BB3_3: # %if.end +; ci-NEXT: li $2, 0 +; ci-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %cmp = icmp eq i32 %0, 0 @@ -104,15 +250,51 @@ if.end: ; preds = %if.else, %if.then ret i32 0 } -; ci: .ent y1 -; ci: bnez $2, $BB3_1 # 16 bit inst -; ci: jal $BB3_2 # branch -; ci: nop -; ci: $BB3_1: -; ci: .end y1 ; Function Attrs: nounwind optsize define void @z0() #0 { +; ci-LABEL: z0: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(j) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $2, %lo(j)($2) +; ci-NEXT: li $3, %hi(i) +; ci-NEXT: sll $3, $3, 16 +; ci-NEXT: lw $3, %lo(i)($3) +; ci-NEXT: cmp $3, $2 +; ci-NEXT: bteqz $BB4_2 +; ci-NEXT: # %bb.1: # %if.else +; ci-NEXT: li $2, %hi(k) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: 
+; ci-NEXT: .space 10004 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: li $3, 2 +; ci-NEXT: sw $3, %lo(k)($2) +; ci-NEXT: jrc $ra +; ci-NEXT: $BB4_2: # %if.then +; ci-NEXT: li $2, %hi(k) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: li $3, 1 +; ci-NEXT: sw $3, %lo(k)($2) +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 10000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %1 = load i32, i32* @j, align 4, !tbaa !1 @@ -133,12 +315,54 @@ if.end: ; preds = %if.else, %if.then ret void } -; ci: .ent z0 -; ci: btnez $BB4_2 -; ci: .end z0 ; Function Attrs: nounwind optsize define void @z1() #0 { +; ci-LABEL: z1: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(j) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $2, %lo(j)($2) +; ci-NEXT: li $3, %hi(i) +; ci-NEXT: sll $3, $3, 16 +; ci-NEXT: lw $3, %lo(i)($3) +; ci-NEXT: cmp $3, $2 +; ci-NEXT: btnez $BB5_1 # 16 bit inst +; ci-NEXT: jal $BB5_2 # branch +; ci-NEXT: nop +; ci-NEXT: $BB5_1: # %if.else +; ci-NEXT: li $2, %hi(k) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 10000004 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: li $3, 2 +; ci-NEXT: sw $3, %lo(k)($2) +; ci-NEXT: jrc $ra +; ci-NEXT: .p2align 2 +; ci-NEXT: $BB5_2: # %if.then +; ci-NEXT: li $2, %hi(k) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: li $3, 1 +; ci-NEXT: sw $3, %lo(k)($2) +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 10000000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %1 = load i32, i32* @j, align 4, !tbaa !1 @@ -159,15 +383,37 @@ if.end: ; preds = %if.else, %if.then ret void } -; ci: 
.ent z1 -; ci: bteqz $BB5_1 # 16 bit inst -; ci: jal $BB5_2 # branch -; ci: nop -; ci: $BB5_1: -; ci: .end z1 ; Function Attrs: nounwind optsize define void @z3() #0 { +; ci-LABEL: z3: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(i) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $4, %lo(i)($2) +; ci-NEXT: li $3, %hi(j) +; ci-NEXT: sll $3, $3, 16 +; ci-NEXT: lw $5, %lo(j)($3) +; ci-NEXT: slt $5, $4 +; ci-NEXT: bteqz $BB6_2 +; ci-NEXT: $BB6_1: # %if.then +; ci-NEXT: # =>This Inner Loop Header: Depth=1 +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 10000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: lw $4, %lo(i)($2) +; ci-NEXT: lw $5, %lo(j)($3) +; ci-NEXT: slt $5, $4 +; ci-NEXT: btnez $BB6_1 +; ci-NEXT: $BB6_2: # %if.end +; ci-NEXT: jrc $ra entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %1 = load i32, i32* @j, align 4, !tbaa !1 @@ -185,12 +431,42 @@ if.end: ; preds = %if.then, %entry ret void } -; ci: .ent z3 -; ci: bteqz $BB6_2 -; ci: .end z3 - ; Function Attrs: nounwind optsize define void @z4() #0 { +; ci-LABEL: z4: +; ci: # %bb.0: # %entry +; ci-NEXT: li $2, %hi(i) +; ci-NEXT: sll $2, $2, 16 +; ci-NEXT: lw $4, %lo(i)($2) +; ci-NEXT: li $3, %hi(j) +; ci-NEXT: sll $3, $3, 16 +; ci-NEXT: lw $5, %lo(j)($3) +; ci-NEXT: slt $5, $4 +; ci-NEXT: btnez $BB7_1 # 16 bit inst +; ci-NEXT: jal $BB7_2 # branch +; ci-NEXT: nop +; ci-NEXT: .p2align 2 +; ci-NEXT: $BB7_1: # %if.then +; ci-NEXT: # =>This Inner Loop Header: Depth=1 +; ci-NEXT: #APP +; ci-NEXT: .set push +; ci-NEXT: .set at +; ci-NEXT: .set macro +; ci-NEXT: .set reorder +; ci-EMPTY: +; ci-NEXT: .space 10000000 +; ci-EMPTY: +; ci-NEXT: .set pop +; ci-NEXT: #NO_APP +; ci-NEXT: lw $4, %lo(i)($2) +; ci-NEXT: lw $5, %lo(j)($3) +; ci-NEXT: slt $5, $4 +; ci-NEXT: bteqz $BB7_2 # 16 bit inst +; ci-NEXT: jal $BB7_1 # branch +; ci-NEXT: nop +; ci-NEXT: .p2align 2 +; ci-NEXT: $BB7_2: # %if.end +; ci-NEXT: jrc $ra 
entry: %0 = load i32, i32* @i, align 4, !tbaa !1 %1 = load i32, i32* @j, align 4, !tbaa !1 @@ -208,14 +484,6 @@ if.end: ; preds = %if.then, %entry ret void } -; ci: .ent z4 -; ci: btnez $BB7_1 # 16 bit inst -; ci: jal $BB7_2 # branch -; ci: nop -; ci: .p2align 2 -; ci: $BB7_1: -; ci: .end z4 - attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/Mips/longbranch/compact-branches-long-branch.ll b/llvm/test/CodeGen/Mips/longbranch/compact-branches-long-branch.ll index 709cd477a778e6..dbd071f81cbfed 100644 --- a/llvm/test/CodeGen/Mips/longbranch/compact-branches-long-branch.ll +++ b/llvm/test/CodeGen/Mips/longbranch/compact-branches-long-branch.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=mips -mcpu=mips32r6 -force-mips-long-branch | FileCheck %s ; Check that when MIPS32R6 with the static relocation model with the usage of @@ -9,11 +10,29 @@ declare i32 @f(i32) declare i32 @g() -; CHECK-LABEL: test1: -; CHECK: bnezc -; CHECK-NEXT: nop - define i32 @test1(i32 %a) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: bnezc $4, $BB0_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB0_3 +; CHECK-NEXT: $BB0_2: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB0_3: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp eq i32 
%a, 0 br i1 %0, label %cond.true, label %cond.false @@ -25,11 +44,30 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test2: -; CHECK: bgezc -; CHECK-NEXT: nop define i32 @test2(i32 %a) { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: bgezc $4, $BB1_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB1_3 +; CHECK-NEXT: $BB1_2: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB1_3: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp sge i32 %a, 0 br i1 %0, label %cond.true, label %cond.false @@ -41,11 +79,30 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test3: -; CHECK: blezc -; CHECK-NEXT: nop define i32 @test3(i32 %a) { +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: blezc $4, $BB2_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB2_3 +; CHECK-NEXT: $BB2_2: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB2_3: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp sle i32 %a, 0 br i1 %0, label %cond.true, label %cond.false @@ -57,11 +114,30 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test4: -; CHECK: bgtzc -; CHECK-NEXT: nop define i32 @test4(i32 %a) { +; 
CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: bgtzc $4, $BB3_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB3_3 +; CHECK-NEXT: $BB3_2: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB3_3: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp sgt i32 %a, 0 br i1 %0, label %cond.true, label %cond.false @@ -73,11 +149,29 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test5: -; CHECK: bgezc -; CHECK-NEXT: nop - define i32 @test5(i32 %a) { +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: bgezc $4, $BB4_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB4_3 +; CHECK-NEXT: $BB4_2: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB4_3: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp slt i32 %a, 0 br i1 %0, label %cond.true, label %cond.false @@ -89,11 +183,30 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test6: -; CHECK: bnezc -; CHECK-NEXT: nop - define i32 @test6(i32 %a, i32 %b) { +; CHECK-LABEL: test6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; 
CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: sltu $1, $5, $4 +; CHECK-NEXT: bnezc $1, $BB5_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB5_3 +; CHECK-NEXT: $BB5_2: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB5_3: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp ugt i32 %a, %b br i1 %0, label %cond.true, label %cond.false @@ -105,11 +218,31 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test7: -; CHECK: beqzc -; CHECK-NEXT: nop define i32 @test7(i32 %a, i32 %b) { +; CHECK-LABEL: test7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: sltu $1, $4, $5 +; CHECK-NEXT: bnezc $1, $BB6_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB6_3 +; CHECK-NEXT: $BB6_2: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB6_3: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp uge i32 %a, %b br i1 %0, label %cond.true, label %cond.false @@ -121,11 +254,31 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test8: -; CHECK: bnezc -; CHECK-NEXT: nop define i32 @test8(i32 %a, i32 %b) { +; CHECK-LABEL: test8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: sltu $1, $4, $5 +; CHECK-NEXT: bnezc $1, $BB7_2 +; CHECK-NEXT: nop 
+; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB7_3 +; CHECK-NEXT: $BB7_2: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB7_3: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp ult i32 %a, %b br i1 %0, label %cond.true, label %cond.false @@ -137,11 +290,31 @@ cond.false: ret i32 %2 } -; CHECK-LABEL: test9: -; CHECK: beqzc -; CHECK-NEXT: nop define i32 @test9(i32 %a, i32 %b) { +; CHECK-LABEL: test9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addiu $sp, $sp, -24 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -4 +; CHECK-NEXT: sltu $1, $5, $4 +; CHECK-NEXT: bnezc $1, $BB8_2 +; CHECK-NEXT: nop +; CHECK-NEXT: # %bb.1: # %entry +; CHECK-NEXT: bc $BB8_3 +; CHECK-NEXT: $BB8_2: # %cond.false +; CHECK-NEXT: jal g +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 +; CHECK-NEXT: $BB8_3: # %cond.true +; CHECK-NEXT: jal f +; CHECK-NEXT: nop +; CHECK-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: addiu $sp, $sp, 24 entry: %0 = icmp ule i32 %a, %b br i1 %0, label %cond.true, label %cond.false diff --git a/llvm/test/CodeGen/Mips/seleq.ll b/llvm/test/CodeGen/Mips/seleq.ll index 7d1e034d68c7c1..34565ea2727583 100644 --- a/llvm/test/CodeGen/Mips/seleq.ll +++ b/llvm/test/CodeGen/Mips/seleq.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=MIPS16 @t = global i32 10, align 4 @f = global i32 199, 
align 4 @@ -11,6 +12,74 @@ @z4 = common global i32 0, align 4 define void @calc_seleq() nounwind { +; MIPS16-LABEL: calc_seleq: +; MIPS16: # %bb.0: # %entry +; MIPS16-NEXT: lui $2, %hi(_gp_disp) +; MIPS16-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS16-NEXT: li $2, %hi(_gp_disp) +; MIPS16-NEXT: addiu $3, $pc, %lo(_gp_disp) +; MIPS16-NEXT: sll $2, $2, 16 +; MIPS16-NEXT: addu $2, $3, $2 +; MIPS16-NEXT: lw $4, %got(b)($2) +; MIPS16-NEXT: lw $5, 0($4) +; MIPS16-NEXT: lw $3, %got(a)($2) +; MIPS16-NEXT: lw $6, 0($3) +; MIPS16-NEXT: cmp $6, $5 +; MIPS16-NEXT: bteqz $BB0_2 # 16 bit inst +; MIPS16-NEXT: # %bb.1: # %cond.false +; MIPS16-NEXT: lw $5, %got(t)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: b $BB0_3 # 16 bit inst +; MIPS16-NEXT: $BB0_2: # %cond.true +; MIPS16-NEXT: lw $5, %got(f)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: $BB0_3: # %cond.end +; MIPS16-NEXT: lw $6, %got(z1)($2) +; MIPS16-NEXT: sw $5, 0($6) +; MIPS16-NEXT: lw $5, 0($3) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: cmp $4, $5 +; MIPS16-NEXT: bteqz $BB0_5 # 16 bit inst +; MIPS16-NEXT: # %bb.4: # %cond.false3 +; MIPS16-NEXT: lw $4, %got(t)($2) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: b $BB0_6 # 16 bit inst +; MIPS16-NEXT: $BB0_5: # %cond.true2 +; MIPS16-NEXT: lw $4, %got(f)($2) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: $BB0_6: # %cond.end4 +; MIPS16-NEXT: lw $5, %got(z2)($2) +; MIPS16-NEXT: sw $4, 0($5) +; MIPS16-NEXT: lw $5, 0($3) +; MIPS16-NEXT: lw $4, %got(c)($2) +; MIPS16-NEXT: lw $6, 0($4) +; MIPS16-NEXT: cmp $6, $5 +; MIPS16-NEXT: bteqz $BB0_8 # 16 bit inst +; MIPS16-NEXT: # %bb.7: # %cond.false8 +; MIPS16-NEXT: lw $5, %got(f)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: b $BB0_9 # 16 bit inst +; MIPS16-NEXT: $BB0_8: # %cond.true7 +; MIPS16-NEXT: lw $5, %got(t)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: $BB0_9: # %cond.end9 +; MIPS16-NEXT: lw $6, %got(z3)($2) +; MIPS16-NEXT: sw $5, 0($6) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: lw $3, 0($3) +; MIPS16-NEXT: cmp $3, $4 +; 
MIPS16-NEXT: bteqz $BB0_11 # 16 bit inst +; MIPS16-NEXT: # %bb.10: # %cond.false13 +; MIPS16-NEXT: lw $3, %got(f)($2) +; MIPS16-NEXT: lw $3, 0($3) +; MIPS16-NEXT: b $BB0_12 # 16 bit inst +; MIPS16-NEXT: $BB0_11: # %cond.true12 +; MIPS16-NEXT: lw $3, %got(t)($2) +; MIPS16-NEXT: lw $3, 0($3) +; MIPS16-NEXT: $BB0_12: # %cond.end14 +; MIPS16-NEXT: lw $2, %got(z4)($2) +; MIPS16-NEXT: sw $3, 0($2) +; MIPS16-NEXT: jrc $ra entry: %0 = load i32, i32* @a, align 4 %1 = load i32, i32* @b, align 4 @@ -80,16 +149,3 @@ cond.end14: ; preds = %cond.false13, %cond } attributes #0 = { nounwind "target-cpu"="mips32" "target-features"="+o32,+mips32" } - -; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} - -; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} - -; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} - -; 16: cmp ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} - diff --git a/llvm/test/CodeGen/Mips/selle.ll b/llvm/test/CodeGen/Mips/selle.ll index 8925aac10c4d19..ffad4ba1c349d9 100644 --- a/llvm/test/CodeGen/Mips/selle.ll +++ b/llvm/test/CodeGen/Mips/selle.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=16 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=mipsel -mattr=mips16 -relocation-model=pic < %s | FileCheck %s -check-prefix=MIPS16 @t = global i32 10, align 4 @f = global i32 199, align 4 @@ -12,6 +13,74 @@ @.str = private unnamed_addr constant [5 x i8] c"%i \0A\00", align 1 define void @calc_z() nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" { +; MIPS16-LABEL: calc_z: +; MIPS16: # %bb.0: # %entry +; MIPS16-NEXT: lui $2, %hi(_gp_disp) +; MIPS16-NEXT: addiu $2, $2, %lo(_gp_disp) +; MIPS16-NEXT: li $2, %hi(_gp_disp) +; MIPS16-NEXT: addiu $3, $pc, %lo(_gp_disp) +; MIPS16-NEXT: sll $2, $2, 16 +; MIPS16-NEXT: addu $2, $3, $2 +; MIPS16-NEXT: lw $3, %got(a)($2) +; 
MIPS16-NEXT: lw $5, 0($3) +; MIPS16-NEXT: lw $4, %got(b)($2) +; MIPS16-NEXT: lw $6, 0($4) +; MIPS16-NEXT: slt $6, $5 +; MIPS16-NEXT: bteqz $BB0_2 # 16 bit inst +; MIPS16-NEXT: # %bb.1: # %cond.false +; MIPS16-NEXT: lw $5, %got(f)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: b $BB0_3 # 16 bit inst +; MIPS16-NEXT: $BB0_2: # %cond.true +; MIPS16-NEXT: lw $5, %got(t)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: $BB0_3: # %cond.end +; MIPS16-NEXT: lw $6, %got(z1)($2) +; MIPS16-NEXT: sw $5, 0($6) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: lw $5, 0($3) +; MIPS16-NEXT: slt $5, $4 +; MIPS16-NEXT: bteqz $BB0_5 # 16 bit inst +; MIPS16-NEXT: # %bb.4: # %cond.false3 +; MIPS16-NEXT: lw $4, %got(t)($2) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: b $BB0_6 # 16 bit inst +; MIPS16-NEXT: $BB0_5: # %cond.true2 +; MIPS16-NEXT: lw $4, %got(f)($2) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: $BB0_6: # %cond.end4 +; MIPS16-NEXT: lw $5, %got(z2)($2) +; MIPS16-NEXT: sw $4, 0($5) +; MIPS16-NEXT: lw $4, %got(c)($2) +; MIPS16-NEXT: lw $5, 0($4) +; MIPS16-NEXT: lw $6, 0($3) +; MIPS16-NEXT: slt $6, $5 +; MIPS16-NEXT: bteqz $BB0_8 # 16 bit inst +; MIPS16-NEXT: # %bb.7: # %cond.false8 +; MIPS16-NEXT: lw $5, %got(f)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: b $BB0_9 # 16 bit inst +; MIPS16-NEXT: $BB0_8: # %cond.true7 +; MIPS16-NEXT: lw $5, %got(t)($2) +; MIPS16-NEXT: lw $5, 0($5) +; MIPS16-NEXT: $BB0_9: # %cond.end9 +; MIPS16-NEXT: lw $6, %got(z3)($2) +; MIPS16-NEXT: sw $5, 0($6) +; MIPS16-NEXT: lw $3, 0($3) +; MIPS16-NEXT: lw $4, 0($4) +; MIPS16-NEXT: slt $4, $3 +; MIPS16-NEXT: bteqz $BB0_11 # 16 bit inst +; MIPS16-NEXT: # %bb.10: # %cond.false13 +; MIPS16-NEXT: lw $3, %got(f)($2) +; MIPS16-NEXT: lw $3, 0($3) +; MIPS16-NEXT: b $BB0_12 # 16 bit inst +; MIPS16-NEXT: $BB0_11: # %cond.true12 +; MIPS16-NEXT: lw $3, %got(t)($2) +; MIPS16-NEXT: lw $3, 0($3) +; MIPS16-NEXT: $BB0_12: # %cond.end14 +; MIPS16-NEXT: lw $2, %got(z4)($2) +; MIPS16-NEXT: sw $3, 0($2) +; MIPS16-NEXT: jrc $ra 
entry: %0 = load i32, i32* @a, align 4 %1 = load i32, i32* @b, align 4 @@ -80,17 +149,6 @@ cond.end14: ; preds = %cond.false13, %cond ret void } -; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} - -; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} - -; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} - -; 16: slt ${{[0-9]+}}, ${{[0-9]+}} -; 16: btnez $BB{{[0-9]+}}_{{[0-9]}} attributes #0 = { nounwind "target-cpu"="mips16" "target-features"="+mips16,+o32" } attributes #1 = { "target-cpu"="mips16" "target-features"="+mips16,+o32" } diff --git a/llvm/test/CodeGen/PowerPC/brcond.ll b/llvm/test/CodeGen/PowerPC/brcond.ll index b8c98427f107f6..3df169dd64da03 100644 --- a/llvm/test/CodeGen/PowerPC/brcond.ll +++ b/llvm/test/CodeGen/PowerPC/brcond.ll @@ -1,12 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \ ; RUN: -ppc-reduce-cr-logicals=false < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \ ; RUN: -ppc-reduce-cr-logicals=false < %s | FileCheck %s define signext i32 @testi32slt(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { -; CHECK-LABEL: testi32slt -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi32slt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB0_2: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -19,9 +29,18 @@ iffalse: } define signext i32 @testi32ult(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { -; 
CHECK-LABEL: testi32ult -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi32ult: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB1_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB1_2: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -34,9 +53,18 @@ iffalse: } define signext i32 @testi32sle(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { -; CHECK-LABEL: testi32sle -; CHECK: crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi32sle: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB2_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB2_2: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -49,9 +77,18 @@ iffalse: } define signext i32 @testi32ule(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { -; CHECK-LABEL: testi32ule -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi32ule: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crandc 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB3_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB3_2: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -65,8 +102,17 @@ iffalse: define signext i32 @testi32eq(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 
signext %a1, i32 signext %a2) #0 { ; CHECK-LABEL: testi32eq: -; CHECK: crxor [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB4_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB4_2: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -80,8 +126,17 @@ iffalse: define signext i32 @testi32sge(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { ; CHECK-LABEL: testi32sge: -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crandc 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB5_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB5_2: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -95,8 +150,17 @@ iffalse: define signext i32 @testi32uge(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { ; CHECK-LABEL: testi32uge: -; CHECK: crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB6_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB6_2: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -110,8 +174,17 @@ iffalse: define signext i32 @testi32sgt(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { ; CHECK-LABEL: 
testi32sgt: -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB7_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB7_2: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -125,8 +198,17 @@ iffalse: define signext i32 @testi32ugt(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { ; CHECK-LABEL: testi32ugt: -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB8_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB8_2: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -140,8 +222,17 @@ iffalse: define signext i32 @testi32ne(i32 signext %c1, i32 signext %c2, i32 signext %c3, i32 signext %c4, i32 signext %a1, i32 signext %a2) #0 { ; CHECK-LABEL: testi32ne: -; CHECK: creqv [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpw 5, 6 +; CHECK-NEXT: cmpw 1, 3, 4 +; CHECK-NEXT: creqv 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB9_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: extsw 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB9_2: # %iffalse +; CHECK-NEXT: extsw 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i32 %c3, %c4 %cmp3tmp = icmp eq i32 %c1, %c2 @@ -154,9 +245,18 @@ iffalse: } define i64 @testi64slt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64slt -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; 
CHECK-LABEL: testi64slt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB10_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB10_2: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -169,9 +269,18 @@ iffalse: } define i64 @testi64ult(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64ult -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64ult: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB11_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB11_2: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -184,9 +293,18 @@ iffalse: } define i64 @testi64sle(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64sle -; CHECK: crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64sle: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB12_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB12_2: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -199,9 +317,18 @@ iffalse: } define i64 @testi64ule(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64ule -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64ule: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crandc 20, 6, 2 
+; CHECK-NEXT: bc 4, 20, .LBB13_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB13_2: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -214,9 +341,18 @@ iffalse: } define i64 @testi64eq(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64eq -; CHECK: crxor [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64eq: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB14_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB14_2: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -229,9 +365,18 @@ iffalse: } define i64 @testi64sge(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64sge -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64sge: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crandc 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB15_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB15_2: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -244,9 +389,18 @@ iffalse: } define i64 @testi64uge(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64uge -; CHECK: crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64uge: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB16_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB16_2: # 
%iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -259,9 +413,18 @@ iffalse: } define i64 @testi64sgt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64sgt -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64sgt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB17_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB17_2: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -274,9 +437,18 @@ iffalse: } define i64 @testi64ugt(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64ugt -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64ugt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB18_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB18_2: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -289,9 +461,18 @@ iffalse: } define i64 @testi64ne(i64 %c1, i64 %c2, i64 %c3, i64 %c4, i64 %a1, i64 %a2) #0 { -; CHECK-LABEL: testi64ne -; CHECK: creqv [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testi64ne: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpd 5, 6 +; CHECK-NEXT: cmpd 1, 3, 4 +; CHECK-NEXT: creqv 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB19_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: mr 3, 7 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB19_2: # %iffalse +; CHECK-NEXT: mr 3, 8 +; CHECK-NEXT: blr entry: %cmp1 = icmp eq i64 %c3, %c4 %cmp3tmp = icmp eq i64 %c1, %c2 @@ -304,9 +485,18 @@ 
iffalse: } define float @testfloatslt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatslt -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatslt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB20_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB20_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -319,9 +509,18 @@ iffalse: } define float @testfloatult(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatult -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatult: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB21_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB21_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -334,9 +533,18 @@ iffalse: } define float @testfloatsle(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatsle -; CHECK: crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatsle: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB22_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB22_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -349,9 +557,18 @@ iffalse: } define float 
@testfloatule(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatule -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatule: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB23_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB23_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -364,9 +581,18 @@ iffalse: } define float @testfloateq(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloateq -; CHECK: crxor [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloateq: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB24_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB24_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -379,9 +605,18 @@ iffalse: } define float @testfloatsge(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatsge -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatsge: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB25_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB25_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -394,9 +629,18 @@ iffalse: } define float @testfloatuge(float %c1, float %c2, float 
%c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatuge -; CHECK: crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatuge: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB26_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB26_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -409,9 +653,18 @@ iffalse: } define float @testfloatsgt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatsgt -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatsgt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB27_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB27_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -424,9 +677,18 @@ iffalse: } define float @testfloatugt(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 { -; CHECK-LABEL: testfloatugt -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatugt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB28_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB28_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -439,9 +701,18 @@ iffalse: } define float @testfloatne(float %c1, float %c2, float %c3, float %c4, float %a1, float %a2) #0 
{ -; CHECK-LABEL: testfloatne -; CHECK: creqv [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testfloatne: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: creqv 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB29_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB29_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq float %c3, %c4 %cmp3tmp = fcmp oeq float %c1, %c2 @@ -454,9 +725,18 @@ iffalse: } define double @testdoubleslt(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { -; CHECK-LABEL: testdoubleslt -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testdoubleslt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB30_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB30_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -470,8 +750,17 @@ iffalse: define double @testdoubleult(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { ; CHECK-LABEL: testdoubleult: -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB31_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB31_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -484,9 +773,18 @@ iffalse: } define double @testdoublesle(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { -; CHECK-LABEL: testdoublesle -; CHECK: 
crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testdoublesle: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB32_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB32_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -500,8 +798,17 @@ iffalse: define double @testdoubleule(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { ; CHECK-LABEL: testdoubleule: -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB33_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB33_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -514,9 +821,18 @@ iffalse: } define double @testdoubleeq(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { -; CHECK-LABEL: testdoubleeq -; CHECK: crxor [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testdoubleeq: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crxor 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB34_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB34_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -529,9 +845,18 @@ iffalse: } define double @testdoublesge(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { -; CHECK-LABEL: testdoublesge -; CHECK: crandc [[REG:[0-9]+]], 6, 2 -; CHECK: 
bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testdoublesge: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 6, 2 +; CHECK-NEXT: bc 4, 20, .LBB35_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB35_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -544,9 +869,18 @@ iffalse: } define double @testdoubleuge(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { -; CHECK-LABEL: testdoubleuge -; CHECK: crandc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testdoubleuge: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crandc 20, 2, 6 +; CHECK-NEXT: bc 4, 20, .LBB36_2 +; CHECK-NEXT: # %bb.1: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB36_2: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -560,8 +894,17 @@ iffalse: define double @testdoublesgt(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { ; CHECK-LABEL: testdoublesgt: -; CHECK: crorc [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB37_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB37_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -574,9 +917,18 @@ iffalse: } define double @testdoubleugt(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { -; CHECK-LABEL: testdoubleugt -; CHECK: crorc [[REG:[0-9]+]], 2, 6 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} 
+; CHECK-LABEL: testdoubleugt: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: crorc 20, 2, 6 +; CHECK-NEXT: bc 12, 20, .LBB38_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB38_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 @@ -589,9 +941,18 @@ iffalse: } define double @testdoublene(double %c1, double %c2, double %c3, double %c4, double %a1, double %a2) #0 { -; CHECK-LABEL: testdoublene -; CHECK: creqv [[REG:[0-9]+]], 6, 2 -; CHECK: bc 12, [[REG]], {{\.[a-zA-Z0-9_]+}} +; CHECK-LABEL: testdoublene: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fcmpu 0, 3, 4 +; CHECK-NEXT: fcmpu 1, 1, 2 +; CHECK-NEXT: creqv 20, 6, 2 +; CHECK-NEXT: bc 12, 20, .LBB39_2 +; CHECK-NEXT: # %bb.1: # %iftrue +; CHECK-NEXT: fmr 1, 5 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB39_2: # %iffalse +; CHECK-NEXT: fmr 1, 6 +; CHECK-NEXT: blr entry: %cmp1 = fcmp oeq double %c3, %c4 %cmp3tmp = fcmp oeq double %c1, %c2 diff --git a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll index ce2f93871359ae..c60efc2401609b 100644 --- a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll +++ b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -43,12 +43,12 @@ define signext i32 @zeroEqualityTest01(i8* %x, i8* %y) { ; CHECK-NEXT: ld 3, 8(3) ; CHECK-NEXT: ld 4, 8(4) ; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: beq 0, .LBB1_3 ; CHECK-NEXT: .LBB1_2: # %res_block ; CHECK-NEXT: li 3, 1 -; CHECK-NEXT: .LBB1_3: # %endblock -; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 16) %not.tobool = icmp ne i32 %call, 0 @@ -73,12 +73,12 @@ define signext i32 @zeroEqualityTest03(i8* %x, i8* %y) { ; 
CHECK-NEXT: lbz 3, 6(3) ; CHECK-NEXT: lbz 4, 6(4) ; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: beq 0, .LBB2_4 ; CHECK-NEXT: .LBB2_3: # %res_block ; CHECK-NEXT: li 3, 1 -; CHECK-NEXT: .LBB2_4: # %endblock -; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: blr +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 7) %not.lnot = icmp ne i32 %call, 0 @@ -136,14 +136,16 @@ define signext i32 @equalityFoldOneConstant(i8* %X) { ; CHECK-NEXT: sldi 4, 4, 32 ; CHECK-NEXT: ori 4, 4, 2 ; CHECK-NEXT: cmpld 3, 4 -; CHECK-NEXT: li 3, 0 -; CHECK-NEXT: beq 0, .LBB6_3 +; CHECK-NEXT: beq 0, .LBB6_4 ; CHECK-NEXT: .LBB6_2: # %res_block ; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: .LBB6_3: # %endblock ; CHECK-NEXT: cntlzw 3, 3 ; CHECK-NEXT: srwi 3, 3, 5 ; CHECK-NEXT: blr +; CHECK-NEXT: .LBB6_4: +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: b .LBB6_3 %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* %X, i64 16) %not.tobool = icmp eq i32 %call, 0 %cond = zext i1 %not.tobool to i32 diff --git a/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll b/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll index f6506b3c87f869..c65acff5f0e106 100644 --- a/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll +++ b/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll @@ -17,24 +17,28 @@ define dso_local i1 @t(%class.A* %this, i32 %color, i32 %vertex) local_unnamed_addr { ; CHECK-P9-LABEL: t: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: li r5, 1 -; CHECK-P9-NEXT: bc 12, 4*cr5+lt, .LBB0_4 +; CHECK-P9-NEXT: bc 12, 4*cr5+lt, .LBB0_3 ; CHECK-P9-NEXT: # %bb.1: # %land.lhs.true -; CHECK-P9-NEXT: bc 12, 4*cr5+lt, .LBB0_5 -; CHECK-P9-NEXT: .LBB0_2: # %for.inc +; CHECK-P9-NEXT: li r4, 1 +; CHECK-P9-NEXT: bc 4, 4*cr5+lt, .LBB0_4 +; CHECK-P9-NEXT: # %bb.2: # %cleanup16 +; CHECK-P9-NEXT: mr r3, r4 +; CHECK-P9-NEXT: blr +; CHECK-P9-NEXT: .LBB0_3: # %lor.lhs.false +; 
CHECK-P9-NEXT: cmplwi r4, 0 +; CHECK-P9-NEXT: beq cr0, .LBB0_6 +; CHECK-P9-NEXT: .LBB0_4: # %for.inc ; CHECK-P9-NEXT: lhz r3, 5308(r3) ; CHECK-P9-NEXT: cmplwi r3, 2 -; CHECK-P9-NEXT: bge- cr0, .LBB0_6 -; CHECK-P9-NEXT: # %bb.3: # %land.lhs.true.1 +; CHECK-P9-NEXT: bge- cr0, .LBB0_7 +; CHECK-P9-NEXT: # %bb.5: # %land.lhs.true.1 ; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: blr -; CHECK-P9-NEXT: .LBB0_4: # %lor.lhs.false -; CHECK-P9-NEXT: cmplwi r4, 0 -; CHECK-P9-NEXT: bne cr0, .LBB0_2 -; CHECK-P9-NEXT: .LBB0_5: # %cleanup16 -; CHECK-P9-NEXT: mr r3, r5 +; CHECK-P9-NEXT: .LBB0_6: +; CHECK-P9-NEXT: li r4, 1 +; CHECK-P9-NEXT: mr r3, r4 ; CHECK-P9-NEXT: blr -; CHECK-P9-NEXT: .LBB0_6: # %lor.lhs.false.1 +; CHECK-P9-NEXT: .LBB0_7: # %lor.lhs.false.1 entry: br i1 undef, label %land.lhs.true, label %lor.lhs.false diff --git a/llvm/test/CodeGen/RISCV/branch.ll b/llvm/test/CodeGen/RISCV/branch.ll index e834499280328d..562b0fe5cf07c5 100644 --- a/llvm/test/CodeGen/RISCV/branch.ll +++ b/llvm/test/CodeGen/RISCV/branch.ll @@ -6,41 +6,42 @@ define void @foo(i32 %a, i32 *%b, i1 %c) nounwind { ; RV32I-LABEL: foo: ; RV32I: # %bb.0: ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: beq a3, a0, .LBB0_12 +; RV32I-NEXT: beq a3, a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %test2 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: bne a3, a0, .LBB0_12 -; RV32I-NEXT: # %bb.2: # %test3 +; RV32I-NEXT: beq a3, a0, .LBB0_3 +; RV32I-NEXT: .LBB0_2: # %end +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB0_3: # %test3 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: blt a3, a0, .LBB0_12 -; RV32I-NEXT: # %bb.3: # %test4 +; RV32I-NEXT: blt a3, a0, .LBB0_2 +; RV32I-NEXT: # %bb.4: # %test4 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: bge a3, a0, .LBB0_12 -; RV32I-NEXT: # %bb.4: # %test5 +; RV32I-NEXT: bge a3, a0, .LBB0_2 +; RV32I-NEXT: # %bb.5: # %test5 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: bltu a3, a0, .LBB0_12 -; RV32I-NEXT: # %bb.5: # %test6 +; RV32I-NEXT: bltu a3, a0, .LBB0_2 +; RV32I-NEXT: # %bb.6: # %test6 ; RV32I-NEXT: lw a3, 0(a1) -; 
RV32I-NEXT: bgeu a3, a0, .LBB0_12 -; RV32I-NEXT: # %bb.6: # %test7 +; RV32I-NEXT: bgeu a3, a0, .LBB0_2 +; RV32I-NEXT: # %bb.7: # %test7 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: blt a0, a3, .LBB0_12 -; RV32I-NEXT: # %bb.7: # %test8 +; RV32I-NEXT: blt a0, a3, .LBB0_2 +; RV32I-NEXT: # %bb.8: # %test8 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: bge a0, a3, .LBB0_12 -; RV32I-NEXT: # %bb.8: # %test9 +; RV32I-NEXT: bge a0, a3, .LBB0_2 +; RV32I-NEXT: # %bb.9: # %test9 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: bltu a0, a3, .LBB0_12 -; RV32I-NEXT: # %bb.9: # %test10 +; RV32I-NEXT: bltu a0, a3, .LBB0_2 +; RV32I-NEXT: # %bb.10: # %test10 ; RV32I-NEXT: lw a3, 0(a1) -; RV32I-NEXT: bgeu a0, a3, .LBB0_12 -; RV32I-NEXT: # %bb.10: # %test11 +; RV32I-NEXT: bgeu a0, a3, .LBB0_2 +; RV32I-NEXT: # %bb.11: # %test11 ; RV32I-NEXT: lw a0, 0(a1) ; RV32I-NEXT: andi a0, a2, 1 -; RV32I-NEXT: bnez a0, .LBB0_12 -; RV32I-NEXT: # %bb.11: # %test12 +; RV32I-NEXT: bnez a0, .LBB0_2 +; RV32I-NEXT: # %bb.12: # %test12 ; RV32I-NEXT: lw a0, 0(a1) -; RV32I-NEXT: .LBB0_12: # %end ; RV32I-NEXT: ret %val1 = load volatile i32, i32* %b %tst1 = icmp eq i32 %val1, %a diff --git a/llvm/test/CodeGen/RISCV/rv64m-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64m-w-insts-legalization.ll index 8af41d2608449d..5635c9ee687244 100644 --- a/llvm/test/CodeGen/RISCV/rv64m-w-insts-legalization.ll +++ b/llvm/test/CodeGen/RISCV/rv64m-w-insts-legalization.ll @@ -5,15 +5,13 @@ define signext i32 @mulw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-LABEL: mulw: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: addi a2, zero, 1 -; CHECK-NEXT: bge a0, a1, .LBB0_3 -; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: addi a2, zero, 1 -; CHECK-NEXT: .LBB0_2: # %for.body +; CHECK-NEXT: bge a0, a1, .LBB0_2 +; CHECK-NEXT: .LBB0_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mulw a2, a0, a2 ; CHECK-NEXT: addiw a0, a0, 1 -; CHECK-NEXT: blt a0, a1, .LBB0_2 -; CHECK-NEXT: .LBB0_3: # 
%for.cond.cleanup +; CHECK-NEXT: blt a0, a1, .LBB0_1 +; CHECK-NEXT: .LBB0_2: # %for.cond.cleanup ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-37.ll b/llvm/test/CodeGen/SystemZ/int-cmp-37.ll index 28cabf7f5bb713..6126000a897843 100644 --- a/llvm/test/CodeGen/SystemZ/int-cmp-37.ll +++ b/llvm/test/CodeGen/SystemZ/int-cmp-37.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; Test 32-bit comparisons in which the second operand is zero-extended ; from a PC-relative i16. ; @@ -9,9 +10,16 @@ ; Check unsigned comparison. define i32 @f1(i32 %src1) { ; CHECK-LABEL: f1: -; CHECK: clhrl %r2, g -; CHECK-NEXT: jl -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clhrl %r2, g +; CHECK-NEXT: jhe .LBB0_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB0_2: # %mulb +; CHECK-NEXT: msr %r2, %r2 +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i32 @@ -29,8 +37,16 @@ exit: ; Check signed comparison. define i32 @f2(i32 %src1) { ; CHECK-LABEL: f2: -; CHECK-NOT: clhrl -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: llhrl %r0, g +; CHECK-NEXT: crjhe %r2, %r0, .LBB1_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB1_2: # %mulb +; CHECK-NEXT: msr %r2, %r2 +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i32 @@ -48,9 +64,14 @@ exit: ; Check equality. 
define i32 @f3(i32 %src1) { ; CHECK-LABEL: f3: -; CHECK: clhrl %r2, g -; CHECK-NEXT: je -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clhrl %r2, g +; CHECK-NEXT: je .LBB2_2 +; CHECK-NEXT: # %bb.1: # %mulb +; CHECK-NEXT: msr %r2, %r2 +; CHECK-NEXT: .LBB2_2: # %exit +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i32 @@ -68,9 +89,16 @@ exit: ; Check inequality. define i32 @f4(i32 %src1) { ; CHECK-LABEL: f4: -; CHECK: clhrl %r2, g -; CHECK-NEXT: jlh -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clhrl %r2, g +; CHECK-NEXT: je .LBB3_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB3_2: # %mulb +; CHECK-NEXT: msr %r2, %r2 +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i32 @@ -88,10 +116,17 @@ exit: ; Repeat f1 with an unaligned address. define i32 @f5(i32 %src1) { ; CHECK-LABEL: f5: -; CHECK: lgrl [[REG:%r[0-5]]], h@GOT -; CHECK: llh [[VAL:%r[0-5]]], 0([[REG]]) -; CHECK: clrjl %r2, [[VAL]], -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, h@GOT +; CHECK-NEXT: llh %r0, 0(%r1) +; CHECK-NEXT: clrjhe %r2, %r0, .LBB4_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB4_2: # %mulb +; CHECK-NEXT: msr %r2, %r2 +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@h, align 1 %src2 = zext i16 %val to i32 @@ -109,9 +144,16 @@ exit: ; Check the comparison can be reversed if that allows CLHRL to be used. 
define i32 @f6(i32 %src2) { ; CHECK-LABEL: f6: -; CHECK: clhrl %r2, g -; CHECK-NEXT: jh {{\.L.*}} -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clhrl %r2, g +; CHECK-NEXT: jle .LBB5_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB5_2: # %mulb +; CHECK-NEXT: msr %r2, %r2 +; CHECK-NEXT: ahi %r2, 1 +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src1 = zext i16 %val to i32 diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-40.ll b/llvm/test/CodeGen/SystemZ/int-cmp-40.ll index 09bf903be7b234..f14f48e1d3d03f 100644 --- a/llvm/test/CodeGen/SystemZ/int-cmp-40.ll +++ b/llvm/test/CodeGen/SystemZ/int-cmp-40.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; Test 64-bit comparisons in which the second operand is zero-extended ; from a PC-relative i16. ; @@ -9,9 +10,16 @@ ; Check unsigned comparison. define i64 @f1(i64 %src1) { ; CHECK-LABEL: f1: -; CHECK: clghrl %r2, g -; CHECK-NEXT: jl -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clghrl %r2, g +; CHECK-NEXT: jhe .LBB0_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB0_2: # %mulb +; CHECK-NEXT: msgr %r2, %r2 +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i64 @@ -29,8 +37,16 @@ exit: ; Check signed comparison. define i64 @f2(i64 %src1) { ; CHECK-LABEL: f2: -; CHECK-NOT: clghrl -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: llghrl %r0, g +; CHECK-NEXT: cgrjhe %r2, %r0, .LBB1_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB1_2: # %mulb +; CHECK-NEXT: msgr %r2, %r2 +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i64 @@ -48,9 +64,14 @@ exit: ; Check equality. 
define i64 @f3(i64 %src1) { ; CHECK-LABEL: f3: -; CHECK: clghrl %r2, g -; CHECK-NEXT: je -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clghrl %r2, g +; CHECK-NEXT: je .LBB2_2 +; CHECK-NEXT: # %bb.1: # %mulb +; CHECK-NEXT: msgr %r2, %r2 +; CHECK-NEXT: .LBB2_2: # %exit +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i64 @@ -68,9 +89,16 @@ exit: ; Check inequality. define i64 @f4(i64 %src1) { ; CHECK-LABEL: f4: -; CHECK: clghrl %r2, g -; CHECK-NEXT: jlh -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clghrl %r2, g +; CHECK-NEXT: je .LBB3_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB3_2: # %mulb +; CHECK-NEXT: msgr %r2, %r2 +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src2 = zext i16 %val to i64 @@ -88,10 +116,17 @@ exit: ; Repeat f1 with an unaligned address. define i64 @f5(i64 %src1) { ; CHECK-LABEL: f5: -; CHECK: lgrl [[REG:%r[0-5]]], h@GOT -; CHECK: llgh [[VAL:%r[0-5]]], 0([[REG]]) -; CHECK: clgrjl %r2, [[VAL]], -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, h@GOT +; CHECK-NEXT: llgh %r0, 0(%r1) +; CHECK-NEXT: clgrjhe %r2, %r0, .LBB4_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB4_2: # %mulb +; CHECK-NEXT: msgr %r2, %r2 +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@h, align 1 %src2 = zext i16 %val to i64 @@ -109,9 +144,16 @@ exit: ; Check the comparison can be reversed if that allows CLGHRL to be used. 
define i64 @f6(i64 %src2) { ; CHECK-LABEL: f6: -; CHECK: clghrl %r2, g -; CHECK-NEXT: jh {{\.L.*}} -; CHECK: br %r14 +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: clghrl %r2, g +; CHECK-NEXT: jle .LBB5_2 +; CHECK-NEXT: # %bb.1: # %exit +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .LBB5_2: # %mulb +; CHECK-NEXT: msgr %r2, %r2 +; CHECK-NEXT: la %r2, 1(%r2) +; CHECK-NEXT: br %r14 entry: %val = load i16, i16 *@g %src1 = zext i16 %val to i64 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index 261222f60f17af..9ad5cdf60ce343 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -903,8 +903,7 @@ define arm_aapcs_vfpcc void @float_int_int_mul(i32* nocapture readonly %a, i32* ; CHECK-NEXT: le lr, .LBB4_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB4_8 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11 ; CHECK-NEXT: sub.w lr, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 32a1c17dbbff3b..0bd7ac870974d6 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -17,7 +17,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input, i16* nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} ; ENABLED-NEXT: sub sp, #4 ; ENABLED-NEXT: cmp r3, #1 ; ENABLED-NEXT: str 
r0, [sp] @ 4-byte Spill @@ -54,7 +54,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: subs r4, r2, r6 ; ENABLED-NEXT: vmov.i32 q0, #0x0 -; ENABLED-NEXT: add.w r8, r7, r0, lsr #2 +; ENABLED-NEXT: add.w r11, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dlstp.32 lr, r4 ; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload @@ -63,9 +63,9 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 ; ENABLED-NEXT: vldrh.s32 q1, [r0], #8 ; ENABLED-NEXT: vldrh.s32 q2, [r7], #8 -; ENABLED-NEXT: mov lr, r8 +; ENABLED-NEXT: mov lr, r11 ; ENABLED-NEXT: vmul.i32 q1, q2, q1 -; ENABLED-NEXT: sub.w r8, r8, #1 +; ENABLED-NEXT: sub.w r11, r11, #1 ; ENABLED-NEXT: vshl.s32 q1, r5 ; ENABLED-NEXT: vadd.i32 q0, q1, q0 ; ENABLED-NEXT: letp lr, .LBB0_6 @@ -75,11 +75,11 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: @ %for.end17 ; ENABLED-NEXT: add sp, #4 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc} ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} ; NOREDUCTIONS-NEXT: sub sp, #4 ; NOREDUCTIONS-NEXT: cmp r3, #1 ; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill @@ -116,7 +116,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: subs r4, r2, r6 ; NOREDUCTIONS-NEXT: vmov.i32 q0, #0x0 -; NOREDUCTIONS-NEXT: add.w r8, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: add.w r11, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dlstp.32 lr, r4 ; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload @@ -125,9 +125,9 @@ define dso_local 
void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 ; NOREDUCTIONS-NEXT: vldrh.s32 q1, [r0], #8 ; NOREDUCTIONS-NEXT: vldrh.s32 q2, [r7], #8 -; NOREDUCTIONS-NEXT: mov lr, r8 +; NOREDUCTIONS-NEXT: mov lr, r11 ; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 -; NOREDUCTIONS-NEXT: sub.w r8, r8, #1 +; NOREDUCTIONS-NEXT: sub.w r11, r11, #1 ; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 ; NOREDUCTIONS-NEXT: vadd.i32 q0, q1, q0 ; NOREDUCTIONS-NEXT: letp lr, .LBB0_6 @@ -137,8 +137,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 ; NOREDUCTIONS-NEXT: add sp, #4 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} -; +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, pc} entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll index 2db5bf59ecfaeb..a3b88cfc23130d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -302,8 +302,7 @@ define void @fma8(float* noalias nocapture readonly %A, float* noalias nocapture ; CHECK-NEXT: le lr, .LBB2_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r12, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: .LBB2_6: @ %for.body.preheader12 ; CHECK-NEXT: sub.w lr, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll index 646124e0cf9836..c99bcf7dccbb2b 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -36,8 +36,7 @@ define void @fma(float* noalias nocapture readonly %A, float* noalias nocapture ; CHECK-NEXT: le lr, .LBB0_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block 
; CHECK-NEXT: cmp r12, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB0_8 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader12 ; CHECK-NEXT: sub.w lr, r3, r12 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2 @@ -220,7 +219,7 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu ; CHECK-NEXT: ldr r3, [sp, #64] ; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #56] +; CHECK-NEXT: ldr r4, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -229,56 +228,56 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r3, #7 -; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: lsr.w r9, r0, #3 ; CHECK-NEXT: b .LBB2_5 ; CHECK-NEXT: .LBB2_3: @ in Loop: Header=BB2_5 Depth=1 -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: .LBB2_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #72] -; CHECK-NEXT: add.w r1, r10, r8 -; CHECK-NEXT: add r1, r6 -; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r11] +; CHECK-NEXT: ldr r1, [sp, #72] +; CHECK-NEXT: add.w r0, r12, r8 +; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: add r0, r10 +; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 ; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: beq .LBB2_8 ; CHECK-NEXT: .LBB2_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_7 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: subs.w lr, r0, r0 -; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] +; CHECK-NEXT: ldr r0, [sp, #68] +; CHECK-NEXT: subs.w lr, r9, r9 +; CHECK-NEXT: ldr.w r10, [r0, r11, lsl #2] ; CHECK-NEXT: ble .LBB2_3 ; CHECK-NEXT: @ 
%bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB2_5 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #64] +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mla r7, r11, r3, r1 +; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mla r7, r11, r1, r0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: ldrd r5, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r10 ; CHECK-NEXT: .LBB2_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r9 +; CHECK-NEXT: vldrb.s16 q0, [r5], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r7], #8 -; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 -; CHECK-NEXT: vmlava.s16 r6, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r3], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: le lr, .LBB2_7 ; CHECK-NEXT: b .LBB2_4 @@ -403,7 +402,7 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon ; CHECK-NEXT: ldr r3, [sp, #64] ; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldr.w r9, [sp, #56] +; 
CHECK-NEXT: ldr r4, [sp, #56] ; CHECK-NEXT: add.w r0, r1, r3, lsl #1 ; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: adds r0, r1, r3 @@ -412,55 +411,55 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon ; CHECK-NEXT: add r0, r1 ; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: adds r0, r3, #7 -; CHECK-NEXT: lsrs r0, r0, #3 +; CHECK-NEXT: lsr.w r9, r0, #3 ; CHECK-NEXT: .LBB3_3: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_5 Depth 2 -; CHECK-NEXT: ldr r1, [sp, #68] -; CHECK-NEXT: subs.w lr, r0, r0 -; CHECK-NEXT: ldr.w r12, [r1, r11, lsl #2] +; CHECK-NEXT: ldr r0, [sp, #68] +; CHECK-NEXT: subs.w lr, r9, r9 +; CHECK-NEXT: ldr.w r10, [r0, r11, lsl #2] ; CHECK-NEXT: ble .LBB3_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #64] -; CHECK-NEXT: mov r6, r12 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #64] +; CHECK-NEXT: mov r6, r10 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mla r7, r11, r3, r1 +; CHECK-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mla r7, r11, r1, r0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: ldrd r5, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r8, r10 ; CHECK-NEXT: .LBB3_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrb.s16 q0, [r4], #8 -; CHECK-NEXT: vadd.i16 q1, q0, r9 +; CHECK-NEXT: vldrb.s16 q0, [r5], #8 +; CHECK-NEXT: vadd.i16 q1, q0, r4 ; CHECK-NEXT: vldrb.s16 q0, [r7], #8 -; CHECK-NEXT: vmlava.s16 r12, q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r5], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 -; CHECK-NEXT: vmlava.s16 r6, 
q0, q1 -; CHECK-NEXT: vldrb.s16 q1, [r3], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 ; CHECK-NEXT: vmlava.s16 r10, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r3], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r6, q0, q1 +; CHECK-NEXT: vldrb.s16 q1, [r0], #8 +; CHECK-NEXT: vadd.i16 q1, q1, r4 +; CHECK-NEXT: vmlava.s16 r12, q0, q1 ; CHECK-NEXT: vldrb.s16 q1, [r1], #8 -; CHECK-NEXT: vadd.i16 q1, q1, r9 +; CHECK-NEXT: vadd.i16 q1, q1, r4 ; CHECK-NEXT: vmlava.s16 r8, q0, q1 ; CHECK-NEXT: le lr, .LBB3_5 ; CHECK-NEXT: b .LBB3_7 ; CHECK-NEXT: .LBB3_6: @ in Loop: Header=BB3_3 Depth=1 -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: mov r10, r12 -; CHECK-NEXT: mov r6, r12 +; CHECK-NEXT: mov r8, r10 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: mov r6, r10 ; CHECK-NEXT: .LBB3_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB3_3 Depth=1 -; CHECK-NEXT: ldr r3, [sp, #72] -; CHECK-NEXT: add.w r1, r10, r8 -; CHECK-NEXT: add r1, r6 -; CHECK-NEXT: add r1, r12 -; CHECK-NEXT: strb.w r1, [r3, r11] +; CHECK-NEXT: ldr r1, [sp, #72] +; CHECK-NEXT: add.w r0, r12, r8 +; CHECK-NEXT: add r0, r6 +; CHECK-NEXT: add r0, r10 +; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 ; CHECK-NEXT: cmp r11, r2 ; CHECK-NEXT: bne .LBB3_3 @@ -737,8 +736,8 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mla r5, r11, r3, r0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r7, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r4, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 @@ -747,10 +746,10 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16 ; CHECK-NEXT: vadd.i16 q2, q1, q0 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 ; CHECK-NEXT: vmlava.s16 r12, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, 
[r0], #8 +; CHECK-NEXT: vldrb.s16 q2, [r7], #8 ; CHECK-NEXT: vadd.i16 q2, q2, q0 ; CHECK-NEXT: vmlava.s16 r6, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r7], #8 +; CHECK-NEXT: vldrb.s16 q2, [r0], #8 ; CHECK-NEXT: vadd.i16 q2, q2, q0 ; CHECK-NEXT: vmlava.s16 r8, q1, q2 ; CHECK-NEXT: vldrb.s16 q2, [r1], #8 @@ -908,8 +907,8 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_ ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 ; CHECK-NEXT: mla r5, r11, r3, r0 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: ldrd r4, r7, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldrd r4, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 @@ -918,10 +917,10 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_ ; CHECK-NEXT: vadd.i16 q2, q1, q0 ; CHECK-NEXT: vldrb.s16 q1, [r5], #8 ; CHECK-NEXT: vmlava.s16 r12, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r0], #8 +; CHECK-NEXT: vldrb.s16 q2, [r7], #8 ; CHECK-NEXT: vadd.i16 q2, q2, q0 ; CHECK-NEXT: vmlava.s16 r6, q1, q2 -; CHECK-NEXT: vldrb.s16 q2, [r7], #8 +; CHECK-NEXT: vldrb.s16 q2, [r0], #8 ; CHECK-NEXT: vadd.i16 q2, q2, q0 ; CHECK-NEXT: vmlava.s16 r8, q1, q2 ; CHECK-NEXT: vldrb.s16 q2, [r1], #8 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index 9897b607d6b3a5..3ec0e464427688 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -20,50 +20,50 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB0_6 ; CHECK-NEXT: .LBB0_3: @ %vector.ph ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bic r3, r3, 
#1 ; CHECK-NEXT: subs r7, r3, #2 -; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r6, r1, r3, lsl #2 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: mvn r10, #-2147483648 ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 -; CHECK-NEXT: mov.w r10, #-1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r5, [r0] +; CHECK-NEXT: ldrd r4, r8, [r0] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: ldrd r7, r6, [r1] +; CHECK-NEXT: ldrd r7, r5, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: smull r8, r5, r6, r5 +; CHECK-NEXT: smull r8, r5, r5, r8 ; CHECK-NEXT: smull r4, r7, r7, r4 ; CHECK-NEXT: asrl r8, r5, #31 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 +; CHECK-NEXT: rsbs.w r9, r4, #-2147483648 ; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: sbcs.w r3, r10, r7 -; CHECK-NEXT: vmov.32 q4[1], r7 +; CHECK-NEXT: mov.w r9, #-1 +; CHECK-NEXT: sbcs.w r3, r9, r7 ; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: vmov.32 q4[2], r8 +; CHECK-NEXT: vmov.32 q4[1], r7 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q4[3], r5 +; CHECK-NEXT: vmov.32 q4[2], r8 ; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmov.32 q4[3], r5 ; CHECK-NEXT: vmov.32 q2[1], r3 ; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 -; CHECK-NEXT: sbcs.w r3, r10, r5 -; CHECK-NEXT: mvn r5, #-2147483648 +; CHECK-NEXT: sbcs.w r3, r9, r5 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 @@ -76,7 +76,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vorr q2, q2, q3 ; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: subs r4, r4, 
r5 +; CHECK-NEXT: subs.w r4, r4, r10 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: mov.w r3, #0 @@ -87,7 +87,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vmov.32 q3[0], r3 ; CHECK-NEXT: vmov.32 q3[1], r3 ; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: subs r4, r4, r5 +; CHECK-NEXT: subs.w r4, r4, r10 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt @@ -116,7 +116,7 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: .LBB0_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r12], #4 -; CHECK-NEXT: ldr r4, [r9], #4 +; CHECK-NEXT: ldr r4, [r6], #4 ; CHECK-NEXT: smull r4, r3, r4, r3 ; CHECK-NEXT: asrl r4, r3, #31 ; CHECK-NEXT: subs r5, r1, r4 @@ -908,36 +908,41 @@ for.body: ; preds = %for.body.preheader, define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* nocapture readonly %pSrcB, i32* noalias nocapture %pDst, i32 %N) { ; CHECK-LABEL: usatmul_4_q31: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB4_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB4_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r9, r1 -; CHECK-NEXT: mov r10, r2 +; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r9, r2 ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: @ %vector.ph -; CHECK-NEXT: bic r8, r3, #3 +; CHECK-NEXT: bic r11, r3, #3 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: sub.w r7, r8, #4 -; CHECK-NEXT: add.w 
r10, r2, r8, lsl #2 -; CHECK-NEXT: add.w r9, r1, r8, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: add.w r12, r0, r8, lsl #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: sub.w r7, r11, #4 +; CHECK-NEXT: add.w r9, r2, r11, lsl #2 +; CHECK-NEXT: add.w r8, r1, r11, lsl #2 +; CHECK-NEXT: add.w r7, r6, r7, lsr #2 +; CHECK-NEXT: add.w r12, r0, r11, lsl #2 +; CHECK-NEXT: mov r10, r7 ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: mov lr, r10 ; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: sub.w lr, lr, #1 ; CHECK-NEXT: vmov.f32 s12, s10 +; CHECK-NEXT: mov r10, lr ; CHECK-NEXT: vmov.f32 s6, s3 ; CHECK-NEXT: vmov.f32 s14, s11 ; CHECK-NEXT: vmullb.u32 q4, q3, q1 @@ -1002,28 +1007,31 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vmov.f32 s2, s4 ; CHECK-NEXT: vmov.f32 s3, s6 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: le lr, .LBB4_4 -; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r8, r3 +; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: bne.w .LBB4_4 +; CHECK-NEXT: b .LBB4_5 +; CHECK-NEXT: .LBB4_5: @ %middle.block +; CHECK-NEXT: cmp r11, r3 ; CHECK-NEXT: beq .LBB4_8 ; CHECK-NEXT: .LBB4_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r8 +; CHECK-NEXT: sub.w lr, r3, r11 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r0, [r12], #4 -; CHECK-NEXT: ldr r1, [r9], #4 +; CHECK-NEXT: ldr r1, [r8], #4 ; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: lsrl r0, r1, #31 ; CHECK-NEXT: subs.w r2, r0, #-1 ; CHECK-NEXT: sbcs r1, r1, #0 ; CHECK-NEXT: it hs ; CHECK-NEXT: movhs.w r0, #-1 -; CHECK-NEXT: str r0, [r10], #4 +; CHECK-NEXT: str r0, [r9], #4 ; CHECK-NEXT: le lr, .LBB4_7 ; CHECK-NEXT: .LBB4_8: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; 
CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader @@ -1136,8 +1144,7 @@ define arm_aapcs_vfpcc void @ssatmul_4_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: le lr, .LBB5_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader21 ; CHECK-NEXT: movw r0, #32768 ; CHECK-NEXT: sub.w lr, r3, r5 @@ -1277,8 +1284,7 @@ define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: le lr, .LBB6_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader21 ; CHECK-NEXT: movw r0, #32768 ; CHECK-NEXT: sub.w lr, r3, r5 @@ -1415,8 +1421,7 @@ define arm_aapcs_vfpcc void @ssatmul_8i_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: le lr, .LBB7_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB7_8 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21 ; CHECK-NEXT: movw r0, #32768 ; CHECK-NEXT: sub.w lr, r3, r5 @@ -1959,8 +1964,7 @@ define arm_aapcs_vfpcc void @usatmul_4_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: le lr, .LBB11_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB11_8 ; CHECK-NEXT: .LBB11_6: @ %for.body.preheader21 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: movw r0, #65535 @@ -2093,8 +2097,7 @@ define arm_aapcs_vfpcc void @usatmul_8_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: le lr, .LBB12_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; 
CHECK-NEXT: beq .LBB12_8 ; CHECK-NEXT: .LBB12_6: @ %for.body.preheader21 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: movw r0, #65535 @@ -2229,8 +2232,7 @@ define arm_aapcs_vfpcc void @ssatmul_4_q7(i8* nocapture readonly %pSrcA, i8* noc ; CHECK-NEXT: le lr, .LBB13_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB13_8 ; CHECK-NEXT: .LBB13_6: @ %for.body.preheader21 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: dls lr, lr @@ -2364,8 +2366,7 @@ define arm_aapcs_vfpcc void @ssatmul_8_q7(i8* nocapture readonly %pSrcA, i8* noc ; CHECK-NEXT: le lr, .LBB14_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB14_8 ; CHECK-NEXT: .LBB14_6: @ %for.body.preheader23 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: mvn r0, #127 @@ -2504,8 +2505,7 @@ define arm_aapcs_vfpcc void @ssatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no ; CHECK-NEXT: le lr, .LBB15_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB15_8 ; CHECK-NEXT: .LBB15_6: @ %for.body.preheader23 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: mvn r0, #127 @@ -2641,8 +2641,7 @@ define arm_aapcs_vfpcc void @ssatmul_16i_q7(i8* nocapture readonly %pSrcA, i8* n ; CHECK-NEXT: le lr, .LBB16_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB16_8 ; CHECK-NEXT: .LBB16_6: @ %for.body.preheader23 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: mvn r0, #127 @@ -3422,8 +3421,7 @@ define arm_aapcs_vfpcc void @usatmul_8_q7(i8* nocapture readonly %pSrcA, i8* noc ; CHECK-NEXT: le lr, .LBB20_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB20_8 ; CHECK-NEXT: .LBB20_6: 
@ %for.body.preheader23 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: dls lr, lr @@ -3557,8 +3555,7 @@ define arm_aapcs_vfpcc void @usatmul_16_q7(i8* nocapture readonly %pSrcA, i8* no ; CHECK-NEXT: le lr, .LBB21_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: cmp r5, r3 -; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r4, r5, r6, pc} +; CHECK-NEXT: beq .LBB21_8 ; CHECK-NEXT: .LBB21_6: @ %for.body.preheader23 ; CHECK-NEXT: sub.w lr, r3, r5 ; CHECK-NEXT: dls lr, lr diff --git a/llvm/test/CodeGen/Thumb2/thumb2-branch.ll b/llvm/test/CodeGen/Thumb2/thumb2-branch.ll index 332ed50ede6f2a..e52bab2b11056e 100644 --- a/llvm/test/CodeGen/Thumb2/thumb2-branch.ll +++ b/llvm/test/CodeGen/Thumb2/thumb2-branch.ll @@ -8,7 +8,7 @@ declare void @foo() define i32 @f1(i32 %a, i32 %b, i32* %v) { entry: ; CHECK-LABEL: f1: -; CHECK: bne LBB +; CHECK: beq LBB %tmp = icmp eq i32 %a, %b ; [#uses=1] br i1 %tmp, label %cond_true, label %return @@ -59,7 +59,7 @@ return: ; preds = %entry define i32 @f4(i32 %a, i32 %b, i32* %v) { entry: ; CHECK-LABEL: f4: -; CHECK: blo LBB +; CHECK: bhs LBB %tmp = icmp uge i32 %a, %b ; [#uses=1] br i1 %tmp, label %cond_true, label %return diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll index 882150a6e39ab9..e8171e2767df12 100644 --- a/llvm/test/CodeGen/X86/3addr-16bit.ll +++ b/llvm/test/CodeGen/X86/3addr-16bit.ll @@ -12,8 +12,11 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-NEXT: movl %esi, %eax ; X64-NEXT: incl %eax ; X64-NEXT: cmpw %di, %si -; X64-NEXT: jne LBB0_2 -; X64-NEXT: ## %bb.1: ## %bb +; X64-NEXT: je LBB0_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB0_1: ## %bb ; X64-NEXT: pushq %rbx ; X64-NEXT: movzwl %ax, %ebx ; X64-NEXT: movl %ebx, %edi @@ -21,9 +24,6 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq -; X64-NEXT: LBB0_2: ## %bb1 -; 
X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq ; ; X32-LABEL: test1: ; X32: ## %bb.0: ## %entry @@ -33,15 +33,15 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: incl %eax ; X32-NEXT: cmpw {{[0-9]+}}(%esp), %cx -; X32-NEXT: jne LBB0_2 -; X32-NEXT: ## %bb.1: ## %bb +; X32-NEXT: je LBB0_1 +; X32-NEXT: ## %bb.2: ## %bb1 +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: jmp LBB0_3 +; X32-NEXT: LBB0_1: ## %bb ; X32-NEXT: movzwl %ax, %esi ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _foo ; X32-NEXT: movl %esi, %eax -; X32-NEXT: jmp LBB0_3 -; X32-NEXT: LBB0_2: ## %bb1 -; X32-NEXT: movzwl %ax, %eax ; X32-NEXT: LBB0_3: ## %bb1 ; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi @@ -65,8 +65,11 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-NEXT: movl %esi, %eax ; X64-NEXT: decl %eax ; X64-NEXT: cmpw %di, %si -; X64-NEXT: jne LBB1_2 -; X64-NEXT: ## %bb.1: ## %bb +; X64-NEXT: je LBB1_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB1_1: ## %bb ; X64-NEXT: pushq %rbx ; X64-NEXT: movzwl %ax, %ebx ; X64-NEXT: movl %ebx, %edi @@ -74,9 +77,6 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq -; X64-NEXT: LBB1_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq ; ; X32-LABEL: test2: ; X32: ## %bb.0: ## %entry @@ -86,15 +86,15 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: decl %eax ; X32-NEXT: cmpw {{[0-9]+}}(%esp), %cx -; X32-NEXT: jne LBB1_2 -; X32-NEXT: ## %bb.1: ## %bb +; X32-NEXT: je LBB1_1 +; X32-NEXT: ## %bb.2: ## %bb1 +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: jmp LBB1_3 +; X32-NEXT: LBB1_1: ## %bb ; X32-NEXT: movzwl %ax, %esi ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _foo ; X32-NEXT: movl %esi, %eax -; X32-NEXT: jmp LBB1_3 -; X32-NEXT: LBB1_2: ## %bb1 -; X32-NEXT: 
movzwl %ax, %eax ; X32-NEXT: LBB1_3: ## %bb1 ; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi @@ -120,8 +120,11 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-NEXT: movl %esi, %eax ; X64-NEXT: addl $2, %eax ; X64-NEXT: cmpw %di, %si -; X64-NEXT: jne LBB2_2 -; X64-NEXT: ## %bb.1: ## %bb +; X64-NEXT: je LBB2_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB2_1: ## %bb ; X64-NEXT: pushq %rbx ; X64-NEXT: movzwl %ax, %ebx ; X64-NEXT: movl %ebx, %edi @@ -129,9 +132,6 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq -; X64-NEXT: LBB2_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq ; ; X32-LABEL: test3: ; X32: ## %bb.0: ## %entry @@ -141,15 +141,15 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: addl $2, %eax ; X32-NEXT: cmpw {{[0-9]+}}(%esp), %cx -; X32-NEXT: jne LBB2_2 -; X32-NEXT: ## %bb.1: ## %bb +; X32-NEXT: je LBB2_1 +; X32-NEXT: ## %bb.2: ## %bb1 +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: jmp LBB2_3 +; X32-NEXT: LBB2_1: ## %bb ; X32-NEXT: movzwl %ax, %esi ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _foo ; X32-NEXT: movl %esi, %eax -; X32-NEXT: jmp LBB2_3 -; X32-NEXT: LBB2_2: ## %bb1 -; X32-NEXT: movzwl %ax, %eax ; X32-NEXT: LBB2_3: ## %bb1 ; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi @@ -173,8 +173,11 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X64-NEXT: movl %esi, %eax ; X64-NEXT: addl %edi, %eax ; X64-NEXT: cmpw %di, %si -; X64-NEXT: jne LBB3_2 -; X64-NEXT: ## %bb.1: ## %bb +; X64-NEXT: je LBB3_1 +; X64-NEXT: ## %bb.2: ## %bb1 +; X64-NEXT: movzwl %ax, %eax +; X64-NEXT: retq +; X64-NEXT: LBB3_1: ## %bb ; X64-NEXT: pushq %rbx ; X64-NEXT: movzwl %ax, %ebx ; X64-NEXT: movl %ebx, %edi @@ -182,9 +185,6 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; 
X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx ; X64-NEXT: retq -; X64-NEXT: LBB3_2: ## %bb1 -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: retq ; ; X32-LABEL: test4: ; X32: ## %bb.0: ## %entry @@ -195,15 +195,15 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp { ; X32-NEXT: movl %edx, %eax ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: cmpw %cx, %dx -; X32-NEXT: jne LBB3_2 -; X32-NEXT: ## %bb.1: ## %bb +; X32-NEXT: je LBB3_1 +; X32-NEXT: ## %bb.2: ## %bb1 +; X32-NEXT: movzwl %ax, %eax +; X32-NEXT: jmp LBB3_3 +; X32-NEXT: LBB3_1: ## %bb ; X32-NEXT: movzwl %ax, %esi ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _foo ; X32-NEXT: movl %esi, %eax -; X32-NEXT: jmp LBB3_3 -; X32-NEXT: LBB3_2: ## %bb1 -; X32-NEXT: movzwl %ax, %eax ; X32-NEXT: LBB3_3: ## %bb1 ; X32-NEXT: addl $8, %esp ; X32-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/absolute-cmp.ll b/llvm/test/CodeGen/X86/absolute-cmp.ll index b4f158aa7c91aa..b5fd426039f22e 100644 --- a/llvm/test/CodeGen/X86/absolute-cmp.ll +++ b/llvm/test/CodeGen/X86/absolute-cmp.ll @@ -17,12 +17,16 @@ define void @foo8(i64 %val) { ; NOPIC-NEXT: .cfi_def_cfa_offset 16 ; NOPIC-NEXT: cmpq $cmp8@ABS8, %rdi # encoding: [0x48,0x83,0xff,A] ; NOPIC-NEXT: # fixup A - offset: 3, value: cmp8@ABS8, kind: FK_Data_1 -; NOPIC-NEXT: ja .LBB0_2 # encoding: [0x77,A] -; NOPIC-NEXT: # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1 -; NOPIC-NEXT: # %bb.1: # %t +; NOPIC-NEXT: jbe .LBB0_1 # encoding: [0x76,A] +; NOPIC-NEXT: # fixup A - offset: 1, value: .LBB0_1-1, kind: FK_PCRel_1 +; NOPIC-NEXT: # %bb.2: # %f +; NOPIC-NEXT: popq %rax # encoding: [0x58] +; NOPIC-NEXT: .cfi_def_cfa_offset 8 +; NOPIC-NEXT: retq # encoding: [0xc3] +; NOPIC-NEXT: .LBB0_1: # %t +; NOPIC-NEXT: .cfi_def_cfa_offset 16 ; NOPIC-NEXT: callq f # encoding: [0xe8,A,A,A,A] ; NOPIC-NEXT: # fixup A - offset: 1, value: f-4, kind: reloc_branch_4byte_pcrel -; NOPIC-NEXT: .LBB0_2: # %f ; NOPIC-NEXT: popq %rax # encoding: [0x58] ; NOPIC-NEXT: .cfi_def_cfa_offset 8 ; 
NOPIC-NEXT: retq # encoding: [0xc3] @@ -33,12 +37,16 @@ define void @foo8(i64 %val) { ; PIC-NEXT: .cfi_def_cfa_offset 16 ; PIC-NEXT: cmpq $cmp8@ABS8, %rdi # encoding: [0x48,0x83,0xff,A] ; PIC-NEXT: # fixup A - offset: 3, value: cmp8@ABS8, kind: FK_Data_1 -; PIC-NEXT: ja .LBB0_2 # encoding: [0x77,A] -; PIC-NEXT: # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1 -; PIC-NEXT: # %bb.1: # %t +; PIC-NEXT: jbe .LBB0_1 # encoding: [0x76,A] +; PIC-NEXT: # fixup A - offset: 1, value: .LBB0_1-1, kind: FK_PCRel_1 +; PIC-NEXT: # %bb.2: # %f +; PIC-NEXT: popq %rax # encoding: [0x58] +; PIC-NEXT: .cfi_def_cfa_offset 8 +; PIC-NEXT: retq # encoding: [0xc3] +; PIC-NEXT: .LBB0_1: # %t +; PIC-NEXT: .cfi_def_cfa_offset 16 ; PIC-NEXT: callq f@PLT # encoding: [0xe8,A,A,A,A] ; PIC-NEXT: # fixup A - offset: 1, value: f@PLT-4, kind: FK_PCRel_4 -; PIC-NEXT: .LBB0_2: # %f ; PIC-NEXT: popq %rax # encoding: [0x58] ; PIC-NEXT: .cfi_def_cfa_offset 8 ; PIC-NEXT: retq # encoding: [0xc3] @@ -60,12 +68,16 @@ define void @foo32(i64 %val) { ; NOPIC-NEXT: .cfi_def_cfa_offset 16 ; NOPIC-NEXT: cmpq $cmp32, %rdi # encoding: [0x48,0x81,0xff,A,A,A,A] ; NOPIC-NEXT: # fixup A - offset: 3, value: cmp32, kind: reloc_signed_4byte -; NOPIC-NEXT: ja .LBB1_2 # encoding: [0x77,A] -; NOPIC-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; NOPIC-NEXT: # %bb.1: # %t +; NOPIC-NEXT: jbe .LBB1_1 # encoding: [0x76,A] +; NOPIC-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; NOPIC-NEXT: # %bb.2: # %f +; NOPIC-NEXT: popq %rax # encoding: [0x58] +; NOPIC-NEXT: .cfi_def_cfa_offset 8 +; NOPIC-NEXT: retq # encoding: [0xc3] +; NOPIC-NEXT: .LBB1_1: # %t +; NOPIC-NEXT: .cfi_def_cfa_offset 16 ; NOPIC-NEXT: callq f # encoding: [0xe8,A,A,A,A] ; NOPIC-NEXT: # fixup A - offset: 1, value: f-4, kind: reloc_branch_4byte_pcrel -; NOPIC-NEXT: .LBB1_2: # %f ; NOPIC-NEXT: popq %rax # encoding: [0x58] ; NOPIC-NEXT: .cfi_def_cfa_offset 8 ; NOPIC-NEXT: retq # encoding: [0xc3] @@ -76,12 +88,16 @@ define 
void @foo32(i64 %val) { ; PIC-NEXT: .cfi_def_cfa_offset 16 ; PIC-NEXT: cmpq $cmp32, %rdi # encoding: [0x48,0x81,0xff,A,A,A,A] ; PIC-NEXT: # fixup A - offset: 3, value: cmp32, kind: reloc_signed_4byte -; PIC-NEXT: ja .LBB1_2 # encoding: [0x77,A] -; PIC-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; PIC-NEXT: # %bb.1: # %t +; PIC-NEXT: jbe .LBB1_1 # encoding: [0x76,A] +; PIC-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; PIC-NEXT: # %bb.2: # %f +; PIC-NEXT: popq %rax # encoding: [0x58] +; PIC-NEXT: .cfi_def_cfa_offset 8 +; PIC-NEXT: retq # encoding: [0xc3] +; PIC-NEXT: .LBB1_1: # %t +; PIC-NEXT: .cfi_def_cfa_offset 16 ; PIC-NEXT: callq f@PLT # encoding: [0xe8,A,A,A,A] ; PIC-NEXT: # fixup A - offset: 1, value: f@PLT-4, kind: FK_PCRel_4 -; PIC-NEXT: .LBB1_2: # %f ; PIC-NEXT: popq %rax # encoding: [0x58] ; PIC-NEXT: .cfi_def_cfa_offset 8 ; PIC-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/atomic-flags.ll b/llvm/test/CodeGen/X86/atomic-flags.ll index e0c4a915965c8b..6565c107f7a974 100644 --- a/llvm/test/CodeGen/X86/atomic-flags.ll +++ b/llvm/test/CodeGen/X86/atomic-flags.ll @@ -1,20 +1,55 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,X86-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,X86-32 ; Make sure that flags are properly preserved despite atomic optimizations. define i32 @atomic_and_flags_1(i8* %p, i32 %a, i32 %b) { -; CHECK-LABEL: atomic_and_flags_1: - ; Generate flags value, and use it. 
- ; CHECK: cmpl - ; CHECK-NEXT: jne +; X86-64-LABEL: atomic_and_flags_1: +; X86-64: # %bb.0: +; X86-64-NEXT: cmpl %edx, %esi +; X86-64-NEXT: je .LBB0_1 +; X86-64-NEXT: # %bb.3: # %L2 +; X86-64-NEXT: movl $2, %eax +; X86-64-NEXT: retq +; X86-64-NEXT: .LBB0_1: # %L1 +; X86-64-NEXT: incb (%rdi) +; X86-64-NEXT: cmpl %edx, %esi +; X86-64-NEXT: je .LBB0_4 +; X86-64-NEXT: # %bb.2: # %L4 +; X86-64-NEXT: movl $4, %eax +; X86-64-NEXT: retq +; X86-64-NEXT: .LBB0_4: # %L3 +; X86-64-NEXT: movl $3, %eax +; X86-64-NEXT: retq +; +; X86-32-LABEL: atomic_and_flags_1: +; X86-32: # %bb.0: +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: cmpl %eax, %ecx +; X86-32-NEXT: je .LBB0_1 +; X86-32-NEXT: # %bb.3: # %L2 +; X86-32-NEXT: movl $2, %eax +; X86-32-NEXT: retl +; X86-32-NEXT: .LBB0_1: # %L1 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-32-NEXT: incb (%edx) +; X86-32-NEXT: cmpl %eax, %ecx +; X86-32-NEXT: je .LBB0_4 +; X86-32-NEXT: # %bb.2: # %L4 +; X86-32-NEXT: movl $4, %eax +; X86-32-NEXT: retl +; X86-32-NEXT: .LBB0_4: # %L3 +; X86-32-NEXT: movl $3, %eax +; X86-32-NEXT: retl %cmp = icmp eq i32 %a, %b br i1 %cmp, label %L1, label %L2 L1: ; The following pattern will get folded. - ; CHECK: incb + %1 = load atomic i8, i8* %p seq_cst, align 1 %2 = add i8 %1, 1 ; This forces the INC instruction to be generated. store atomic i8 %2, i8* %p release, align 1 @@ -23,8 +58,7 @@ L1: ; somehow. This test checks that cmpl gets emitted again, but any ; rematerialization would work (the optimizer used to clobber the flags with ; the add). - ; CHECK-NEXT: cmpl - ; CHECK-NEXT: jne + br i1 %cmp, label %L3, label %L4 L2: @@ -39,18 +73,51 @@ L4: ; Same as above, but using 2 as immediate to avoid the INC instruction. 
define i32 @atomic_and_flags_2(i8* %p, i32 %a, i32 %b) { -; CHECK-LABEL: atomic_and_flags_2: - ; CHECK: cmpl - ; CHECK-NEXT: jne +; X86-64-LABEL: atomic_and_flags_2: +; X86-64: # %bb.0: +; X86-64-NEXT: cmpl %edx, %esi +; X86-64-NEXT: je .LBB1_1 +; X86-64-NEXT: # %bb.3: # %L2 +; X86-64-NEXT: movl $2, %eax +; X86-64-NEXT: retq +; X86-64-NEXT: .LBB1_1: # %L1 +; X86-64-NEXT: addb $2, (%rdi) +; X86-64-NEXT: cmpl %edx, %esi +; X86-64-NEXT: je .LBB1_4 +; X86-64-NEXT: # %bb.2: # %L4 +; X86-64-NEXT: movl $4, %eax +; X86-64-NEXT: retq +; X86-64-NEXT: .LBB1_4: # %L3 +; X86-64-NEXT: movl $3, %eax +; X86-64-NEXT: retq +; +; X86-32-LABEL: atomic_and_flags_2: +; X86-32: # %bb.0: +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-32-NEXT: cmpl %eax, %ecx +; X86-32-NEXT: je .LBB1_1 +; X86-32-NEXT: # %bb.3: # %L2 +; X86-32-NEXT: movl $2, %eax +; X86-32-NEXT: retl +; X86-32-NEXT: .LBB1_1: # %L1 +; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-32-NEXT: addb $2, (%edx) +; X86-32-NEXT: cmpl %eax, %ecx +; X86-32-NEXT: je .LBB1_4 +; X86-32-NEXT: # %bb.2: # %L4 +; X86-32-NEXT: movl $4, %eax +; X86-32-NEXT: retl +; X86-32-NEXT: .LBB1_4: # %L3 +; X86-32-NEXT: movl $3, %eax +; X86-32-NEXT: retl %cmp = icmp eq i32 %a, %b br i1 %cmp, label %L1, label %L2 L1: - ; CHECK: addb %1 = load atomic i8, i8* %p seq_cst, align 1 %2 = add i8 %1, 2 store atomic i8 %2, i8* %p release, align 1 - ; CHECK-NEXT: cmpl - ; CHECK-NEXT: jne + br i1 %cmp, label %L3, label %L4 L2: ret i32 2 diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index 7a1f34c65c183d..3e5abc97b89a35 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -2716,24 +2716,24 @@ define i1 @fold_cmp_over_fence(i32* %p, i32 %v1) { ; CHECK-O3-CUR-NEXT: movl (%rdi), %eax ; CHECK-O3-CUR-NEXT: mfence ; CHECK-O3-CUR-NEXT: cmpl %eax, %esi -; CHECK-O3-CUR-NEXT: jne .LBB116_2 -; CHECK-O3-CUR-NEXT: # %bb.1: # 
%taken -; CHECK-O3-CUR-NEXT: movb $1, %al -; CHECK-O3-CUR-NEXT: retq -; CHECK-O3-CUR-NEXT: .LBB116_2: # %untaken +; CHECK-O3-CUR-NEXT: je .LBB116_1 +; CHECK-O3-CUR-NEXT: # %bb.2: # %untaken ; CHECK-O3-CUR-NEXT: xorl %eax, %eax ; CHECK-O3-CUR-NEXT: retq +; CHECK-O3-CUR-NEXT: .LBB116_1: # %taken +; CHECK-O3-CUR-NEXT: movb $1, %al +; CHECK-O3-CUR-NEXT: retq ; ; CHECK-O3-EX-LABEL: fold_cmp_over_fence: ; CHECK-O3-EX: # %bb.0: ; CHECK-O3-EX-NEXT: cmpl (%rdi), %esi ; CHECK-O3-EX-NEXT: mfence -; CHECK-O3-EX-NEXT: jne .LBB116_2 -; CHECK-O3-EX-NEXT: # %bb.1: # %taken -; CHECK-O3-EX-NEXT: movb $1, %al -; CHECK-O3-EX-NEXT: retq -; CHECK-O3-EX-NEXT: .LBB116_2: # %untaken +; CHECK-O3-EX-NEXT: je .LBB116_1 +; CHECK-O3-EX-NEXT: # %bb.2: # %untaken ; CHECK-O3-EX-NEXT: xorl %eax, %eax +; CHECK-O3-EX-NEXT: retq +; CHECK-O3-EX-NEXT: .LBB116_1: # %taken +; CHECK-O3-EX-NEXT: movb $1, %al ; CHECK-O3-EX-NEXT: retq %v2 = load atomic i32, i32* %p unordered, align 4 fence seq_cst diff --git a/llvm/test/CodeGen/X86/bmi.ll b/llvm/test/CodeGen/X86/bmi.ll index 9c3a74fccbe633..f8c7300186df20 100644 --- a/llvm/test/CodeGen/X86/bmi.ll +++ b/llvm/test/CodeGen/X86/bmi.ll @@ -1249,20 +1249,20 @@ define void @pr42118_i32(i32 %x) { ; X86-LABEL: pr42118_i32: ; X86: # %bb.0: ; X86-NEXT: blsrl {{[0-9]+}}(%esp), %eax -; X86-NEXT: jne .LBB50_1 -; X86-NEXT: # %bb.2: -; X86-NEXT: jmp bar # TAILCALL -; X86-NEXT: .LBB50_1: +; X86-NEXT: je .LBB50_2 +; X86-NEXT: # %bb.1: ; X86-NEXT: retl +; X86-NEXT: .LBB50_2: +; X86-NEXT: jmp bar # TAILCALL ; ; X64-LABEL: pr42118_i32: ; X64: # %bb.0: ; X64-NEXT: blsrl %edi, %eax -; X64-NEXT: jne .LBB50_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: jmp bar # TAILCALL -; X64-NEXT: .LBB50_1: +; X64-NEXT: je .LBB50_2 +; X64-NEXT: # %bb.1: ; X64-NEXT: retq +; X64-NEXT: .LBB50_2: +; X64-NEXT: jmp bar # TAILCALL %tmp = sub i32 0, %x %tmp1 = and i32 %tmp, %x %cmp = icmp eq i32 %tmp1, %x @@ -1289,25 +1289,25 @@ define void @pr42118_i64(i64 %x) { ; X86-NEXT: andl %eax, %edx ; X86-NEXT: andl %ecx, 
%esi ; X86-NEXT: orl %edx, %esi -; X86-NEXT: jne .LBB51_1 -; X86-NEXT: # %bb.2: +; X86-NEXT: je .LBB51_2 +; X86-NEXT: # %bb.1: ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 -; X86-NEXT: jmp bar # TAILCALL -; X86-NEXT: .LBB51_1: +; X86-NEXT: retl +; X86-NEXT: .LBB51_2: ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 -; X86-NEXT: retl +; X86-NEXT: jmp bar # TAILCALL ; ; X64-LABEL: pr42118_i64: ; X64: # %bb.0: ; X64-NEXT: blsrq %rdi, %rax -; X64-NEXT: jne .LBB51_1 -; X64-NEXT: # %bb.2: -; X64-NEXT: jmp bar # TAILCALL -; X64-NEXT: .LBB51_1: +; X64-NEXT: je .LBB51_2 +; X64-NEXT: # %bb.1: ; X64-NEXT: retq +; X64-NEXT: .LBB51_2: +; X64-NEXT: jmp bar # TAILCALL %tmp = sub i64 0, %x %tmp1 = and i64 %tmp, %x %cmp = icmp eq i64 %tmp1, %x diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll index 59df366c6e4a23..48fef1e8270ca7 100644 --- a/llvm/test/CodeGen/X86/bt.ll +++ b/llvm/test/CodeGen/X86/bt.ll @@ -611,21 +611,23 @@ define void @query3(i32 %x, i32 %n) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae .LBB16_2 -; X86-NEXT: # %bb.1: # %bb +; X86-NEXT: jb .LBB16_1 +; X86-NEXT: # %bb.2: # %UnifiedReturnBlock +; X86-NEXT: retl +; X86-NEXT: .LBB16_1: # %bb ; X86-NEXT: calll foo -; X86-NEXT: .LBB16_2: # %UnifiedReturnBlock ; X86-NEXT: retl ; ; X64-LABEL: query3: ; X64: # %bb.0: # %entry ; X64-NEXT: btl %esi, %edi -; X64-NEXT: jae .LBB16_2 -; X64-NEXT: # %bb.1: # %bb +; X64-NEXT: jb .LBB16_1 +; X64-NEXT: # %bb.2: # %UnifiedReturnBlock +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %bb ; X64-NEXT: pushq %rax ; X64-NEXT: callq foo ; X64-NEXT: popq %rax -; X64-NEXT: .LBB16_2: # %UnifiedReturnBlock ; X64-NEXT: retq entry: %tmp29 = shl i32 1, %n @@ -647,21 +649,23 @@ define void @query3b(i32 %x, i32 %n) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae 
.LBB17_2 -; X86-NEXT: # %bb.1: # %bb +; X86-NEXT: jb .LBB17_1 +; X86-NEXT: # %bb.2: # %UnifiedReturnBlock +; X86-NEXT: retl +; X86-NEXT: .LBB17_1: # %bb ; X86-NEXT: calll foo -; X86-NEXT: .LBB17_2: # %UnifiedReturnBlock ; X86-NEXT: retl ; ; X64-LABEL: query3b: ; X64: # %bb.0: # %entry ; X64-NEXT: btl %esi, %edi -; X64-NEXT: jae .LBB17_2 -; X64-NEXT: # %bb.1: # %bb +; X64-NEXT: jb .LBB17_1 +; X64-NEXT: # %bb.2: # %UnifiedReturnBlock +; X64-NEXT: retq +; X64-NEXT: .LBB17_1: # %bb ; X64-NEXT: pushq %rax ; X64-NEXT: callq foo ; X64-NEXT: popq %rax -; X64-NEXT: .LBB17_2: # %UnifiedReturnBlock ; X64-NEXT: retq entry: %tmp29 = shl i32 1, %n @@ -683,21 +687,23 @@ define void @query3x(i32 %x, i32 %n) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae .LBB18_2 -; X86-NEXT: # %bb.1: # %bb +; X86-NEXT: jb .LBB18_1 +; X86-NEXT: # %bb.2: # %UnifiedReturnBlock +; X86-NEXT: retl +; X86-NEXT: .LBB18_1: # %bb ; X86-NEXT: calll foo -; X86-NEXT: .LBB18_2: # %UnifiedReturnBlock ; X86-NEXT: retl ; ; X64-LABEL: query3x: ; X64: # %bb.0: # %entry ; X64-NEXT: btl %esi, %edi -; X64-NEXT: jae .LBB18_2 -; X64-NEXT: # %bb.1: # %bb +; X64-NEXT: jb .LBB18_1 +; X64-NEXT: # %bb.2: # %UnifiedReturnBlock +; X64-NEXT: retq +; X64-NEXT: .LBB18_1: # %bb ; X64-NEXT: pushq %rax ; X64-NEXT: callq foo ; X64-NEXT: popq %rax -; X64-NEXT: .LBB18_2: # %UnifiedReturnBlock ; X64-NEXT: retq entry: %tmp29 = shl i32 1, %n @@ -719,21 +725,23 @@ define void @query3bx(i32 %x, i32 %n) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: btl %ecx, %eax -; X86-NEXT: jae .LBB19_2 -; X86-NEXT: # %bb.1: # %bb +; X86-NEXT: jb .LBB19_1 +; X86-NEXT: # %bb.2: # %UnifiedReturnBlock +; X86-NEXT: retl +; X86-NEXT: .LBB19_1: # %bb ; X86-NEXT: calll foo -; X86-NEXT: .LBB19_2: # %UnifiedReturnBlock ; X86-NEXT: retl ; ; X64-LABEL: query3bx: ; X64: # %bb.0: # %entry ; X64-NEXT: btl %esi, %edi -; X64-NEXT: 
jae .LBB19_2 -; X64-NEXT: # %bb.1: # %bb +; X64-NEXT: jb .LBB19_1 +; X64-NEXT: # %bb.2: # %UnifiedReturnBlock +; X64-NEXT: retq +; X64-NEXT: .LBB19_1: # %bb ; X64-NEXT: pushq %rax ; X64-NEXT: callq foo ; X64-NEXT: popq %rax -; X64-NEXT: .LBB19_2: # %UnifiedReturnBlock ; X64-NEXT: retq entry: %tmp29 = shl i32 1, %n diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll index 65bd1dad21a822..d3cb6daecf9c9a 100644 --- a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll @@ -63,21 +63,21 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof !14 { ; CHECK32-NEXT: #APP ; CHECK32-NEXT: #NO_APP ; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c] -; CHECK32-NEXT: jne .LBB1_2 # encoding: [0x75,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: je .LBB1_1 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.2: # %bb2 ; CHECK32-NEXT: popl %ebx # encoding: [0x5b] ; CHECK32-NEXT: .cfi_def_cfa_offset 4 -; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: jmp bar # TAILCALL ; CHECK32-NEXT: # encoding: [0xeb,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 -; CHECK32-NEXT: .LBB1_2: # %bb2 +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB1_1: # %bb1 ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %ebx # encoding: [0x5b] ; CHECK32-NEXT: .cfi_def_cfa_offset 4 -; CHECK32-NEXT: jmp bar # TAILCALL +; CHECK32-NEXT: jmp foo # TAILCALL ; CHECK32-NEXT: # encoding: [0xeb,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 ; ; CHECK64-LABEL: f_non_leaf: ; CHECK64: # %bb.0: # %entry @@ -87,21 +87,21 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof 
!14 { ; CHECK64-NEXT: #APP ; CHECK64-NEXT: #NO_APP ; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] -; CHECK64-NEXT: jne .LBB1_2 # encoding: [0x75,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: je .LBB1_1 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.2: # %bb2 ; CHECK64-NEXT: popq %rbx # encoding: [0x5b] ; CHECK64-NEXT: .cfi_def_cfa_offset 8 -; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: jmp bar # TAILCALL ; CHECK64-NEXT: # encoding: [0xeb,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 -; CHECK64-NEXT: .LBB1_2: # %bb2 +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB1_1: # %bb1 ; CHECK64-NEXT: .cfi_def_cfa_offset 16 ; CHECK64-NEXT: popq %rbx # encoding: [0x5b] ; CHECK64-NEXT: .cfi_def_cfa_offset 8 -; CHECK64-NEXT: jmp bar # TAILCALL +; CHECK64-NEXT: jmp foo # TAILCALL ; CHECK64-NEXT: # encoding: [0xeb,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 ; ; WIN64-LABEL: f_non_leaf: ; WIN64: # %bb.0: # %entry @@ -111,19 +111,19 @@ define void @f_non_leaf(i32 %x, i32 %y) !prof !14 { ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP ; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] -; WIN64-NEXT: jne .LBB1_2 # encoding: [0x75,A] -; WIN64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: je .LBB1_1 # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.2: # %bb2 ; WIN64-NEXT: popq %rbx # encoding: [0x5b] -; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: jmp bar # TAILCALL ; WIN64-NEXT: # encoding: [0xeb,A] -; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 -; WIN64-NEXT: .LBB1_2: # %bb2 +; WIN64-NEXT: # fixup A - 
offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB1_1: # %bb1 ; WIN64-NEXT: nop # encoding: [0x90] ; WIN64-NEXT: popq %rbx # encoding: [0x5b] -; WIN64-NEXT: jmp bar # TAILCALL +; WIN64-NEXT: jmp foo # TAILCALL ; WIN64-NEXT: # encoding: [0xeb,A] -; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 ; WIN64-NEXT: .seh_handlerdata ; WIN64-NEXT: .text ; WIN64-NEXT: .seh_endproc diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll index 17078413a82423..66c57697aa7242 100644 --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -63,21 +63,21 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize { ; CHECK32-NEXT: #APP ; CHECK32-NEXT: #NO_APP ; CHECK32-NEXT: cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c] -; CHECK32-NEXT: jne .LBB1_2 # encoding: [0x75,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; CHECK32-NEXT: # %bb.1: # %bb1 +; CHECK32-NEXT: je .LBB1_1 # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.2: # %bb2 ; CHECK32-NEXT: popl %ebx # encoding: [0x5b] ; CHECK32-NEXT: .cfi_def_cfa_offset 4 -; CHECK32-NEXT: jmp foo # TAILCALL +; CHECK32-NEXT: jmp bar # TAILCALL ; CHECK32-NEXT: # encoding: [0xeb,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 -; CHECK32-NEXT: .LBB1_2: # %bb2 +; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK32-NEXT: .LBB1_1: # %bb1 ; CHECK32-NEXT: .cfi_def_cfa_offset 8 ; CHECK32-NEXT: popl %ebx # encoding: [0x5b] ; CHECK32-NEXT: .cfi_def_cfa_offset 4 -; CHECK32-NEXT: jmp bar # TAILCALL +; CHECK32-NEXT: jmp foo # TAILCALL ; CHECK32-NEXT: # encoding: [0xeb,A] -; CHECK32-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK32-NEXT: # fixup A - offset: 1, value: foo-1, kind: 
FK_PCRel_1 ; ; CHECK64-LABEL: f_non_leaf: ; CHECK64: # %bb.0: # %entry @@ -87,21 +87,21 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize { ; CHECK64-NEXT: #APP ; CHECK64-NEXT: #NO_APP ; CHECK64-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7] -; CHECK64-NEXT: jne .LBB1_2 # encoding: [0x75,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; CHECK64-NEXT: # %bb.1: # %bb1 +; CHECK64-NEXT: je .LBB1_1 # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.2: # %bb2 ; CHECK64-NEXT: popq %rbx # encoding: [0x5b] ; CHECK64-NEXT: .cfi_def_cfa_offset 8 -; CHECK64-NEXT: jmp foo # TAILCALL +; CHECK64-NEXT: jmp bar # TAILCALL ; CHECK64-NEXT: # encoding: [0xeb,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 -; CHECK64-NEXT: .LBB1_2: # %bb2 +; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK64-NEXT: .LBB1_1: # %bb1 ; CHECK64-NEXT: .cfi_def_cfa_offset 16 ; CHECK64-NEXT: popq %rbx # encoding: [0x5b] ; CHECK64-NEXT: .cfi_def_cfa_offset 8 -; CHECK64-NEXT: jmp bar # TAILCALL +; CHECK64-NEXT: jmp foo # TAILCALL ; CHECK64-NEXT: # encoding: [0xeb,A] -; CHECK64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; CHECK64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 ; ; WIN64-LABEL: f_non_leaf: ; WIN64: # %bb.0: # %entry @@ -111,19 +111,19 @@ define void @f_non_leaf(i32 %x, i32 %y) optsize { ; WIN64-NEXT: #APP ; WIN64-NEXT: #NO_APP ; WIN64-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] -; WIN64-NEXT: jne .LBB1_2 # encoding: [0x75,A] -; WIN64-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 -; WIN64-NEXT: # %bb.1: # %bb1 +; WIN64-NEXT: je .LBB1_1 # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.2: # %bb2 ; WIN64-NEXT: popq %rbx # encoding: [0x5b] -; WIN64-NEXT: jmp foo # TAILCALL +; WIN64-NEXT: jmp bar # TAILCALL ; WIN64-NEXT: # encoding: 
[0xeb,A] -; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 -; WIN64-NEXT: .LBB1_2: # %bb2 +; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: .LBB1_1: # %bb1 ; WIN64-NEXT: nop # encoding: [0x90] ; WIN64-NEXT: popq %rbx # encoding: [0x5b] -; WIN64-NEXT: jmp bar # TAILCALL +; WIN64-NEXT: jmp foo # TAILCALL ; WIN64-NEXT: # encoding: [0xeb,A] -; WIN64-NEXT: # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1 +; WIN64-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1 ; WIN64-NEXT: .seh_handlerdata ; WIN64-NEXT: .text ; WIN64-NEXT: .seh_endproc diff --git a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll b/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll index 475d8fcf7f35a7..1ba8b00646efc6 100644 --- a/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll +++ b/llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll @@ -1,11 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s ; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s define i32 @fcmp_oeq(float %x, float %y) { -; CHECK-LABEL: fcmp_oeq -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_1}} -; CHECK-NEXT: jp {{LBB.+_1}} +; CHECK-LABEL: fcmp_oeq: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jne LBB0_1 +; CHECK-NEXT: jp LBB0_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB0_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp oeq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -15,9 +23,16 @@ bb1: } define i32 @fcmp_ogt(float %x, float %y) { -; CHECK-LABEL: fcmp_ogt -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jbe {{LBB.+_1}} +; CHECK-LABEL: fcmp_ogt: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jbe LBB1_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; 
CHECK-NEXT: LBB1_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp ogt float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -27,9 +42,16 @@ bb1: } define i32 @fcmp_oge(float %x, float %y) { -; CHECK-LABEL: fcmp_oge -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jb {{LBB.+_1}} +; CHECK-LABEL: fcmp_oge: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jb LBB2_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB2_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp oge float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -39,9 +61,16 @@ bb1: } define i32 @fcmp_olt(float %x, float %y) { -; CHECK-LABEL: fcmp_olt -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jbe {{LBB.+_1}} +; CHECK-LABEL: fcmp_olt: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm0, %xmm1 +; CHECK-NEXT: jbe LBB3_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB3_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp olt float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -51,9 +80,16 @@ bb1: } define i32 @fcmp_ole(float %x, float %y) { -; CHECK-LABEL: fcmp_ole -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jb {{LBB.+_1}} +; CHECK-LABEL: fcmp_ole: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm0, %xmm1 +; CHECK-NEXT: jb LBB4_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB4_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp ole float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -63,9 +99,16 @@ bb1: } define i32 @fcmp_one(float %x, float %y) { -; CHECK-LABEL: fcmp_one -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_1}} +; CHECK-LABEL: fcmp_one: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: je LBB5_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB5_1: ## %bb2 +; 
CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp one float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -75,9 +118,16 @@ bb1: } define i32 @fcmp_ord(float %x, float %y) { -; CHECK-LABEL: fcmp_ord -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jp {{LBB.+_1}} +; CHECK-LABEL: fcmp_ord: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jp LBB6_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB6_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp ord float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -87,9 +137,16 @@ bb1: } define i32 @fcmp_uno(float %x, float %y) { -; CHECK-LABEL: fcmp_uno -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jp {{LBB.+_2}} +; CHECK-LABEL: fcmp_uno: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jp LBB7_2 +; CHECK-NEXT: ## %bb.1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB7_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %1 = fcmp uno float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -99,9 +156,16 @@ bb1: } define i32 @fcmp_ueq(float %x, float %y) { -; CHECK-LABEL: fcmp_ueq -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: je {{LBB.+_2}} +; CHECK-LABEL: fcmp_ueq: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: je LBB8_2 +; CHECK-NEXT: ## %bb.1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB8_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %1 = fcmp ueq float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -111,9 +175,16 @@ bb1: } define i32 @fcmp_ugt(float %x, float %y) { -; CHECK-LABEL: fcmp_ugt -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: jae {{LBB.+_1}} +; CHECK-LABEL: fcmp_ugt: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm0, %xmm1 +; CHECK-NEXT: jae LBB9_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB9_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; 
CHECK-NEXT: retq %1 = fcmp ugt float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -123,9 +194,16 @@ bb1: } define i32 @fcmp_uge(float %x, float %y) { -; CHECK-LABEL: fcmp_uge -; CHECK: ucomiss %xmm0, %xmm1 -; CHECK-NEXT: ja {{LBB.+_1}} +; CHECK-LABEL: fcmp_uge: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm0, %xmm1 +; CHECK-NEXT: ja LBB10_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB10_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp uge float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -135,9 +213,16 @@ bb1: } define i32 @fcmp_ult(float %x, float %y) { -; CHECK-LABEL: fcmp_ult -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jae {{LBB.+_1}} +; CHECK-LABEL: fcmp_ult: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jae LBB11_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB11_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp ult float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -147,9 +232,16 @@ bb1: } define i32 @fcmp_ule(float %x, float %y) { -; CHECK-LABEL: fcmp_ule -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: ja {{LBB.+_1}} +; CHECK-LABEL: fcmp_ule: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: ja LBB12_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB12_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp ule float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -159,10 +251,17 @@ bb1: } define i32 @fcmp_une(float %x, float %y) { -; CHECK-LABEL: fcmp_une -; CHECK: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: jne {{LBB.+_2}} -; CHECK-NEXT: jnp {{LBB.+_1}} +; CHECK-LABEL: fcmp_une: +; CHECK: ## %bb.0: +; CHECK-NEXT: ucomiss %xmm1, %xmm0 +; CHECK-NEXT: jne LBB13_2 +; CHECK-NEXT: jnp LBB13_1 +; CHECK-NEXT: LBB13_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: 
LBB13_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = fcmp une float %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -172,9 +271,16 @@ bb1: } define i32 @icmp_eq(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_eq -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jne {{LBB.+_1}} +; CHECK-LABEL: icmp_eq: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: je LBB14_2 +; CHECK-NEXT: ## %bb.1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB14_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %1 = icmp eq i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -184,9 +290,16 @@ bb1: } define i32 @icmp_ne(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ne -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: je {{LBB.+_1}} +; CHECK-LABEL: icmp_ne: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: je LBB15_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB15_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = icmp ne i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -196,9 +309,16 @@ bb1: } define i32 @icmp_ugt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ugt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jbe {{LBB.+_1}} +; CHECK-LABEL: icmp_ugt: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jbe LBB16_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB16_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = icmp ugt i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -208,9 +328,16 @@ bb1: } define i32 @icmp_uge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_uge -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jb {{LBB.+_1}} +; CHECK-LABEL: icmp_uge: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jae LBB17_2 +; CHECK-NEXT: ## %bb.1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB17_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %1 = icmp uge i32 
%x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -220,9 +347,16 @@ bb1: } define i32 @icmp_ult(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ult -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jae {{LBB.+_1}} +; CHECK-LABEL: icmp_ult: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jae LBB18_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB18_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = icmp ult i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -232,9 +366,16 @@ bb1: } define i32 @icmp_ule(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_ule -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: ja {{LBB.+_1}} +; CHECK-LABEL: icmp_ule: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jbe LBB19_2 +; CHECK-NEXT: ## %bb.1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB19_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %1 = icmp ule i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -244,9 +385,16 @@ bb1: } define i32 @icmp_sgt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sgt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jle {{LBB.+_1}} +; CHECK-LABEL: icmp_sgt: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jle LBB20_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB20_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = icmp sgt i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -256,9 +404,16 @@ bb1: } define i32 @icmp_sge(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sge -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jl {{LBB.+_1}} +; CHECK-LABEL: icmp_sge: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jge LBB21_2 +; CHECK-NEXT: ## %bb.1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB21_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %1 = icmp sge i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -268,9 +423,16 @@ bb1: } 
define i32 @icmp_slt(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_slt -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jge {{LBB.+_1}} +; CHECK-LABEL: icmp_slt: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jge LBB22_1 +; CHECK-NEXT: ## %bb.2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB22_1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq %1 = icmp slt i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: @@ -280,9 +442,16 @@ bb1: } define i32 @icmp_sle(i32 %x, i32 %y) { -; CHECK-LABEL: icmp_sle -; CHECK: cmpl %esi, %edi -; CHECK-NEXT: jg {{LBB.+_1}} +; CHECK-LABEL: icmp_sle: +; CHECK: ## %bb.0: +; CHECK-NEXT: cmpl %esi, %edi +; CHECK-NEXT: jle LBB23_2 +; CHECK-NEXT: ## %bb.1: ## %bb2 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: LBB23_2: ## %bb1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq %1 = icmp sle i32 %x, %y br i1 %1, label %bb1, label %bb2 bb2: diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index 76e45f43342f16..bb778fb61c04e5 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -941,17 +941,17 @@ define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind { ; X32-SSE2-NEXT: shldl $24, %ebx, %edi ; X32-SSE2-NEXT: xorl %eax, %edi ; X32-SSE2-NEXT: orl %edi, %ecx -; X32-SSE2-NEXT: jne .LBB44_1 -; X32-SSE2-NEXT: # %bb.2: +; X32-SSE2-NEXT: je .LBB44_2 +; X32-SSE2-NEXT: # %bb.1: ; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: popl %edi ; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: jmp _Z3foov # TAILCALL -; X32-SSE2-NEXT: .LBB44_1: +; X32-SSE2-NEXT: retl +; X32-SSE2-NEXT: .LBB44_2: ; X32-SSE2-NEXT: popl %esi ; X32-SSE2-NEXT: popl %edi ; X32-SSE2-NEXT: popl %ebx -; X32-SSE2-NEXT: retl +; X32-SSE2-NEXT: jmp _Z3foov # TAILCALL ; ; X64-AVX2-LABEL: PR45265: ; X64-AVX2: # %bb.0: @@ -964,11 +964,11 @@ define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind { ; X64-AVX2-NEXT: movq (%rsi,%rcx,4), 
%rcx ; X64-AVX2-NEXT: shrdq $40, %rdi, %rcx ; X64-AVX2-NEXT: cmpq %rax, %rcx -; X64-AVX2-NEXT: jne .LBB44_1 -; X64-AVX2-NEXT: # %bb.2: -; X64-AVX2-NEXT: jmp _Z3foov # TAILCALL -; X64-AVX2-NEXT: .LBB44_1: +; X64-AVX2-NEXT: je .LBB44_2 +; X64-AVX2-NEXT: # %bb.1: ; X64-AVX2-NEXT: retq +; X64-AVX2-NEXT: .LBB44_2: +; X64-AVX2-NEXT: jmp _Z3foov # TAILCALL %3 = sext i32 %0 to i64 %4 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %3 %5 = bitcast %struct.S* %4 to i88* diff --git a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll index 6e41c94e979a1b..8223b15ccafae8 100644 --- a/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll +++ b/llvm/test/CodeGen/X86/indirect-branch-tracking-eh2.ll @@ -1,59 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple x86_64-unknown-unknown -exception-model sjlj -verify-machineinstrs=0 < %s | FileCheck %s --check-prefix=NUM ; RUN: llc -mtriple x86_64-unknown-unknown -exception-model sjlj -verify-machineinstrs=0 < %s | FileCheck %s --check-prefix=SJLJ -; NUM-COUNT-3: endbr64 - -;SJLJ: main: # @main -;SJLJ-NEXT: .Lfunc_begin0: -;SJLJ-NEXT: # %bb.0: # %entry -;SJLJ-NEXT: endbr64 -;SJLJ-NEXT: pushq %rbp -;SJLJ: callq _Unwind_SjLj_Register -;SJLJ-NEXT: .Ltmp0: -;SJLJ-NEXT: callq _Z3foov -;SJLJ-NEXT: .Ltmp1: -;SJLJ-NEXT: # %bb.1: # %invoke.cont -;SJLJ-NEXT: movl -;SJLJ-NEXT: .LBB0_7: # %return -;SJLJ: callq _Unwind_SjLj_Unregister -;SJLJ: retq -;SJLJ-NEXT: .LBB0_9: -;SJLJ-NEXT: endbr64 -;SJLJ-NEXT: movl -;SJLJ-NEXT: cmpl -;SJLJ-NEXT: jb .LBB0_10 -;SJLJ-NEXT: # %bb.11: -;SJLJ-NEXT: ud2 -;SJLJ-NEXT: .LBB0_10: -;SJLJ-NEXT: leaq .LJTI0_0(%rip), %rcx -;SJLJ-NEXT: jmpq *(%rcx,%rax,8) -;SJLJ-NEXT: .LBB0_2: # %lpad -;SJLJ-NEXT: .Ltmp2: -;SJLJ-NEXT: endbr64 -;SJLJ: jne .LBB0_4 -;SJLJ-NEXT: # %bb.3: # %catch3 -;SJLJ: callq __cxa_begin_catch -;SJLJ: jmp .LBB0_6 -;SJLJ-NEXT: .LBB0_4: # %catch.fallthrough -;SJLJ-NEXT: cmpl -;SJLJ-NEXT: 
jne .LBB0_8 -;SJLJ-NEXT: # %bb.5: # %catch -;SJLJ: callq __cxa_begin_catch -;SJLJ: cmpb -;SJLJ-NEXT: .LBB0_6: # %return -;SJLJ: callq __cxa_end_catch -;SJLJ-NEXT: jmp .LBB0_7 -;SJLJ-NEXT: .LBB0_8: # %eh.resume -;SJLJ-NEXT: movl -;SJLJ-NEXT: .Lfunc_end0: -;SJLJ: .LJTI0_0: -;SJLJ-NEXT: .quad .LBB0_2 - @_ZTIi = external dso_local constant i8* @_ZTIc = external dso_local constant i8* ; Function Attrs: noinline norecurse optnone uwtable define dso_local i32 @main() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_sj0 to i8*) { +; NUM-LABEL: main: +; NUM: # %bb.0: # %entry +; NUM-NEXT: endbr64 +; NUM-NEXT: pushq %rbp +; NUM-NEXT: movq %rsp, %rbp +; NUM-NEXT: pushq %r15 +; NUM-NEXT: pushq %r14 +; NUM-NEXT: pushq %r13 +; NUM-NEXT: pushq %r12 +; NUM-NEXT: pushq %rbx +; NUM-NEXT: subq $120, %rsp +; NUM-NEXT: movl $0, -44(%rbp) +; NUM-NEXT: movq $__gxx_personality_sj0, -120(%rbp) +; NUM-NEXT: movq $GCC_except_table0, -112(%rbp) +; NUM-NEXT: movq %rbp, -104(%rbp) +; NUM-NEXT: movq %rsp, -88(%rbp) +; NUM-NEXT: movq $.LBB0_9, -96(%rbp) +; NUM-NEXT: movl $1, -144(%rbp) +; NUM-NEXT: leaq -152(%rbp), %rdi +; NUM-NEXT: callq _Unwind_SjLj_Register +; NUM-NEXT: .Ltmp0: +; NUM-NEXT: callq _Z3foov +; NUM-NEXT: .Ltmp1: +; NUM-NEXT: # %bb.1: # %invoke.cont +; NUM-NEXT: movl $1, -44(%rbp) +; NUM-NEXT: .LBB0_7: # %return +; NUM-NEXT: movl -44(%rbp), %ebx +; NUM-NEXT: leaq -152(%rbp), %rdi +; NUM-NEXT: callq _Unwind_SjLj_Unregister +; NUM-NEXT: movl %ebx, %eax +; NUM-NEXT: addq $120, %rsp +; NUM-NEXT: popq %rbx +; NUM-NEXT: popq %r12 +; NUM-NEXT: popq %r13 +; NUM-NEXT: popq %r14 +; NUM-NEXT: popq %r15 +; NUM-NEXT: popq %rbp +; NUM-NEXT: retq +; NUM-NEXT: .LBB0_9: +; NUM-NEXT: endbr64 +; NUM-NEXT: movl -144(%rbp), %eax +; NUM-NEXT: cmpl $1, %eax +; NUM-NEXT: jb .LBB0_10 +; NUM-NEXT: # %bb.11: +; NUM-NEXT: ud2 +; NUM-NEXT: .LBB0_10: +; NUM-NEXT: leaq {{.*}}(%rip), %rcx +; NUM-NEXT: jmpq *(%rcx,%rax,8) +; NUM-NEXT: .LBB0_2: # %lpad +; NUM-NEXT: .Ltmp2: +; NUM-NEXT: endbr64 +; NUM-NEXT: 
movl -140(%rbp), %ecx +; NUM-NEXT: movl -136(%rbp), %eax +; NUM-NEXT: movq %rcx, -56(%rbp) +; NUM-NEXT: movl %eax, -64(%rbp) +; NUM-NEXT: cmpl $2, %eax +; NUM-NEXT: je .LBB0_3 +; NUM-NEXT: # %bb.4: # %catch.fallthrough +; NUM-NEXT: cmpl $1, %eax +; NUM-NEXT: jne .LBB0_8 +; NUM-NEXT: # %bb.5: # %catch +; NUM-NEXT: movq -56(%rbp), %rdi +; NUM-NEXT: movl $-1, -144(%rbp) +; NUM-NEXT: callq __cxa_begin_catch +; NUM-NEXT: movb (%rax), %al +; NUM-NEXT: movb %al, -45(%rbp) +; NUM-NEXT: xorl %ecx, %ecx +; NUM-NEXT: cmpb $3, %al +; NUM-NEXT: jmp .LBB0_6 +; NUM-NEXT: .LBB0_3: # %catch3 +; NUM-NEXT: movq -56(%rbp), %rdi +; NUM-NEXT: movl $-1, -144(%rbp) +; NUM-NEXT: callq __cxa_begin_catch +; NUM-NEXT: movl (%rax), %eax +; NUM-NEXT: movl %eax, -60(%rbp) +; NUM-NEXT: xorl %ecx, %ecx +; NUM-NEXT: cmpl $5, %eax +; NUM-NEXT: .LBB0_6: # %return +; NUM-NEXT: setne %cl +; NUM-NEXT: movl %ecx, -44(%rbp) +; NUM-NEXT: movl $-1, -144(%rbp) +; NUM-NEXT: callq __cxa_end_catch +; NUM-NEXT: jmp .LBB0_7 +; NUM-NEXT: .LBB0_8: # %eh.resume +; NUM-NEXT: movl $-1, -144(%rbp) +; +; SJLJ-LABEL: main: +; SJLJ: # %bb.0: # %entry +; SJLJ-NEXT: endbr64 +; SJLJ-NEXT: pushq %rbp +; SJLJ-NEXT: movq %rsp, %rbp +; SJLJ-NEXT: pushq %r15 +; SJLJ-NEXT: pushq %r14 +; SJLJ-NEXT: pushq %r13 +; SJLJ-NEXT: pushq %r12 +; SJLJ-NEXT: pushq %rbx +; SJLJ-NEXT: subq $120, %rsp +; SJLJ-NEXT: movl $0, -44(%rbp) +; SJLJ-NEXT: movq $__gxx_personality_sj0, -120(%rbp) +; SJLJ-NEXT: movq $GCC_except_table0, -112(%rbp) +; SJLJ-NEXT: movq %rbp, -104(%rbp) +; SJLJ-NEXT: movq %rsp, -88(%rbp) +; SJLJ-NEXT: movq $.LBB0_9, -96(%rbp) +; SJLJ-NEXT: movl $1, -144(%rbp) +; SJLJ-NEXT: leaq -152(%rbp), %rdi +; SJLJ-NEXT: callq _Unwind_SjLj_Register +; SJLJ-NEXT: .Ltmp0: +; SJLJ-NEXT: callq _Z3foov +; SJLJ-NEXT: .Ltmp1: +; SJLJ-NEXT: # %bb.1: # %invoke.cont +; SJLJ-NEXT: movl $1, -44(%rbp) +; SJLJ-NEXT: .LBB0_7: # %return +; SJLJ-NEXT: movl -44(%rbp), %ebx +; SJLJ-NEXT: leaq -152(%rbp), %rdi +; SJLJ-NEXT: callq _Unwind_SjLj_Unregister +; 
SJLJ-NEXT: movl %ebx, %eax +; SJLJ-NEXT: addq $120, %rsp +; SJLJ-NEXT: popq %rbx +; SJLJ-NEXT: popq %r12 +; SJLJ-NEXT: popq %r13 +; SJLJ-NEXT: popq %r14 +; SJLJ-NEXT: popq %r15 +; SJLJ-NEXT: popq %rbp +; SJLJ-NEXT: retq +; SJLJ-NEXT: .LBB0_9: +; SJLJ-NEXT: endbr64 +; SJLJ-NEXT: movl -144(%rbp), %eax +; SJLJ-NEXT: cmpl $1, %eax +; SJLJ-NEXT: jb .LBB0_10 +; SJLJ-NEXT: # %bb.11: +; SJLJ-NEXT: ud2 +; SJLJ-NEXT: .LBB0_10: +; SJLJ-NEXT: leaq {{.*}}(%rip), %rcx +; SJLJ-NEXT: jmpq *(%rcx,%rax,8) +; SJLJ-NEXT: .LBB0_2: # %lpad +; SJLJ-NEXT: .Ltmp2: +; SJLJ-NEXT: endbr64 +; SJLJ-NEXT: movl -140(%rbp), %ecx +; SJLJ-NEXT: movl -136(%rbp), %eax +; SJLJ-NEXT: movq %rcx, -56(%rbp) +; SJLJ-NEXT: movl %eax, -64(%rbp) +; SJLJ-NEXT: cmpl $2, %eax +; SJLJ-NEXT: je .LBB0_3 +; SJLJ-NEXT: # %bb.4: # %catch.fallthrough +; SJLJ-NEXT: cmpl $1, %eax +; SJLJ-NEXT: jne .LBB0_8 +; SJLJ-NEXT: # %bb.5: # %catch +; SJLJ-NEXT: movq -56(%rbp), %rdi +; SJLJ-NEXT: movl $-1, -144(%rbp) +; SJLJ-NEXT: callq __cxa_begin_catch +; SJLJ-NEXT: movb (%rax), %al +; SJLJ-NEXT: movb %al, -45(%rbp) +; SJLJ-NEXT: xorl %ecx, %ecx +; SJLJ-NEXT: cmpb $3, %al +; SJLJ-NEXT: jmp .LBB0_6 +; SJLJ-NEXT: .LBB0_3: # %catch3 +; SJLJ-NEXT: movq -56(%rbp), %rdi +; SJLJ-NEXT: movl $-1, -144(%rbp) +; SJLJ-NEXT: callq __cxa_begin_catch +; SJLJ-NEXT: movl (%rax), %eax +; SJLJ-NEXT: movl %eax, -60(%rbp) +; SJLJ-NEXT: xorl %ecx, %ecx +; SJLJ-NEXT: cmpl $5, %eax +; SJLJ-NEXT: .LBB0_6: # %return +; SJLJ-NEXT: setne %cl +; SJLJ-NEXT: movl %ecx, -44(%rbp) +; SJLJ-NEXT: movl $-1, -144(%rbp) +; SJLJ-NEXT: callq __cxa_end_catch +; SJLJ-NEXT: jmp .LBB0_7 +; SJLJ-NEXT: .LBB0_8: # %eh.resume +; SJLJ-NEXT: movl $-1, -144(%rbp) entry: %retval = alloca i32, align 4 %exn.slot = alloca i8* diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll index 78a32200438040..6f09506f666673 100644 --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -139,11 +139,12 @@ define i32 @func_l2(i32 %a, i32 
%b) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: subl %edx, %eax -; CHECK-NEXT: jne .LBB8_2 -; CHECK-NEXT: # %bb.1: # %if.then +; CHECK-NEXT: je .LBB8_1 +; CHECK-NEXT: # %bb.2: # %if.else +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB8_1: # %if.then ; CHECK-NEXT: cmpl %ecx, %edx ; CHECK-NEXT: cmovlel %ecx, %eax -; CHECK-NEXT: .LBB8_2: # %if.else ; CHECK-NEXT: retl %cmp = icmp eq i32 %b, %a %sub = sub nsw i32 %a, %b @@ -329,12 +330,13 @@ define i8* @func_r(i8* %base, i32* nocapture %offset, i32 %size) nounwind { ; CHECK-NEXT: movl (%edx), %ecx ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: subl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: jl .LBB15_2 -; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: jge .LBB15_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB15_1: # %if.end ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %ecx, (%edx) ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: .LBB15_2: # %return ; CHECK-NEXT: retl entry: %0 = load i32, i32* %offset, align 8 diff --git a/llvm/test/CodeGen/X86/lsr-negative-stride.ll b/llvm/test/CodeGen/X86/lsr-negative-stride.ll index 26c6128ab48db5..0d25e141439640 100644 --- a/llvm/test/CodeGen/X86/lsr-negative-stride.ll +++ b/llvm/test/CodeGen/X86/lsr-negative-stride.ll @@ -19,11 +19,7 @@ define i32 @t(i32 %a, i32 %b) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK-NEXT: cmpl %ecx, %edx -; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: retl -; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: je .LBB0_1 ; CHECK-NEXT: .LBB0_2: # %bb.outer ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_3 Depth 2 @@ -49,6 +45,9 @@ define i32 @t(i32 %a, i32 %b) nounwind { ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: .LBB0_6: # %bb17 ; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: retl entry: %tmp1434 = icmp eq i32 %a, %b ; 
[#uses=1] br i1 %tmp1434, label %bb17, label %bb.outer diff --git a/llvm/test/CodeGen/X86/machine-cse.ll b/llvm/test/CodeGen/X86/machine-cse.ll index 281aaca7c8d1fc..7ec9cafb5465e6 100644 --- a/llvm/test/CodeGen/X86/machine-cse.ll +++ b/llvm/test/CodeGen/X86/machine-cse.ll @@ -110,10 +110,11 @@ define i32 @cross_mbb_phys_cse(i32 %a, i32 %b) nounwind ssp { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $1, %eax ; CHECK-NEXT: cmpl %esi, %edi -; CHECK-NEXT: ja .LBB2_2 -; CHECK-NEXT: # %bb.1: # %if.end +; CHECK-NEXT: jbe .LBB2_1 +; CHECK-NEXT: # %bb.2: # %return +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB2_1: # %if.end ; CHECK-NEXT: sbbl %eax, %eax -; CHECK-NEXT: .LBB2_2: # %return ; CHECK-NEXT: retq entry: %cmp = icmp ugt i32 %a, %b diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index 374f573eed7b26..6c7076757d0e9c 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -239,19 +239,19 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-NEXT: rolw $8, %dx ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB9_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB9_3: # %res_block +; X86-NEXT: je .LBB9_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB9_1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl ; ; X64-LABEL: length3: ; X64: # %bb.0: @@ -260,16 +260,16 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax -; X64-NEXT: jne .LBB9_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 
2(%rdi), %eax -; X64-NEXT: movzbl 2(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB9_3: # %res_block +; X64-NEXT: je .LBB9_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m @@ -454,19 +454,19 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB16_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB16_3: # %res_block +; X86-NEXT: je .LBB16_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB16_1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl ; ; X64-LABEL: length5: ; X64: # %bb.0: @@ -475,16 +475,16 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB16_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 4(%rdi), %eax -; X64-NEXT: movzbl 4(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB16_3: # %res_block +; X64-NEXT: je .LBB16_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* 
%Y, i64 5) nounwind ret i32 %m @@ -530,16 +530,16 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB18_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB18_2 -; X86-NEXT: .LBB18_3: # %res_block +; X86-NEXT: je .LBB18_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: jmp .LBB18_2 +; X86-NEXT: .LBB18_1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: .LBB18_2: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -553,20 +553,20 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB18_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 4(%rdi), %eax -; X64-NEXT: movzbl 4(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: shrl $31, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq -; X64-NEXT: .LBB18_3: # %res_block +; X64-NEXT: je .LBB18_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB18_1: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp slt i32 %m, 0 @@ -577,53 +577,56 @@ define i32 @length7(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length7: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB19_2 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB19_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: movl 3(%esi), %eax +; X86-NEXT: movl 3(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB19_3 -; X86-NEXT: .LBB19_2: # %res_block +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB19_2 +; X86-NEXT: .LBB19_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB19_2: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB19_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length7: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB19_2 +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB19_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 3(%rdi), %ecx -; X64-NEXT: movl 3(%rsi), %edx +; X64-NEXT: movl 3(%rdi), %eax +; X64-NEXT: movl 3(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB19_3 -; X64-NEXT: .LBB19_2: # %res_block +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: je .LBB19_2 +; X64-NEXT: .LBB19_4: # %res_block +; X64-NEXT: xorl 
%edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB19_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB19_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind ret i32 %m @@ -660,55 +663,60 @@ define i1 @length7_lt(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length7_lt: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB21_2 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB21_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: movl 3(%esi), %eax +; X86-NEXT: movl 3(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB21_3 -; X86-NEXT: .LBB21_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB21_2 +; X86-NEXT: .LBB21_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: .LBB21_3: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB21_2: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: jmp .LBB21_3 ; ; X64-LABEL: length7_lt: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; 
X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: jne .LBB21_2 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 3(%rdi), %ecx -; X64-NEXT: movl 3(%rsi), %edx +; X64-NEXT: movl 3(%rdi), %eax +; X64-NEXT: movl 3(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: je .LBB21_3 ; X64-NEXT: .LBB21_2: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB21_3: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB21_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -721,28 +729,30 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB22_2 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB22_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB22_3 -; X86-NEXT: .LBB22_2: # %res_block +; X86-NEXT: cmpl %ecx, %eax +; 
X86-NEXT: je .LBB22_2 +; X86-NEXT: .LBB22_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB22_2: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB22_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -944,60 +954,63 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length12: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB29_3 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB29_5 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB29_3 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB29_5 ; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl 8(%esi), %eax +; X86-NEXT: movl 8(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB29_4 -; X86-NEXT: .LBB29_3: # %res_block +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB29_3 +; X86-NEXT: .LBB29_5: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB29_3: ; X86-NEXT: xorl %eax, %eax -; 
X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB29_4: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length12: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB29_2 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB29_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: movl 8(%rdi), %eax +; X64-NEXT: movl 8(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB29_3 -; X64-NEXT: .LBB29_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB29_2 +; X64-NEXT: .LBB29_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB29_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB29_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -1116,67 +1129,70 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length16: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB33_4 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB33_6 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 
4(%eax), %edx +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB33_4 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB33_6 ; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl 8(%esi), %eax +; X86-NEXT: movl 8(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB33_4 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB33_6 ; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl 12(%esi), %eax +; X86-NEXT: movl 12(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB33_5 -; X86-NEXT: .LBB33_4: # %res_block +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB33_4 +; X86-NEXT: .LBB33_6: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB33_4: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB33_5: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length16: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB33_2 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB33_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: 
bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB33_3 -; X64-NEXT: .LBB33_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB33_2 +; X64-NEXT: .LBB33_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB33_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB33_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -1291,69 +1307,74 @@ define i1 @length16_lt(i8* %x, i8* %y) nounwind { ; X86-LABEL: length16_lt: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB35_4 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB35_6 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB35_4 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB35_6 ; X86-NEXT: # %bb.2: # %loadbb2 -; X86-NEXT: movl 8(%esi), %ecx -; X86-NEXT: movl 8(%eax), %edx +; X86-NEXT: movl 8(%esi), %eax +; X86-NEXT: movl 8(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB35_4 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB35_6 ; X86-NEXT: # %bb.3: # %loadbb3 -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl 12(%eax), %edx +; X86-NEXT: movl 
12(%esi), %eax +; X86-NEXT: movl 12(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB35_5 -; X86-NEXT: .LBB35_4: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB35_4 +; X86-NEXT: .LBB35_6: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: .LBB35_5: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB35_4: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: jmp .LBB35_5 ; ; X64-LABEL: length16_lt: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB35_2 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: je .LBB35_3 ; X64-NEXT: .LBB35_2: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB35_3: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB35_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ 
-1373,39 +1394,41 @@ define i1 @length16_gt(i8* %x, i8* %y) nounwind { ; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx ; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: jne .LBB36_4 +; X86-NEXT: jne .LBB36_6 ; X86-NEXT: # %bb.1: # %loadbb1 ; X86-NEXT: movl 4(%esi), %eax ; X86-NEXT: movl 4(%edx), %ecx ; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx ; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: jne .LBB36_4 +; X86-NEXT: jne .LBB36_6 ; X86-NEXT: # %bb.2: # %loadbb2 ; X86-NEXT: movl 8(%esi), %eax ; X86-NEXT: movl 8(%edx), %ecx ; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx ; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: jne .LBB36_4 +; X86-NEXT: jne .LBB36_6 ; X86-NEXT: # %bb.3: # %loadbb3 ; X86-NEXT: movl 12(%esi), %eax ; X86-NEXT: movl 12(%edx), %ecx ; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: je .LBB36_5 -; X86-NEXT: .LBB36_4: # %res_block +; X86-NEXT: je .LBB36_4 +; X86-NEXT: .LBB36_6: # %res_block ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: cmpl %ecx, %eax ; X86-NEXT: setae %dl -; X86-NEXT: leal -1(%edx,%edx), %edx +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: .LBB36_5: # %endblock -; X86-NEXT: testl %edx, %edx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setg %al ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB36_4: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: jmp .LBB36_5 ; ; X64-LABEL: length16_gt: ; X64: # %bb.0: @@ -1414,24 +1437,26 @@ define i1 @length16_gt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB36_2 +; X64-NEXT: jne .LBB36_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB36_3 -; X64-NEXT: .LBB36_2: # %res_block +; X64-NEXT: je .LBB36_2 +; X64-NEXT: .LBB36_4: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; 
X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: .LBB36_3: # %endblock -; X64-NEXT: testl %edx, %edx +; X64-NEXT: testl %eax, %eax ; X64-NEXT: setg %al ; X64-NEXT: retq +; X64-NEXT: .LBB36_2: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: jmp .LBB36_3 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp sgt i32 %call, 0 ret i1 %cmp @@ -1549,33 +1574,34 @@ define i32 @length24(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length24: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB38_3 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB38_5 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB38_3 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB38_5 ; X64-NEXT: # %bb.2: # %loadbb2 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB38_4 -; X64-NEXT: .LBB38_3: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB38_3 +; X64-NEXT: .LBB38_5: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB38_3: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB38_4: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind ret i32 %m 
@@ -1708,33 +1734,36 @@ define i1 @length24_lt(i8* %x, i8* %y) nounwind { ; ; X64-LABEL: length24_lt: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB40_3 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB40_3 ; X64-NEXT: # %bb.2: # %loadbb2 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: je .LBB40_4 ; X64-NEXT: .LBB40_3: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB40_4: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB40_4: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -1763,31 +1792,33 @@ define i1 @length24_gt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB41_3 +; X64-NEXT: jne .LBB41_5 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB41_3 +; X64-NEXT: 
jne .LBB41_5 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rax ; X64-NEXT: movq 16(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB41_4 -; X64-NEXT: .LBB41_3: # %res_block +; X64-NEXT: je .LBB41_3 +; X64-NEXT: .LBB41_5: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: .LBB41_4: # %endblock -; X64-NEXT: testl %edx, %edx +; X64-NEXT: testl %eax, %eax ; X64-NEXT: setg %al ; X64-NEXT: retq +; X64-NEXT: .LBB41_3: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: jmp .LBB41_4 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind %cmp = icmp sgt i32 %call, 0 ret i1 %cmp @@ -1907,40 +1938,41 @@ define i32 @length31(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length31: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB43_4 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB43_6 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB43_4 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB43_6 ; X64-NEXT: # %bb.2: # %loadbb2 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB43_4 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB43_6 ; X64-NEXT: # %bb.3: # %loadbb3 -; X64-NEXT: movq 23(%rdi), %rcx -; X64-NEXT: movq 
23(%rsi), %rdx +; X64-NEXT: movq 23(%rdi), %rax +; X64-NEXT: movq 23(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB43_5 -; X64-NEXT: .LBB43_4: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB43_4 +; X64-NEXT: .LBB43_6: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB43_4: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB43_5: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 31) nounwind ret i32 %m @@ -2072,40 +2104,43 @@ define i1 @length31_lt(i8* %x, i8* %y) nounwind { ; ; X64-LABEL: length31_lt: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB45_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB45_4 ; X64-NEXT: # %bb.2: # %loadbb2 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB45_4 ; X64-NEXT: # %bb.3: # %loadbb3 -; X64-NEXT: movq 23(%rdi), %rcx -; X64-NEXT: movq 23(%rsi), %rdx +; X64-NEXT: movq 23(%rdi), %rax +; X64-NEXT: movq 23(%rsi), %rcx +; X64-NEXT: bswapq %rax ; 
X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: je .LBB45_5 ; X64-NEXT: .LBB45_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB45_5: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB45_5: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -2134,38 +2169,40 @@ define i1 @length31_gt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB46_4 +; X64-NEXT: jne .LBB46_6 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB46_4 +; X64-NEXT: jne .LBB46_6 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rax ; X64-NEXT: movq 16(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB46_4 +; X64-NEXT: jne .LBB46_6 ; X64-NEXT: # %bb.3: # %loadbb3 ; X64-NEXT: movq 23(%rdi), %rax ; X64-NEXT: movq 23(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB46_5 -; X64-NEXT: .LBB46_4: # %res_block +; X64-NEXT: je .LBB46_4 +; X64-NEXT: .LBB46_6: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: .LBB46_5: # %endblock -; X64-NEXT: testl %edx, %edx +; X64-NEXT: testl %eax, %eax ; X64-NEXT: setg %al ; X64-NEXT: retq +; X64-NEXT: .LBB46_4: +; X64-NEXT: xorl %eax, %eax +; 
X64-NEXT: jmp .LBB46_5 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 31) nounwind %cmp = icmp sgt i32 %call, 0 ret i1 %cmp @@ -2396,40 +2433,41 @@ define i32 @length32(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length32: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB49_4 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB49_6 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB49_4 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB49_6 ; X64-NEXT: # %bb.2: # %loadbb2 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB49_4 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB49_6 ; X64-NEXT: # %bb.3: # %loadbb3 -; X64-NEXT: movq 24(%rdi), %rcx -; X64-NEXT: movq 24(%rsi), %rdx +; X64-NEXT: movq 24(%rdi), %rax +; X64-NEXT: movq 24(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB49_5 -; X64-NEXT: .LBB49_4: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB49_4 +; X64-NEXT: .LBB49_6: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB49_4: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB49_5: # %endblock 
; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind ret i32 %m @@ -2576,40 +2614,43 @@ define i1 @length32_lt(i8* %x, i8* %y) nounwind { ; ; X64-LABEL: length32_lt: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB51_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB51_4 ; X64-NEXT: # %bb.2: # %loadbb2 -; X64-NEXT: movq 16(%rdi), %rcx -; X64-NEXT: movq 16(%rsi), %rdx +; X64-NEXT: movq 16(%rdi), %rax +; X64-NEXT: movq 16(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB51_4 ; X64-NEXT: # %bb.3: # %loadbb3 -; X64-NEXT: movq 24(%rdi), %rcx -; X64-NEXT: movq 24(%rsi), %rdx +; X64-NEXT: movq 24(%rdi), %rax +; X64-NEXT: movq 24(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: je .LBB51_5 ; X64-NEXT: .LBB51_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB51_5: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB51_5: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ 
-2638,38 +2679,40 @@ define i1 @length32_gt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB52_4 +; X64-NEXT: jne .LBB52_6 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB52_4 +; X64-NEXT: jne .LBB52_6 ; X64-NEXT: # %bb.2: # %loadbb2 ; X64-NEXT: movq 16(%rdi), %rax ; X64-NEXT: movq 16(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB52_4 +; X64-NEXT: jne .LBB52_6 ; X64-NEXT: # %bb.3: # %loadbb3 ; X64-NEXT: movq 24(%rdi), %rax ; X64-NEXT: movq 24(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB52_5 -; X64-NEXT: .LBB52_4: # %res_block +; X64-NEXT: je .LBB52_4 +; X64-NEXT: .LBB52_6: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: .LBB52_5: # %endblock -; X64-NEXT: testl %edx, %edx +; X64-NEXT: testl %eax, %eax ; X64-NEXT: setg %al ; X64-NEXT: retq +; X64-NEXT: .LBB52_4: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: jmp .LBB52_5 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind %cmp = icmp sgt i32 %call, 0 ret i1 %cmp diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index 594a4a68dac00f..03e70e6720972f 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -120,16 +120,16 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: rolw $8, %dx ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB4_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB4_2 
-; X86-NEXT: .LBB4_3: # %res_block +; X86-NEXT: je .LBB4_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: jmp .LBB4_2 +; X86-NEXT: .LBB4_1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: .LBB4_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -141,16 +141,16 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize { ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax -; X64-NEXT: jne .LBB4_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 2(%rdi), %eax -; X64-NEXT: movzbl 2(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB4_3: # %res_block +; X64-NEXT: je .LBB4_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m @@ -265,16 +265,16 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB9_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB9_2 -; X86-NEXT: .LBB9_3: # %res_block +; X86-NEXT: je .LBB9_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: jmp .LBB9_2 +; X86-NEXT: .LBB9_1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: .LBB9_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl @@ -286,16 +286,16 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize { ; X64-NEXT: bswapl %eax ; 
X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB9_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 4(%rdi), %eax -; X64-NEXT: movzbl 4(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB9_3: # %res_block +; X64-NEXT: je .LBB9_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m @@ -334,30 +334,32 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize { ; X86-LABEL: length8: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_2 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB11_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB11_2 +; X86-NEXT: .LBB11_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: .LBB11_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB11_2: +; X86-NEXT: 
xorl %eax, %eax +; X86-NEXT: jmp .LBB11_3 ; ; X64-LABEL: length8: ; X64: # %bb.0: @@ -461,26 +463,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize { ; ; X64-LABEL: length12: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_2 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB15_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: movl 8(%rdi), %eax +; X64-NEXT: movl 8(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB15_3 -; X64-NEXT: .LBB15_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB15_2 +; X64-NEXT: .LBB15_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB15_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -501,26 +504,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize { ; ; X64-LABEL: length16: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_2 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB16_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; 
X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB16_2 +; X64-NEXT: .LBB16_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB16_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index 75e9f5975d95c5..d9273a52c523a4 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -120,19 +120,19 @@ define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 { ; X86-NEXT: rolw $8, %dx ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB4_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB4_2 -; X86-NEXT: .LBB4_3: # %res_block +; X86-NEXT: je .LBB4_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax ; X86-NEXT: .LBB4_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB4_1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB4_2 ; ; X64-LABEL: length3: ; X64: # %bb.0: @@ -141,16 +141,16 @@ define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 { ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax -; X64-NEXT: jne .LBB4_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 2(%rdi), %eax -; X64-NEXT: movzbl 2(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB4_3: # %res_block 
+; X64-NEXT: je .LBB4_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB4_1: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m @@ -265,19 +265,19 @@ define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB9_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB9_2 -; X86-NEXT: .LBB9_3: # %res_block +; X86-NEXT: je .LBB9_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax ; X86-NEXT: .LBB9_2: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB9_1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: jmp .LBB9_2 ; ; X64-LABEL: length5: ; X64: # %bb.0: @@ -286,16 +286,16 @@ define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB9_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 4(%rdi), %eax -; X64-NEXT: movzbl 4(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB9_3: # %res_block +; X64-NEXT: je .LBB9_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB9_1: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m @@ -334,30 +334,32 @@ define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 { ; 
X86-LABEL: length8: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB11_2 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB11_3 -; X86-NEXT: .LBB11_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB11_1 +; X86-NEXT: .LBB11_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: .LBB11_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB11_1: # %loadbb1 +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax +; X86-NEXT: bswapl %ecx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB11_4 +; X86-NEXT: # %bb.2: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: jmp .LBB11_3 ; ; X64-LABEL: length8: ; X64: # %bb.0: @@ -461,26 +463,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 { ; ; X64-LABEL: length12: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB15_2 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB15_1 +; X64-NEXT: .LBB15_4: # %res_block +; X64-NEXT: xorl 
%edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB15_1: # %loadbb1 +; X64-NEXT: movl 8(%rdi), %eax +; X64-NEXT: movl 8(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB15_3 -; X64-NEXT: .LBB15_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB15_4 +; X64-NEXT: # %bb.2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB15_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -501,26 +504,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 { ; ; X64-LABEL: length16: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB16_2 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB16_1 +; X64-NEXT: .LBB16_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB16_1: # %loadbb1 +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB16_4 +; X64-NEXT: # %bb.2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB16_3 -; X64-NEXT: .LBB16_2: # %res_block -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB16_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, 
i8* %Y, i64 16) nounwind ret i32 %m diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index de604ded08d1aa..233183b7f9ad59 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -285,19 +285,19 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { ; X86-NEXT: rolw $8, %dx ; X86-NEXT: rolw $8, %si ; X86-NEXT: cmpw %si, %dx -; X86-NEXT: jne .LBB11_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 2(%eax), %eax -; X86-NEXT: movzbl 2(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB11_3: # %res_block +; X86-NEXT: je .LBB11_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB11_1: # %loadbb1 +; X86-NEXT: movzbl 2(%eax), %eax +; X86-NEXT: movzbl 2(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl ; ; X64-LABEL: length3: ; X64: # %bb.0: @@ -306,16 +306,16 @@ define i32 @length3(i8* %X, i8* %Y) nounwind { ; X64-NEXT: rolw $8, %ax ; X64-NEXT: rolw $8, %cx ; X64-NEXT: cmpw %cx, %ax -; X64-NEXT: jne .LBB11_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 2(%rdi), %eax -; X64-NEXT: movzbl 2(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB11_3: # %res_block +; X64-NEXT: je .LBB11_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB11_1: # %loadbb1 +; X64-NEXT: movzbl 2(%rdi), %eax +; X64-NEXT: movzbl 2(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind ret i32 %m @@ -500,19 +500,19 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB18_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; 
X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: popl %esi -; X86-NEXT: retl -; X86-NEXT: .LBB18_3: # %res_block +; X86-NEXT: je .LBB18_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB18_1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl ; ; X64-LABEL: length5: ; X64: # %bb.0: @@ -521,16 +521,16 @@ define i32 @length5(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB18_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 4(%rdi), %eax -; X64-NEXT: movzbl 4(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: retq -; X64-NEXT: .LBB18_3: # %res_block +; X64-NEXT: je .LBB18_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB18_1: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind ret i32 %m @@ -576,16 +576,16 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { ; X86-NEXT: bswapl %edx ; X86-NEXT: bswapl %esi ; X86-NEXT: cmpl %esi, %edx -; X86-NEXT: jne .LBB20_3 -; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movzbl 4(%eax), %eax -; X86-NEXT: movzbl 4(%ecx), %ecx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: jmp .LBB20_2 -; X86-NEXT: .LBB20_3: # %res_block +; X86-NEXT: je .LBB20_1 +; X86-NEXT: # %bb.3: # %res_block ; X86-NEXT: setae %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: jmp .LBB20_2 +; X86-NEXT: .LBB20_1: # %loadbb1 +; X86-NEXT: movzbl 4(%eax), %eax +; X86-NEXT: movzbl 4(%ecx), %ecx +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: .LBB20_2: # %endblock ; X86-NEXT: shrl 
$31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax @@ -599,20 +599,20 @@ define i1 @length5_lt(i8* %X, i8* %Y) nounwind { ; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx ; X64-NEXT: cmpl %ecx, %eax -; X64-NEXT: jne .LBB20_3 -; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movzbl 4(%rdi), %eax -; X64-NEXT: movzbl 4(%rsi), %ecx -; X64-NEXT: subl %ecx, %eax -; X64-NEXT: shrl $31, %eax -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq -; X64-NEXT: .LBB20_3: # %res_block +; X64-NEXT: je .LBB20_1 +; X64-NEXT: # %bb.3: # %res_block ; X64-NEXT: setae %al ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: leal -1(%rax,%rax), %eax ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB20_1: # %loadbb1 +; X64-NEXT: movzbl 4(%rdi), %eax +; X64-NEXT: movzbl 4(%rsi), %ecx +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind %c = icmp slt i32 %m, 0 @@ -623,53 +623,56 @@ define i32 @length7(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length7: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB21_2 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB21_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: movl 3(%esi), %eax +; X86-NEXT: movl 3(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB21_3 -; X86-NEXT: .LBB21_2: # %res_block +; X86-NEXT: cmpl %ecx, %eax +; 
X86-NEXT: je .LBB21_2 +; X86-NEXT: .LBB21_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB21_2: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB21_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: length7: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: jne .LBB21_2 +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB21_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 3(%rdi), %ecx -; X64-NEXT: movl 3(%rsi), %edx +; X64-NEXT: movl 3(%rdi), %eax +; X64-NEXT: movl 3(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: je .LBB21_3 -; X64-NEXT: .LBB21_2: # %res_block +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: je .LBB21_2 +; X64-NEXT: .LBB21_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB21_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB21_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 7) nounwind ret i32 %m @@ -679,55 +682,60 @@ define i1 @length7_lt(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length7_lt: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), 
%ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB22_2 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB22_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 3(%esi), %ecx -; X86-NEXT: movl 3(%eax), %edx +; X86-NEXT: movl 3(%esi), %eax +; X86-NEXT: movl 3(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB22_3 -; X86-NEXT: .LBB22_2: # %res_block -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB22_2 +; X86-NEXT: .LBB22_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax ; X86-NEXT: .LBB22_3: # %endblock ; X86-NEXT: shrl $31, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: retl +; X86-NEXT: .LBB22_2: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: jmp .LBB22_3 ; ; X64-LABEL: length7_lt: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: movl (%rsi), %edx +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl (%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: jne .LBB22_2 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 3(%rdi), %ecx -; X64-NEXT: movl 3(%rsi), %edx +; X64-NEXT: movl 3(%rdi), %eax +; X64-NEXT: movl 3(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx +; X64-NEXT: cmpl %ecx, %eax ; X64-NEXT: je .LBB22_3 ; X64-NEXT: .LBB22_2: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed 
$eax +; X64-NEXT: retq +; X64-NEXT: .LBB22_3: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpl %edx, %ecx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB22_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -767,28 +775,30 @@ define i32 @length8(i8* %X, i8* %Y) nounwind { ; X86-LABEL: length8: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%esi), %eax +; X86-NEXT: movl (%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: jne .LBB24_2 +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: jne .LBB24_4 ; X86-NEXT: # %bb.1: # %loadbb1 -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: movl 4(%esi), %eax +; X86-NEXT: movl 4(%edx), %ecx +; X86-NEXT: bswapl %eax ; X86-NEXT: bswapl %ecx -; X86-NEXT: bswapl %edx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: je .LBB24_3 -; X86-NEXT: .LBB24_2: # %res_block +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: je .LBB24_2 +; X86-NEXT: .LBB24_4: # %res_block +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: setae %dl +; X86-NEXT: leal -1(%edx,%edx), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB24_2: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: cmpl %edx, %ecx -; X86-NEXT: setae %al -; X86-NEXT: leal -1(%eax,%eax), %eax -; X86-NEXT: .LBB24_3: # %endblock ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -977,26 +987,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length12: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: 
cmpq %rdx, %rcx -; X64-NEXT: jne .LBB31_2 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB31_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movl 8(%rdi), %ecx -; X64-NEXT: movl 8(%rsi), %edx +; X64-NEXT: movl 8(%rdi), %eax +; X64-NEXT: movl 8(%rsi), %ecx +; X64-NEXT: bswapl %eax ; X64-NEXT: bswapl %ecx -; X64-NEXT: bswapl %edx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB31_3 -; X64-NEXT: .LBB31_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB31_2 +; X64-NEXT: .LBB31_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB31_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB31_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind ret i32 %m @@ -1069,26 +1080,27 @@ define i32 @length15(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length15: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB34_2 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB34_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 7(%rdi), %rcx -; X64-NEXT: movq 7(%rsi), %rdx +; X64-NEXT: movq 7(%rdi), %rax +; X64-NEXT: movq 7(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB34_3 -; X64-NEXT: .LBB34_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB34_2 +; X64-NEXT: .LBB34_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB34_2: ; X64-NEXT: xorl %eax, %eax -; 
X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB34_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 15) nounwind ret i32 %m @@ -1109,26 +1121,29 @@ define i1 @length15_lt(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length15_lt: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB35_2 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 7(%rdi), %rcx -; X64-NEXT: movq 7(%rsi), %rdx +; X64-NEXT: movq 7(%rdi), %rax +; X64-NEXT: movq 7(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: je .LBB35_3 ; X64-NEXT: .LBB35_2: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB35_3: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB35_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq @@ -1256,26 +1271,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind { ; ; X64-LABEL: length16: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: jne .LBB39_2 +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: jne .LBB39_4 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; 
X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: je .LBB39_3 -; X64-NEXT: .LBB39_2: # %res_block +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: je .LBB39_2 +; X64-NEXT: .LBB39_4: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: retq +; X64-NEXT: .LBB39_2: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB39_3: # %endblock ; X64-NEXT: retq %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind ret i32 %m @@ -1385,26 +1401,29 @@ define i1 @length16_lt(i8* %x, i8* %y) nounwind { ; ; X64-LABEL: length16_lt: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rcx -; X64-NEXT: movq (%rsi), %rdx +; X64-NEXT: movq (%rdi), %rax +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: jne .LBB41_2 ; X64-NEXT: # %bb.1: # %loadbb1 -; X64-NEXT: movq 8(%rdi), %rcx -; X64-NEXT: movq 8(%rsi), %rdx +; X64-NEXT: movq 8(%rdi), %rax +; X64-NEXT: movq 8(%rsi), %rcx +; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: bswapq %rdx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx +; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: je .LBB41_3 ; X64-NEXT: .LBB41_2: # %res_block +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: cmpq %rcx, %rax +; X64-NEXT: setae %dl +; X64-NEXT: leal -1(%rdx,%rdx), %eax +; X64-NEXT: shrl $31, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB41_3: ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: cmpq %rdx, %rcx -; X64-NEXT: setae %al -; X64-NEXT: leal -1(%rax,%rax), %eax -; X64-NEXT: .LBB41_3: # %endblock ; X64-NEXT: shrl $31, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; 
X64-NEXT: retq @@ -1433,24 +1452,26 @@ define i1 @length16_gt(i8* %x, i8* %y) nounwind { ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: jne .LBB42_2 +; X64-NEXT: jne .LBB42_4 ; X64-NEXT: # %bb.1: # %loadbb1 ; X64-NEXT: movq 8(%rdi), %rax ; X64-NEXT: movq 8(%rsi), %rcx ; X64-NEXT: bswapq %rax ; X64-NEXT: bswapq %rcx -; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax -; X64-NEXT: je .LBB42_3 -; X64-NEXT: .LBB42_2: # %res_block +; X64-NEXT: je .LBB42_2 +; X64-NEXT: .LBB42_4: # %res_block ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: cmpq %rcx, %rax ; X64-NEXT: setae %dl -; X64-NEXT: leal -1(%rdx,%rdx), %edx +; X64-NEXT: leal -1(%rdx,%rdx), %eax ; X64-NEXT: .LBB42_3: # %endblock -; X64-NEXT: testl %edx, %edx +; X64-NEXT: testl %eax, %eax ; X64-NEXT: setg %al ; X64-NEXT: retq +; X64-NEXT: .LBB42_2: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: jmp .LBB42_3 %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind %cmp = icmp sgt i32 %call, 0 ret i1 %cmp diff --git a/llvm/test/CodeGen/X86/neg_cmp.ll b/llvm/test/CodeGen/X86/neg_cmp.ll index 47fa7fbb88f0d1..8cc1ed7ff1231e 100644 --- a/llvm/test/CodeGen/X86/neg_cmp.ll +++ b/llvm/test/CodeGen/X86/neg_cmp.ll @@ -10,11 +10,11 @@ define void @neg_cmp(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: neg_cmp: ; CHECK: # %bb.0: ; CHECK-NEXT: addl %esi, %edi -; CHECK-NEXT: jne .LBB0_1 -; CHECK-NEXT: # %bb.2: # %if.then -; CHECK-NEXT: jmp g # TAILCALL -; CHECK-NEXT: .LBB0_1: # %if.end +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_2: # %if.then +; CHECK-NEXT: jmp g # TAILCALL %sub = sub i32 0, %y %cmp = icmp eq i32 %x, %sub br i1 %cmp, label %if.then, label %if.end @@ -31,11 +31,11 @@ define void @neg_cmp_commuted(i32 %x, i32 %y) nounwind { ; CHECK-LABEL: neg_cmp_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: addl %esi, %edi -; CHECK-NEXT: jne .LBB1_1 -; CHECK-NEXT: # %bb.2: # %if.then -; CHECK-NEXT: jmp g # TAILCALL -; CHECK-NEXT: .LBB1_1: # %if.end 
+; CHECK-NEXT: je .LBB1_2 +; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: retq +; CHECK-NEXT: .LBB1_2: # %if.then +; CHECK-NEXT: jmp g # TAILCALL %sub = sub i32 0, %y %cmp = icmp eq i32 %sub, %x br i1 %cmp, label %if.then, label %if.end diff --git a/llvm/test/CodeGen/X86/nobt.ll b/llvm/test/CodeGen/X86/nobt.ll index b994b7f950fd2c..5e3926dd07a806 100644 --- a/llvm/test/CodeGen/X86/nobt.ll +++ b/llvm/test/CodeGen/X86/nobt.ll @@ -9,10 +9,11 @@ define void @test2(i32 %x, i32 %n) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: # %bb.1: # %bb +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.2: # %UnifiedReturnBlock +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_1: # %bb ; CHECK-NEXT: calll foo -; CHECK-NEXT: .LBB0_2: # %UnifiedReturnBlock ; CHECK-NEXT: retl entry: %tmp1 = and i32 %x, 1 @@ -34,10 +35,11 @@ define void @test3(i32 %x, i32 %n) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB1_2 -; CHECK-NEXT: # %bb.1: # %bb +; CHECK-NEXT: je .LBB1_1 +; CHECK-NEXT: # %bb.2: # %UnifiedReturnBlock +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB1_1: # %bb ; CHECK-NEXT: calll foo -; CHECK-NEXT: .LBB1_2: # %UnifiedReturnBlock ; CHECK-NEXT: retl entry: %tmp1 = and i32 %x, 1 diff --git a/llvm/test/CodeGen/X86/pr29170.ll b/llvm/test/CodeGen/X86/pr29170.ll index dfbad021d2871f..d5cfc51bf41650 100644 --- a/llvm/test/CodeGen/X86/pr29170.ll +++ b/llvm/test/CodeGen/X86/pr29170.ll @@ -11,8 +11,11 @@ define i32 @main() { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_3 -; CHECK-NEXT: # %bb.1: # %go +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: .LBB0_3: # %if.else +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_1: # %go ; CHECK-NEXT: movl $-1, %ecx ; CHECK-NEXT: movsbl b, %edx ; CHECK-NEXT: notl %ecx @@ -23,9 +26,6 @@ define i32 @main() { ; CHECK-NEXT: # %bb.2: # 
%if.then ; CHECK-NEXT: movl $42, %eax ; CHECK-NEXT: retl -; CHECK-NEXT: .LBB0_3: # %if.else -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: retl entry: %true = icmp eq i32 0, 0 %const = bitcast i64 -4294967296 to i64 diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll index a15d633d85381d..cc4857a16c5806 100644 --- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll +++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll @@ -9,13 +9,13 @@ define i32 @branch_eq(i64 %a, i64 %b) { ; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: orl %ecx, %eax -; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: # %bb.1: # %bb1 -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: retl -; CHECK-NEXT: .LBB0_2: # %bb2 +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.2: # %bb2 ; CHECK-NEXT: movl $2, %eax ; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_1: # %bb1 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retl entry: %cmp = icmp eq i64 %a, %b br i1 %cmp, label %bb1, label %bb2 @@ -55,13 +55,13 @@ define i32 @branch_ule(i64 %a, i64 %b) { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: jb .LBB2_2 -; CHECK-NEXT: # %bb.1: # %bb1 -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: retl -; CHECK-NEXT: .LBB2_2: # %bb2 +; CHECK-NEXT: jae .LBB2_1 +; CHECK-NEXT: # %bb.2: # %bb2 ; CHECK-NEXT: movl $2, %eax ; CHECK-NEXT: retl +; CHECK-NEXT: .LBB2_1: # %bb1 +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retl entry: %cmp = icmp ule i64 %a, %b br i1 %cmp, label %bb1, label %bb2 diff --git a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll index 719d69d16a625d..ea89c76c4d93e0 100644 --- a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll +++ b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll @@ -16,8 +16,11 @@ ; ASM: .cv_loc 0 1 3 9 # t.c:3:9 ; ASM: movl %ecx, %eax ; ASM: cmpl %edx, %ecx -; ASM: jl [[EPILOGUE:LBB0_[0-9]+]] +; ASM: jge LBB0_1 +; 
ASM: retl $8 + +; ASM: LBB0_1: ; ASM: pushl %ebx ; ASM: .cv_fpo_pushreg %ebx ; ASM: pushl %edi @@ -31,9 +34,7 @@ ; ASM: popl %esi ; ASM: popl %edi ; ASM: popl %ebx -; ASM: [[EPILOGUE]]: # %return -; ASM: retl $8 -; ASM: Ltmp10: +; ASM: Ltmp11: ; ASM: .cv_fpo_endproc ; Note how RvaStart advances 7 bytes to skip the shrink-wrapped portion. @@ -41,7 +42,7 @@ ; OBJ: FrameData { ; OBJ: RvaStart: 0x0 ; OBJ: CodeSize: 0x36 -; OBJ: PrologSize: 0x9 +; OBJ: PrologSize: 0xC ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = ; OBJ-NEXT: $eip $T0 ^ = @@ -49,8 +50,8 @@ ; OBJ-NEXT: ] ; OBJ: } ; OBJ: FrameData { -; OBJ: RvaStart: 0x7 -; OBJ: CodeSize: 0x2F +; OBJ: RvaStart: 0xA +; OBJ: CodeSize: 0x2C ; OBJ: PrologSize: 0x2 ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = @@ -60,8 +61,8 @@ ; OBJ-NEXT: ] ; OBJ: } ; OBJ: FrameData { -; OBJ: RvaStart: 0x8 -; OBJ: CodeSize: 0x2E +; OBJ: RvaStart: 0xB +; OBJ: CodeSize: 0x2B ; OBJ: PrologSize: 0x1 ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = @@ -72,8 +73,8 @@ ; OBJ-NEXT: ] ; OBJ: } ; OBJ: FrameData { -; OBJ: RvaStart: 0x9 -; OBJ: CodeSize: 0x2D +; OBJ: RvaStart: 0xC +; OBJ: CodeSize: 0x2A ; OBJ: PrologSize: 0x0 ; OBJ: FrameFunc [ ; OBJ-NEXT: $T0 .raSearch = diff --git a/llvm/test/Transforms/PGOProfile/counter_promo_mexits.ll b/llvm/test/Transforms/PGOProfile/counter_promo_mexits.ll index bb799757a47cc2..f462eba5a3ab9b 100644 --- a/llvm/test/Transforms/PGOProfile/counter_promo_mexits.ll +++ b/llvm/test/Transforms/PGOProfile/counter_promo_mexits.ll @@ -1,27 +1,95 @@ -; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s -; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true 
-speculative-counter-promotion-max-exiting=3 -S | FileCheck %s --check-prefix=PROMO +; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck %s --check-prefix=PROMO @g = common local_unnamed_addr global i32 0, align 4 define void @foo(i32 %arg) local_unnamed_addr { -; PROMO-LABEL: @foo +; PROMO-LABEL: @foo( +; PROMO-NEXT: bb: +; PROMO-NEXT: [[T:%.*]] = add nsw i32 [[ARG:%.*]], -1 +; PROMO-NEXT: br label [[BB1:%.*]] +; PROMO: bb1: +; PROMO-NEXT: [[PGOCOUNT213:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[PGOCOUNT212:%.*]], [[BB11:%.*]] ] +; PROMO-NEXT: [[PGOCOUNT19:%.*]] = phi i64 [ 0, [[BB]] ], [ [[PGOCOUNT18:%.*]], [[BB11]] ] +; PROMO-NEXT: [[PGOCOUNT6:%.*]] = phi i64 [ 0, [[BB]] ], [ [[PGOCOUNT5:%.*]], [[BB11]] ] +; PROMO-NEXT: [[T2:%.*]] = phi i32 [ 0, [[BB]] ], [ [[T12:%.*]], [[BB11]] ] +; PROMO-NEXT: [[T3:%.*]] = icmp sgt i32 [[T2]], [[ARG]] +; PROMO-NEXT: br i1 [[T3]], label [[BB7:%.*]], label [[BB4:%.*]] +; PROMO: bb4: +; PROMO-NEXT: [[TMP0:%.*]] = add i64 [[PGOCOUNT6]], 1 +; PROMO-NEXT: tail call void @bar(i32 1) +; PROMO-NEXT: [[T5:%.*]] = load i32, i32* @g, align 4 +; PROMO-NEXT: [[T6:%.*]] = icmp sgt i32 [[T5]], 100 +; PROMO-NEXT: br i1 [[T6]], label [[BB15_0:%.*]], label [[BB11]] +; PROMO: bb7: +; PROMO-NEXT: [[T8:%.*]] = icmp slt i32 [[T2]], [[T]] +; PROMO-NEXT: br i1 [[T8]], label [[BB9:%.*]], label [[BB10:%.*]] +; PROMO: bb9: +; PROMO-NEXT: [[TMP1:%.*]] = add i64 [[PGOCOUNT19]], 1 +; PROMO-NEXT: tail call void @bar(i32 2) +; PROMO-NEXT: br label [[BB11]] +; PROMO: bb10: +; PROMO-NEXT: [[TMP2:%.*]] = add i64 [[PGOCOUNT213]], 1 +; PROMO-NEXT: tail call void @bar(i32 3) +; PROMO-NEXT: br label [[BB11]] +; PROMO: bb11: +; PROMO-NEXT: [[PGOCOUNT212]] = phi i64 [ [[TMP2]], [[BB10]] ], [ [[PGOCOUNT213]], [[BB9]] ], [ [[PGOCOUNT213]], [[BB4]] ] +; PROMO-NEXT: [[PGOCOUNT18]] = phi i64 [ [[PGOCOUNT19]], [[BB10]] ], [ [[TMP1]], [[BB9]] ], [ [[PGOCOUNT19]], [[BB4]] ] +; PROMO-NEXT: 
[[PGOCOUNT5]] = phi i64 [ [[PGOCOUNT6]], [[BB10]] ], [ [[PGOCOUNT6]], [[BB9]] ], [ [[TMP0]], [[BB4]] ] +; PROMO-NEXT: [[T12]] = add nuw nsw i32 [[T2]], 1 +; PROMO-NEXT: [[T13:%.*]] = icmp slt i32 [[T2]], 99 +; PROMO-NEXT: br i1 [[T13]], label [[BB1]], label [[BB14:%.*]] +; PROMO: bb14: +; PROMO-NEXT: [[PGOCOUNT_PROMOTED7:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 1), align 4 +; PROMO-NEXT: [[TMP3:%.*]] = add i64 [[PGOCOUNT_PROMOTED7]], [[PGOCOUNT5]] +; PROMO-NEXT: store i64 [[TMP3]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 1), align 4 +; PROMO-NEXT: [[PGOCOUNT_PROMOTED11:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 0), align 4 +; PROMO-NEXT: [[TMP4:%.*]] = add i64 [[PGOCOUNT_PROMOTED11]], [[PGOCOUNT18]] +; PROMO-NEXT: store i64 [[TMP4]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 0), align 4 +; PROMO-NEXT: [[PGOCOUNT_PROMOTED15:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 2), align 4 +; PROMO-NEXT: [[TMP5:%.*]] = add i64 [[PGOCOUNT_PROMOTED15]], [[PGOCOUNT212]] +; PROMO-NEXT: store i64 [[TMP5]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 2), align 4 +; PROMO-NEXT: [[PGOCOUNT3:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 3), align 4 +; PROMO-NEXT: [[TMP6:%.*]] = add i64 [[PGOCOUNT3]], 1 +; PROMO-NEXT: store i64 [[TMP6]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 3), align 4 +; PROMO-NEXT: tail call void @bar(i32 0) +; PROMO-NEXT: br label [[BB15:%.*]] +; PROMO: bb15_0: +; PROMO-NEXT: [[PGOCOUNT_PROMOTED:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 1), align 4 +; PROMO-NEXT: [[TMP7:%.*]] = add i64 [[PGOCOUNT_PROMOTED]], [[TMP0]] +; PROMO-NEXT: store i64 [[TMP7]], i64* getelementptr inbounds 
([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 1), align 4 +; PROMO-NEXT: [[PGOCOUNT_PROMOTED10:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 0), align 4 +; PROMO-NEXT: [[TMP8:%.*]] = add i64 [[PGOCOUNT_PROMOTED10]], [[PGOCOUNT19]] +; PROMO-NEXT: store i64 [[TMP8]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 0), align 4 +; PROMO-NEXT: [[PGOCOUNT_PROMOTED14:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 2), align 4 +; PROMO-NEXT: [[TMP9:%.*]] = add i64 [[PGOCOUNT_PROMOTED14]], [[PGOCOUNT213]] +; PROMO-NEXT: store i64 [[TMP9]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 2), align 4 +; PROMO-NEXT: [[PGOCOUNT4:%.*]] = load i64, i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 4), align 4 +; PROMO-NEXT: [[TMP10:%.*]] = add i64 [[PGOCOUNT4]], 1 +; PROMO-NEXT: store i64 [[TMP10]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* @__profc_foo, i64 0, i64 4), align 4 +; PROMO-NEXT: br label [[BB15]] +; PROMO: bb15: +; PROMO-NEXT: tail call void @bar(i32 1) +; PROMO-NEXT: ret void +; bb: - %tmp = add nsw i32 %arg, -1 + %t = add nsw i32 %arg, -1 br label %bb1 bb1: ; preds = %bb11, %bb - %tmp2 = phi i32 [ 0, %bb ], [ %tmp12, %bb11 ] - %tmp3 = icmp sgt i32 %tmp2, %arg - br i1 %tmp3, label %bb7, label %bb4 + %t2 = phi i32 [ 0, %bb ], [ %t12, %bb11 ] + %t3 = icmp sgt i32 %t2, %arg + br i1 %t3, label %bb7, label %bb4 bb4: ; preds = %bb1 tail call void @bar(i32 1) - %tmp5 = load i32, i32* @g, align 4 - %tmp6 = icmp sgt i32 %tmp5, 100 - br i1 %tmp6, label %bb15_0, label %bb11 + %t5 = load i32, i32* @g, align 4 + %t6 = icmp sgt i32 %t5, 100 + br i1 %t6, label %bb15_0, label %bb11 bb7: ; preds = %bb1 - %tmp8 = icmp slt i32 %tmp2, %tmp - br i1 %tmp8, label %bb9, label %bb10 + %t8 = icmp slt i32 %t2, %t + br i1 %t8, label %bb9, label %bb10 bb9: ; preds = %bb7 tail call void @bar(i32 2) @@ -32,45 
+100,18 @@ bb10: ; preds = %bb7 br label %bb11 bb11: ; preds = %bb10, %bb9, %bb4 - %tmp12 = add nuw nsw i32 %tmp2, 1 - %tmp13 = icmp slt i32 %tmp2, 99 - br i1 %tmp13, label %bb1, label %bb14 + %t12 = add nuw nsw i32 %t2, 1 + %t13 = icmp slt i32 %t2, 99 + br i1 %t13, label %bb1, label %bb14 bb14: ; preds = %bb11 -; PROMO-LABEL: bb14: tail call void @bar(i32 0) br label %bb15 -; PROMO: %pgocount.promoted{{.*}} = load {{.*}} @__profc_foo{{.*}} 0) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}0) -; PROMO-NEXT: %pgocount.promoted{{.*}} = load {{.*}} @__profc_foo{{.*}} 1) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}1) -; PROMO-NEXT: %pgocount.promoted{{.*}} = load {{.*}} @__profc_foo{{.*}} 2) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}2) -; PROMO-NEXT: %pgocount{{.*}} = load {{.*}} @__profc_foo{{.*}} 3) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}3) + bb15_0: ; preds = %bb11 -; PROMO-LABEL: bb15_0: br label %bb15 -; PROMO: %pgocount.promoted{{.*}} = load {{.*}} @__profc_foo{{.*}} 0) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}0) -; PROMO-NEXT: %pgocount.promoted{{.*}} = load {{.*}} @__profc_foo{{.*}} 1) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}1) -; PROMO-NEXT: %pgocount.promoted{{.*}} = load {{.*}} @__profc_foo{{.*}} 2) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}2) -; PROMO-NEXT: %pgocount{{.*}} = load {{.*}} @__profc_foo{{.*}} 4) -; PROMO-NEXT: add -; PROMO-NEXT: store {{.*}}@__profc_foo{{.*}}4) -; PROMO-NOT: @__profc_foo - bb15: ; preds = %bb14, %bb4 tail call void @bar(i32 1) diff --git a/llvm/test/Transforms/PGOProfile/landingpad.ll b/llvm/test/Transforms/PGOProfile/landingpad.ll index a0ca799fa8a43b..5191e118210fec 100644 --- a/llvm/test/Transforms/PGOProfile/landingpad.ll +++ b/llvm/test/Transforms/PGOProfile/landingpad.ll @@ -85,7 +85,8 @@ catch.dispatch: ; GEN: catch.dispatch: ; GEN-NOT: call void 
@llvm.instrprof.increment %tmp3 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) - %matches = icmp eq i32 %tmp2, %tmp3 + %c = icmp ne i32 %tmp2, %tmp3 + %matches = xor i1 %c, -1 br i1 %matches, label %catch, label %eh.resume ; USE: br i1 %matches, label %catch, label %eh.resume ; USE-SAME: !prof ![[BW_CATCH_DISPATCH:[0-9]+]] From 8d58eb11f9dabacc37f1f5e2cc83149b24868180 Mon Sep 17 00:00:00 2001 From: Sameer Arora Date: Wed, 29 Jul 2020 07:40:11 -0700 Subject: [PATCH 02/23] [llvm-libtool-darwin] Refactor ArchiveWriter Refactoring function `writeArchive` in ArchiveWriter. Added a new function `writeArchiveBuffer` that returns the archive in a memory buffer instead of writing it out to the disk. This refactor is necessary so as to allow `llvm-libtool-darwin` to write universal files containing archives. Reviewed by jhenderson, MaskRay, smeenai Differential Revision: https://reviews.llvm.org/D84858 --- llvm/include/llvm/Object/ArchiveWriter.h | 6 +++ llvm/lib/Object/ArchiveWriter.cpp | 49 +++++++++++++++++++----- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/Object/ArchiveWriter.h b/llvm/include/llvm/Object/ArchiveWriter.h index 274ffd90c05aad..7eaf13e8fb2294 100644 --- a/llvm/include/llvm/Object/ArchiveWriter.h +++ b/llvm/include/llvm/Object/ArchiveWriter.h @@ -39,6 +39,12 @@ Error writeArchive(StringRef ArcName, ArrayRef NewMembers, bool WriteSymtab, object::Archive::Kind Kind, bool Deterministic, bool Thin, std::unique_ptr OldArchiveBuf = nullptr); + +// writeArchiveToBuffer is similar to writeArchive but returns the Archive in a +// buffer instead of writing it out to a file. 
+Expected> +writeArchiveToBuffer(ArrayRef NewMembers, bool WriteSymtab, + object::Archive::Kind Kind, bool Deterministic, bool Thin); } #endif diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 6f92c547164ba1..ca8ffa7706b0ea 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -26,6 +26,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/Path.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" @@ -552,10 +553,10 @@ Expected computeArchiveRelativePath(StringRef From, StringRef To) { return std::string(Relative.str()); } -Error writeArchive(StringRef ArcName, ArrayRef NewMembers, - bool WriteSymtab, object::Archive::Kind Kind, - bool Deterministic, bool Thin, - std::unique_ptr OldArchiveBuf) { +static Error writeArchiveToStream(raw_ostream &Out, + ArrayRef NewMembers, + bool WriteSymtab, object::Archive::Kind Kind, + bool Deterministic, bool Thin) { assert((!Thin || !isBSDLike(Kind)) && "Only the gnu format has a thin mode"); SmallString<0> SymNamesBuf; @@ -608,12 +609,6 @@ Error writeArchive(StringRef ArcName, ArrayRef NewMembers, } } - Expected Temp = - sys::fs::TempFile::create(ArcName + ".temp-archive-%%%%%%%.a"); - if (!Temp) - return Temp.takeError(); - - raw_fd_ostream Out(Temp->FD, false); if (Thin) Out << "!\n"; else @@ -626,6 +621,25 @@ Error writeArchive(StringRef ArcName, ArrayRef NewMembers, Out << M.Header << M.Data << M.Padding; Out.flush(); + return Error::success(); +} + +Error writeArchive(StringRef ArcName, ArrayRef NewMembers, + bool WriteSymtab, object::Archive::Kind Kind, + bool Deterministic, bool Thin, + std::unique_ptr OldArchiveBuf) { + Expected Temp = + sys::fs::TempFile::create(ArcName + ".temp-archive-%%%%%%%.a"); + if (!Temp) + return Temp.takeError(); + raw_fd_ostream Out(Temp->FD, false); + + if (Error E = writeArchiveToStream(Out, 
NewMembers, WriteSymtab, Kind, + Deterministic, Thin)) { + if (Error DiscardError = Temp->discard()) + return joinErrors(std::move(E), std::move(DiscardError)); + return E; + } // At this point, we no longer need whatever backing memory // was used to generate the NewMembers. On Windows, this buffer @@ -642,4 +656,19 @@ Error writeArchive(StringRef ArcName, ArrayRef NewMembers, return Temp->keep(ArcName); } +Expected> +writeArchiveToBuffer(ArrayRef NewMembers, bool WriteSymtab, + object::Archive::Kind Kind, bool Deterministic, + bool Thin) { + SmallVector ArchiveBufferVector; + raw_svector_ostream ArchiveStream(ArchiveBufferVector); + + if (Error E = writeArchiveToStream(ArchiveStream, NewMembers, WriteSymtab, + Kind, Deterministic, Thin)) + return std::move(E); + + return std::make_unique( + std::move(ArchiveBufferVector)); +} + } // namespace llvm From 05169af5cea2c3b9aa0f38354d0e81ddf6b7a3d9 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Thu, 13 Aug 2020 14:04:43 -0400 Subject: [PATCH 03/23] [flang][openacc] Handle optional end directive in combined construct OpenACC combined construct can have an optional end directive. This patch handle this case in the parsing/unparsing with a canonicalization step. Unlike OmpEndLoopDirective, this doesn't need a special treatment in the pre-fir tree as there is no clause attached to a AccEndCombinedDirective. 
Reviewed By: klausler Differential Revision: https://reviews.llvm.org/D84481 --- flang/include/flang/Parser/parse-tree.h | 7 +- flang/lib/Parser/executable-parsers.cpp | 3 +- flang/lib/Parser/openacc-parsers.cpp | 14 +-- flang/lib/Parser/program-parsers.cpp | 8 +- flang/lib/Parser/type-parsers.h | 1 + flang/lib/Parser/unparse.cpp | 5 +- flang/lib/Semantics/canonicalize-acc.cpp | 62 ++++++++++++ flang/lib/Semantics/check-acc-structure.cpp | 12 ++- flang/test/Lower/pre-fir-tree05.f90 | 16 ++++ .../acc-canonicalization-validity.f90 | 95 +++++++++++++++++++ flang/test/Semantics/acc-clause-validity.f90 | 25 +++++ 11 files changed, 230 insertions(+), 18 deletions(-) create mode 100644 flang/test/Semantics/acc-canonicalization-validity.f90 diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 2fecac5118d844..695121f8395950 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -258,6 +258,7 @@ struct AssignStmt; struct AssignedGotoStmt; struct PauseStmt; struct OpenACCConstruct; +struct AccEndCombinedDirective; struct OpenACCDeclarativeConstruct; struct OpenMPConstruct; struct OpenMPDeclarativeConstruct; @@ -517,6 +518,7 @@ struct ExecutableConstruct { common::Indirection, common::Indirection, common::Indirection, common::Indirection, + common::Indirection, common::Indirection, common::Indirection> u; @@ -3970,6 +3972,7 @@ struct OpenACCStandaloneDeclarativeConstruct { struct AccBeginCombinedDirective { TUPLE_CLASS_BOILERPLATE(AccBeginCombinedDirective); + CharBlock source; std::tuple t; }; @@ -3981,7 +3984,9 @@ struct AccEndCombinedDirective { struct OpenACCCombinedConstruct { TUPLE_CLASS_BOILERPLATE(OpenACCCombinedConstruct); CharBlock source; - std::tuple, std::optional> t; }; diff --git a/flang/lib/Parser/executable-parsers.cpp b/flang/lib/Parser/executable-parsers.cpp index d6dd4688dbac1f..a0b5cf232abf7f 100644 --- a/flang/lib/Parser/executable-parsers.cpp +++ 
b/flang/lib/Parser/executable-parsers.cpp @@ -50,8 +50,9 @@ constexpr auto executableConstruct{ construct(indirect(whereConstruct)), construct(indirect(forallConstruct)), construct(indirect(ompEndLoopDirective)), - construct(indirect(openaccConstruct)), construct(indirect(openmpConstruct)), + construct(indirect(accEndCombinedDirective)), + construct(indirect(openaccConstruct)), construct(indirect(compilerDirective)))}; // R510 execution-part-construct -> diff --git a/flang/lib/Parser/openacc-parsers.cpp b/flang/lib/Parser/openacc-parsers.cpp index 0a61921c90874c..823fbaec0acef1 100644 --- a/flang/lib/Parser/openacc-parsers.cpp +++ b/flang/lib/Parser/openacc-parsers.cpp @@ -199,16 +199,9 @@ TYPE_PARSER(sourced( parenthesized(Parser{})))) // 2.11 Combined constructs -TYPE_PARSER(startAccLine >> construct(sourced( - "END"_tok >> Parser{}))) - TYPE_PARSER(construct( sourced(Parser{}), Parser{})) -TYPE_PARSER(construct( - Parser{} / endAccLine, block, - maybe(Parser{} / endAccLine))) - // 2.12 Atomic constructs TYPE_PARSER(construct(startAccLine >> "END ATOMIC"_tok)) @@ -281,4 +274,11 @@ TYPE_CONTEXT_PARSER("OpenACC construct"_en_US, construct(Parser{}), construct(Parser{}), construct(Parser{}))) + +TYPE_PARSER(startAccLine >> sourced(construct(sourced( + "END"_tok >> Parser{})))) + +TYPE_PARSER(construct( + sourced(Parser{} / endAccLine))) + } // namespace Fortran::parser diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp index 1be1207c8626a8..278cc6fdb51a5c 100644 --- a/flang/lib/Parser/program-parsers.cpp +++ b/flang/lib/Parser/program-parsers.cpp @@ -76,10 +76,10 @@ TYPE_CONTEXT_PARSER("specification part"_en_US, // are in contexts that impose constraints on the kinds of statements that // are allowed, and so we have a variant production for declaration-construct // that implements those constraints. 
-constexpr auto execPartLookAhead{first(actionStmt >> ok, - ompEndLoopDirective >> ok, openaccConstruct >> ok, openmpConstruct >> ok, - "ASSOCIATE ("_tok, "BLOCK"_tok, "SELECT"_tok, "CHANGE TEAM"_sptok, - "CRITICAL"_tok, "DO"_tok, "IF ("_tok, "WHERE ("_tok, "FORALL ("_tok)}; +constexpr auto execPartLookAhead{ + first(actionStmt >> ok, openaccConstruct >> ok, openmpConstruct >> ok, + "ASSOCIATE ("_tok, "BLOCK"_tok, "SELECT"_tok, "CHANGE TEAM"_sptok, + "CRITICAL"_tok, "DO"_tok, "IF ("_tok, "WHERE ("_tok, "FORALL ("_tok)}; constexpr auto declErrorRecovery{ stmtErrorRecoveryStart >> !execPartLookAhead >> skipStmtErrorRecovery}; constexpr auto misplacedSpecificationStmt{Parser{} >> diff --git a/flang/lib/Parser/type-parsers.h b/flang/lib/Parser/type-parsers.h index a2f38e90db2120..d6269cbdc7151d 100644 --- a/flang/lib/Parser/type-parsers.h +++ b/flang/lib/Parser/type-parsers.h @@ -131,6 +131,7 @@ constexpr Parser entryStmt; // R1541 constexpr Parser containsStmt; // R1543 constexpr Parser compilerDirective; constexpr Parser openaccConstruct; +constexpr Parser accEndCombinedDirective; constexpr Parser openaccDeclarativeConstruct; constexpr Parser openmpConstruct; constexpr Parser openmpDeclarativeConstruct; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 1093cb21709c91..85ed1a2bd60b9d 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2104,10 +2104,9 @@ class UnparseVisitor { Walk(std::get(x.t)); Put("\n"); EndOpenACC(); - Walk(std::get(x.t), ""); + Walk(std::get>(x.t)); BeginOpenACC(); - Word("!$ACC END "); - Walk(std::get>(x.t)); + Walk("!$ACC END ", std::get>(x.t)); Put("\n"); EndOpenACC(); } diff --git a/flang/lib/Semantics/canonicalize-acc.cpp b/flang/lib/Semantics/canonicalize-acc.cpp index 4c4d716fe7defe..8cf04910ba6e13 100644 --- a/flang/lib/Semantics/canonicalize-acc.cpp +++ b/flang/lib/Semantics/canonicalize-acc.cpp @@ -16,6 +16,9 @@ // 1. move structured DoConstruct into // OpenACCLoopConstruct. 
Compilation will not proceed in case of errors // after this pass. +// 2. move structured DoConstruct into OpenACCCombinedConstruct. Move +// AccEndCombinedConstruct into OpenACCCombinedConstruct if present. +// Compilation will not proceed in case of errors after this pass. namespace Fortran::semantics { using namespace parser::literals; @@ -30,6 +33,16 @@ class CanonicalizationOfAcc { for (auto it{block.begin()}; it != block.end(); ++it) { if (auto *accLoop{parser::Unwrap(*it)}) { RewriteOpenACCLoopConstruct(*accLoop, block, it); + } else if (auto *accCombined{ + parser::Unwrap(*it)}) { + RewriteOpenACCCombinedConstruct(*accCombined, block, it); + } else if (auto *endDir{ + parser::Unwrap(*it)}) { + // Unmatched AccEndCombinedDirective + messages_.Say(endDir->v.source, + "The %s directive must follow the DO loop associated with the " + "loop construct"_err_en_US, + parser::ToUpperCaseLetters(endDir->v.source.ToString())); } } // Block list } @@ -73,6 +86,55 @@ class CanonicalizationOfAcc { parser::ToUpperCaseLetters(dir.source.ToString())); } + void RewriteOpenACCCombinedConstruct(parser::OpenACCCombinedConstruct &x, + parser::Block &block, parser::Block::iterator it) { + // Check the sequence of DoConstruct in the same iteration + // + // Original: + // ExecutableConstruct -> OpenACCConstruct -> OpenACCCombinedConstruct + // ACCBeginCombinedDirective + // ExecutableConstruct -> DoConstruct + // ExecutableConstruct -> AccEndCombinedDirective (if available) + // + // After rewriting: + // ExecutableConstruct -> OpenACCConstruct -> OpenACCCombinedConstruct + // ACCBeginCombinedDirective + // DoConstruct + // AccEndCombinedDirective (if available) + parser::Block::iterator nextIt; + auto &beginDir{std::get(x.t)}; + auto &dir{std::get(beginDir.t)}; + + nextIt = it; + if (++nextIt != block.end()) { + if (auto *doCons{parser::Unwrap(*nextIt)}) { + if (doCons->GetLoopControl()) { + // move DoConstruct + std::get>(x.t) = + std::move(*doCons); + nextIt = 
block.erase(nextIt); + // try to match AccEndCombinedDirective + if (nextIt != block.end()) { + if (auto *endDir{ + parser::Unwrap(*nextIt)}) { + std::get>(x.t) = + std::move(*endDir); + block.erase(nextIt); + } + } + } else { + messages_.Say(dir.source, + "DO loop after the %s directive must have loop control"_err_en_US, + parser::ToUpperCaseLetters(dir.source.ToString())); + } + return; // found do-loop + } + } + messages_.Say(dir.source, + "A DO loop must follow the %s directive"_err_en_US, + parser::ToUpperCaseLetters(dir.source.ToString())); + } + parser::Messages &messages_; }; diff --git a/flang/lib/Semantics/check-acc-structure.cpp b/flang/lib/Semantics/check-acc-structure.cpp index 311a7c4d3328e8..4dcf5ed27f7058 100644 --- a/flang/lib/Semantics/check-acc-structure.cpp +++ b/flang/lib/Semantics/check-acc-structure.cpp @@ -156,9 +156,17 @@ void AccStructureChecker::Leave( } void AccStructureChecker::Enter(const parser::OpenACCCombinedConstruct &x) { - const auto &beginBlockDir{std::get(x.t)}; + const auto &beginCombinedDir{ + std::get(x.t)}; const auto &combinedDir{ - std::get(beginBlockDir.t)}; + std::get(beginCombinedDir.t)}; + + // check matching, End directive is optional + if (const auto &endCombinedDir{ + std::get>(x.t)}) { + CheckMatching(combinedDir, endCombinedDir->v); + } + PushContextAndClauseSets(combinedDir.source, combinedDir.v); } diff --git a/flang/test/Lower/pre-fir-tree05.f90 b/flang/test/Lower/pre-fir-tree05.f90 index f635785e3274b4..98af5c2de94431 100644 --- a/flang/test/Lower/pre-fir-tree05.f90 +++ b/flang/test/Lower/pre-fir-tree05.f90 @@ -31,3 +31,19 @@ subroutine foo() end subroutine ! CHECK-NEXT: EndSubroutine foo +! CHECK: Subroutine foo +subroutine foo2() + ! CHECK-NEXT: <> + !$acc parallel loop + ! CHECK-NEXT: <> + ! CHECK-NEXT: NonLabelDoStmt + do i=1,5 + ! CHECK-NEXT: EndDoStmt + ! CHECK-NEXT: <> + end do + !$acc end parallel loop + ! CHECK-NEXT: <> + ! CHECK-NEXT: ContinueStmt +end subroutine +! 
CHECK-NEXT: EndSubroutine foo2 + diff --git a/flang/test/Semantics/acc-canonicalization-validity.f90 b/flang/test/Semantics/acc-canonicalization-validity.f90 new file mode 100644 index 00000000000000..06c63ed25ddbb7 --- /dev/null +++ b/flang/test/Semantics/acc-canonicalization-validity.f90 @@ -0,0 +1,95 @@ +! RUN: %S/test_errors.sh %s %t %f18 -fopenacc + +! Check OpenACC canonalization validity for the construct defined below: +! 2.9 Loop +! 2.11 Parallel Loop +! 2.11 Kernels Loop +! 2.11 Serial Loop + +program openacc_clause_validity + + implicit none + + integer :: i, j + integer :: N = 256 + real(8) :: a(256) + + !ERROR: A DO loop must follow the LOOP directive + !$acc loop + i = 1 + + !ERROR: DO loop after the LOOP directive must have loop control + !$acc loop + do + end do + + !ERROR: A DO loop must follow the PARALLEL LOOP directive + !$acc parallel loop + i = 1 + + !ERROR: A DO loop must follow the KERNELS LOOP directive + !$acc kernels loop + i = 1 + + !ERROR: A DO loop must follow the SERIAL LOOP directive + !$acc serial loop + i = 1 + + !ERROR: The END PARALLEL LOOP directive must follow the DO loop associated with the loop construct + !$acc end parallel loop + + !ERROR: The END KERNELS LOOP directive must follow the DO loop associated with the loop construct + !$acc end kernels loop + + !ERROR: The END SERIAL LOOP directive must follow the DO loop associated with the loop construct + !$acc end serial loop + + !$acc parallel loop + do i = 1, N + a(i) = 3.14 + end do + + !$acc kernels loop + do i = 1, N + a(i) = 3.14 + end do + + !$acc serial loop + do i = 1, N + a(i) = 3.14 + end do + + !$acc parallel loop + do i = 1, N + a(i) = 3.14 + end do + !$acc end parallel loop + + !$acc kernels loop + do i = 1, N + a(i) = 3.14 + end do + !$acc end kernels loop + + !$acc serial loop + do i = 1, N + a(i) = 3.14 + end do + !$acc end serial loop + + !ERROR: DO loop after the PARALLEL LOOP directive must have loop control + !$acc parallel loop + do + end do + + !ERROR: 
DO loop after the KERNELS LOOP directive must have loop control + !$acc kernels loop + do + end do + + !ERROR: DO loop after the SERIAL LOOP directive must have loop control + !$acc serial loop + do + end do + +end program openacc_clause_validity diff --git a/flang/test/Semantics/acc-clause-validity.f90 b/flang/test/Semantics/acc-clause-validity.f90 index 75a0efa87d3529..207ca2ec72cdd1 100644 --- a/flang/test/Semantics/acc-clause-validity.f90 +++ b/flang/test/Semantics/acc-clause-validity.f90 @@ -5,6 +5,10 @@ ! 2.5.1 Parallel ! 2.5.2 Kernels ! 2.5.3 Serial +! 2.9 Loop +! 2.13 Declare +! 2.14.3 Set +! 2.14.4 Update ! 2.15.1 Routine ! 2.11 Parallel Loop ! 2.11 Kernels Loop @@ -162,6 +166,27 @@ program openacc_clause_validity end do !$acc end serial loop + !$acc parallel loop + do i = 1, N + a(i) = 3.14 + end do + !ERROR: Unmatched END KERNELS LOOP directive + !$acc end kernels loop + + !$acc kernels loop + do i = 1, N + a(i) = 3.14 + end do + !ERROR: Unmatched END SERIAL LOOP directive + !$acc end serial loop + + !$acc serial loop + do i = 1, N + a(i) = 3.14 + end do + !ERROR: Unmatched END PARALLEL LOOP directive + !$acc end parallel loop + contains subroutine sub1(a) From bd2853f7998d41e0d16c00e2b043e35b688eaf00 Mon Sep 17 00:00:00 2001 From: Sameer Arora Date: Tue, 28 Jul 2020 09:50:51 -0700 Subject: [PATCH 04/23] [llvm-libtool-darwin] Add support for -arch_only Add support for -arch_only option for llvm-libtool-darwin. This diff also adds support for accepting universal files as input and flattening them to create the required static library. Supports input universal files contaning both Mach-O object files or archives. 
Differences from cctools' libtool: - `-arch_only` can be specified multiple times - archives containing universal files are considered invalid (libtool allows such archives) Reviewed by jhenderson, smeenai Differential Revision: https://reviews.llvm.org/D84770 --- .../docs/CommandGuide/llvm-libtool-darwin.rst | 43 +-- .../cpu-subtype-matching.test | 269 ++++++++++++++++++ .../universal-file-flattening.test | 240 ++++++++++++++++ llvm/tools/llvm-libtool-darwin/CMakeLists.txt | 1 + llvm/tools/llvm-libtool-darwin/LLVMBuild.txt | 2 +- .../llvm-libtool-darwin.cpp | 201 ++++++++++--- 6 files changed, 703 insertions(+), 53 deletions(-) create mode 100644 llvm/test/tools/llvm-libtool-darwin/cpu-subtype-matching.test create mode 100644 llvm/test/tools/llvm-libtool-darwin/universal-file-flattening.test diff --git a/llvm/docs/CommandGuide/llvm-libtool-darwin.rst b/llvm/docs/CommandGuide/llvm-libtool-darwin.rst index 2944aa6ee37f0c..a5383c03da5330 100644 --- a/llvm/docs/CommandGuide/llvm-libtool-darwin.rst +++ b/llvm/docs/CommandGuide/llvm-libtool-darwin.rst @@ -21,30 +21,34 @@ OPTIONS -------- :program:`llvm-libtool-darwin` supports the following options: -.. option:: -h, -help - - Show help and usage for this command. +.. option:: -arch_only -.. option:: -help-list - - Show help and usage for this command without grouping the options - into categories. + Build a static library only for the specified `` and ignore all + other architectures in the files. .. option:: -color Use colors in output. -.. option:: -version +.. option:: -D - Display the version of this program. + Use zero for timestamps and UIDs/GIDs. This is set by default. -.. option:: -D +.. option:: -filelist - Use zero for timestamps and UIDs/GIDs. This is set by default. + Read input file names from ``. File names are specified in `` + one per line, separated only by newlines. Whitespace on a line is assumed + to be part of the filename. 
If the directory name, `dirname`, is also + specified then it is prepended to each file name in the ``. -.. option:: -U +.. option:: -h, -help + + Show help and usage for this command. - Use actual timestamps and UIDs/GIDs. +.. option:: -help-list + + Show help and usage for this command without grouping the options + into categories. .. option:: -o @@ -52,14 +56,15 @@ OPTIONS .. option:: -static - Produces a static library from the input files. + Produces a static library from the input files. -.. option:: -filelist +.. option:: -U + + Use actual timestamps and UIDs/GIDs. - Read input file names from ``. File names are specified in `` - one per line, separated only by newlines. Whitespace on a line is assumed - to be part of the filename. If the directory name, `dirname`, is also - specified then it is prepended to each file name in the ``. +.. option:: -version + + Display the version of this program. EXIT STATUS ----------- diff --git a/llvm/test/tools/llvm-libtool-darwin/cpu-subtype-matching.test b/llvm/test/tools/llvm-libtool-darwin/cpu-subtype-matching.test new file mode 100644 index 00000000000000..4789361ba69315 --- /dev/null +++ b/llvm/test/tools/llvm-libtool-darwin/cpu-subtype-matching.test @@ -0,0 +1,269 @@ +## This test checks that the CPU subtype matching logic is handled correctly. 
+ +# RUN: yaml2obj %s --docnum=1 -o %t.armv6 +# RUN: yaml2obj %s --docnum=2 -o %t.armv7 + +# RUN: llvm-libtool-darwin -static -o %t.lib %t.armv6 %t.armv7 -arch_only armv7 + +## Check that only armv7 binary is present: +# RUN: llvm-ar t %t.lib | \ +# RUN: FileCheck %s --check-prefix=ARM-NAMES --implicit-check-not={{.}} -DPREFIX=%basename_t.tmp + +# ARM-NAMES: [[PREFIX]].armv7 + +## Check that only armv7 symbol is present: +# RUN: llvm-nm --print-armap %t.lib | \ +# RUN: FileCheck %s --check-prefix=ARM-SYMBOLS -DPREFIX=%basename_t.tmp --match-full-lines + +# ARM-SYMBOLS: Archive map +# ARM-SYMBOLS-NEXT: _armv7 in [[PREFIX]].armv7 +# ARM-SYMBOLS-EMPTY: + +## armv6.yaml +## CPUTYPE: CPU_TYPE_ARM +## CPUSUBTYPE: CPU_SUBTYPE_ARM_V6 +--- !mach-o +FileHeader: + magic: 0xFEEDFACE + cputype: 0x0000000C + cpusubtype: 0x00000006 + filetype: 0x00000001 + ncmds: 2 + sizeofcmds: 148 + flags: 0x00002000 +LoadCommands: + - cmd: LC_SEGMENT + cmdsize: 124 + segname: '' + vmaddr: 0 + vmsize: 24 + fileoff: 296 + filesize: 24 + maxprot: 7 + initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000000000000 + size: 24 + offset: 0x00000128 + align: 2 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 04D04DE208009FE500008DE504D08DE21EFF2FE100000000 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 328 + nsyms: 1 + stroff: 340 + strsize: 8 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _armv6 + - '' +... 
+ +## armv7.yaml +## CPUTYPE: CPU_TYPE_ARM +## CPUSUBTYPE: CPU_SUBTYPE_ARM_V7 +--- !mach-o +FileHeader: + magic: 0xFEEDFACE + cputype: 0x0000000C + cpusubtype: 0x00000009 + filetype: 0x00000001 + ncmds: 2 + sizeofcmds: 148 + flags: 0x00002000 +LoadCommands: + - cmd: LC_SEGMENT + cmdsize: 124 + segname: '' + vmaddr: 0 + vmsize: 10 + fileoff: 280 + filesize: 10 + maxprot: 7 + initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000000000000 + size: 10 + offset: 0x00000118 + align: 1 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 81B00020009001B07047 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 292 + nsyms: 1 + stroff: 304 + strsize: 8 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 8 + n_value: 0 + StringTable: + - '' + - _armv7 + - '' +... + +# RUN: yaml2obj %s --docnum=3 -o %t.x86_64 +# RUN: yaml2obj %s --docnum=4 -o %t.x86_64_h + +# RUN: llvm-libtool-darwin -static -o %t.lib %t.x86_64 %t.x86_64_h -arch_only x86_64 + +## Check that only x86_64 binary is present: +# RUN: llvm-ar t %t.lib | \ +# RUN: FileCheck %s --check-prefix=X86-NAMES --implicit-check-not={{.}} -DPREFIX=%basename_t.tmp + +# X86-NAMES: [[PREFIX]].x86_64 + +## Check that only x86_64 symbol is present: +# RUN: llvm-nm --print-armap %t.lib | \ +# RUN: FileCheck %s --check-prefix=X86-SYMBOLS -DPREFIX=%basename_t.tmp --match-full-lines + +# X86-SYMBOLS: Archive map +# X86-SYMBOLS-NEXT: _x86_64 in [[PREFIX]].x86_64 +# X86-SYMBOLS-EMPTY: + +## x86_64.yaml +## CPUTYPE: CPU_TYPE_X86_64 +## CPUSUBTYPE: CPU_SUBTYPE_X86_64_ALL +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000003 + filetype: 0x00000001 + ncmds: 2 + sizeofcmds: 176 + flags: 0x00002000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: '' + vmaddr: 0 + vmsize: 15 + fileoff: 312 + filesize: 15 + maxprot: 7 + 
initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000000000000 + size: 15 + offset: 0x00000138 + align: 4 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 554889E531C0C745FC000000005DC3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 328 + nsyms: 1 + stroff: 344 + strsize: 8 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _x86_64 + - '' +... + +## x86_64h.yaml +## CPUTYPE: CPU_TYPE_X86_64 +## CPUSUBTYPE: CPU_SUBTYPE_X86_64_H +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000008 + filetype: 0x00000001 + ncmds: 2 + sizeofcmds: 176 + flags: 0x00002000 + reserved: 0x00000000 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: '' + vmaddr: 0 + vmsize: 15 + fileoff: 312 + filesize: 15 + maxprot: 7 + initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000000000000 + size: 15 + offset: 0x00000138 + align: 4 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 554889E531C0C745FC000000005DC3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 328 + nsyms: 1 + stroff: 344 + strsize: 8 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _x86_64_h + - '' +... diff --git a/llvm/test/tools/llvm-libtool-darwin/universal-file-flattening.test b/llvm/test/tools/llvm-libtool-darwin/universal-file-flattening.test new file mode 100644 index 00000000000000..cd1dfc99858f21 --- /dev/null +++ b/llvm/test/tools/llvm-libtool-darwin/universal-file-flattening.test @@ -0,0 +1,240 @@ +## This test checks that a universal file is flattened correctly. 
+ +# RUN: yaml2obj %s -o %t-universal.o +# RUN: yaml2obj %S/Inputs/input1.yaml -o %t-input1.o +# RUN: yaml2obj %S/Inputs/input2.yaml -o %t-input2.o + +# RUN: llvm-libtool-darwin -static -o %t.lib %t-universal.o -arch_only arm64 + +## Check that the binary is present: +# RUN: llvm-ar t %t.lib | \ +# RUN: FileCheck %s --check-prefix=CHECK-NAMES --implicit-check-not={{.}} -DPREFIX=%basename_t.tmp + +# CHECK-NAMES: [[PREFIX]]-universal.o + +## Check that symbols are present: +# RUN: llvm-nm --print-armap %t.lib | \ +# RUN: FileCheck %s --check-prefix=CHECK-SYMBOLS -DPREFIX=%basename_t.tmp --match-full-lines + +# CHECK-SYMBOLS: Archive map +# CHECK-SYMBOLS-NEXT: _arm64 in [[PREFIX]]-universal.o +# CHECK-SYMBOLS-EMPTY: + +## Check that the output archive is in Darwin format: +# RUN: llvm-objdump --macho --archive-headers %t.lib | \ +# RUN: FileCheck %s --check-prefix=FORMAT -DPREFIX=%basename_t.tmp -DARCHIVE=%t.lib + +# FORMAT: Archive : [[ARCHIVE]] +# FORMAT-NEXT: __.SYMDEF +# FORMAT-NEXT: [[PREFIX]]-universal.o +# FORMAT-NOT: {{.}} + +## Passing both a universal file and an object file: +# RUN: llvm-libtool-darwin -static -o %t.lib %t-universal.o %t-input1.o -arch_only x86_64 +# RUN: llvm-ar t %t.lib | \ +# RUN: FileCheck %s --check-prefix=BOTH-NAMES --implicit-check-not={{.}} -DPREFIX=%basename_t.tmp +# RUN: llvm-nm --print-armap %t.lib | \ +# RUN: FileCheck %s --check-prefix=BOTH-SYMBOLS -DPREFIX=%basename_t.tmp --match-full-lines + +# BOTH-NAMES: [[PREFIX]]-universal.o +# BOTH-NAMES-NEXT: [[PREFIX]]-input1.o + +# BOTH-SYMBOLS: Archive map +# BOTH-SYMBOLS-NEXT: _x86_64 in [[PREFIX]]-universal.o +# BOTH-SYMBOLS-NEXT: _symbol1 in [[PREFIX]]-input1.o +# BOTH-SYMBOLS-EMPTY: + +## Passing both a universal file and an object file but filtering out the object file: +# RUN: llvm-libtool-darwin -static -o %t.lib %t-universal.o %t-input1.o -arch_only arm64 +# RUN: llvm-ar t %t.lib | \ +# RUN: FileCheck %s --check-prefix=CHECK-NAMES --implicit-check-not={{.}} 
-DPREFIX=%basename_t.tmp +# RUN: llvm-nm --print-armap %t.lib | \ +# RUN: FileCheck %s --check-prefix=CHECK-SYMBOLS -DPREFIX=%basename_t.tmp --match-full-lines + +## Universal file containing an archive: +# RUN: rm -f %t.ar +# RUN: llvm-ar cr %t.ar %t-input1.o %t-input2.o +# RUN: llvm-lipo %t.ar -create -output %t-fat-with-archive.o +# RUN: llvm-libtool-darwin -static -o %t.lib %t-fat-with-archive.o -arch_only x86_64 +# RUN: llvm-ar t %t.lib | \ +# RUN: FileCheck %s --check-prefix=ARCHIVE-NAMES --implicit-check-not={{.}} -DPREFIX=%basename_t.tmp +# RUN: llvm-nm --print-armap %t.lib | \ +# RUN: FileCheck %s --check-prefix=ARCHIVE-SYMBOLS -DPREFIX=%basename_t.tmp --match-full-lines + +# ARCHIVE-NAMES: [[PREFIX]]-input1.o +# ARCHIVE-NAMES-NEXT: [[PREFIX]]-input2.o + +# ARCHIVE-SYMBOLS: Archive map +# ARCHIVE-SYMBOLS-NEXT: _symbol1 in [[PREFIX]]-input1.o +# ARCHIVE-SYMBOLS-NEXT: _symbol2 in [[PREFIX]]-input2.o +# ARCHIVE-SYMBOLS-EMPTY: + +## Allow arch_only to be specified more than once (pick the last one): +# RUN: llvm-libtool-darwin -static -o %t.lib %t-universal.o -arch_only arm64 -arch_only x86_64 +# RUN: llvm-ar t %t.lib | \ +# RUN: FileCheck %s --check-prefix=DOUBLE-NAMES --implicit-check-not={{.}} -DPREFIX=%basename_t.tmp +# RUN: llvm-nm --print-armap %t.lib | \ +# RUN: FileCheck %s --check-prefix=DOUBLE-SYMBOLS -DPREFIX=%basename_t.tmp --match-full-lines + +# DOUBLE-NAMES: [[PREFIX]]-universal.o + +# DOUBLE-SYMBOLS: Archive map +# DOUBLE-SYMBOLS-NEXT: _x86_64 in [[PREFIX]]-universal.o +# DOUBLE-SYMBOLS-EMPTY: + +## Invalid architecture: +# RUN: not llvm-libtool-darwin -static -o %t.lib %t-universal.o -arch_only arch101 2>&1 | \ +# RUN: FileCheck %s --check-prefix=INVALID-ARCH + +# INVALID-ARCH: invalid architecture 'arch101': valid architecture names are + +## Empty architecture: +# RUN: not llvm-libtool-darwin -static -o %t.lib %t-universal.o -arch_only "" 2>&1 | \ +# RUN: FileCheck %s --check-prefix=EMPTY-ARCH + +# EMPTY-ARCH: invalid architecture '': valid 
architecture names are + +## Missing architecture: +# RUN: not llvm-libtool-darwin -static -o %t.lib %t-universal.o -arch_only ppc 2>&1 | \ +# RUN: FileCheck %s --check-prefix=MISSING-ARCH + +# MISSING-ARCH: error: no library created (no object files in input files matching -arch_only ppc) + +## arch_only missing argument: +# RUN: not llvm-libtool-darwin -static -o %t.lib %t-universal.o -arch_only 2>&1 | \ +# RUN: FileCheck %s --check-prefix=REQUIRE-ARCH + +# REQUIRE-ARCH: for the --arch_only option: requires a value! + +## x86_64-arm64-universal.yaml +--- !fat-mach-o +FatHeader: + magic: 0xCAFEBABE + nfat_arch: 2 +FatArchs: + - cputype: 0x01000007 + cpusubtype: 0x00000003 + offset: 0x0000000000001000 + size: 352 + align: 12 + - cputype: 0x0100000C + cpusubtype: 0x00000000 + offset: 0x0000000000004000 + size: 384 + align: 14 +Slices: + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x01000007 + cpusubtype: 0x00000003 + filetype: 0x00000001 + ncmds: 2 + sizeofcmds: 176 + flags: 0x00002000 + reserved: 0x00000000 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: '' + vmaddr: 0 + vmsize: 15 + fileoff: 312 + filesize: 15 + maxprot: 7 + initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000000000000 + size: 15 + offset: 0x00000138 + align: 4 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: 554889E531C0C745FC000000005DC3 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 328 + nsyms: 1 + stroff: 344 + strsize: 8 + LinkEditData: + NameList: + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _x86_64 + - '' + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x0100000C + cpusubtype: 0x00000000 + filetype: 0x00000001 + ncmds: 2 + sizeofcmds: 176 + flags: 0x00002000 + reserved: 0x00000000 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: '' + vmaddr: 0 + vmsize: 24 + 
fileoff: 312 + filesize: 24 + maxprot: 7 + initprot: 7 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0000000000000000 + size: 24 + offset: 0x00000138 + align: 2 + reloff: 0x00000000 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x00000000 + reserved2: 0x00000000 + reserved3: 0x00000000 + content: FF4300D1FF0F00B908008052E00308AAFF430091C0035FD6 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 336 + nsyms: 2 + stroff: 368 + strsize: 16 + LinkEditData: + NameList: + - n_strx: 7 + n_type: 0x0E + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0x0F + n_sect: 1 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _arm64 + - ltmp0 + - '' + - '' + - '' +... diff --git a/llvm/tools/llvm-libtool-darwin/CMakeLists.txt b/llvm/tools/llvm-libtool-darwin/CMakeLists.txt index eb83fa1a3ee935..8e2421f1f3bf74 100644 --- a/llvm/tools/llvm-libtool-darwin/CMakeLists.txt +++ b/llvm/tools/llvm-libtool-darwin/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_LINK_COMPONENTS BinaryFormat Object Support + TextAPI ) add_llvm_tool(llvm-libtool-darwin diff --git a/llvm/tools/llvm-libtool-darwin/LLVMBuild.txt b/llvm/tools/llvm-libtool-darwin/LLVMBuild.txt index 3c557a3aaf61fd..a31a3a1400f5d3 100644 --- a/llvm/tools/llvm-libtool-darwin/LLVMBuild.txt +++ b/llvm/tools/llvm-libtool-darwin/LLVMBuild.txt @@ -17,4 +17,4 @@ type = Tool name = llvm-libtool-darwin parent = Tools -required_libraries = BinaryFormat Object Support +required_libraries = BinaryFormat Object Support TextAPI diff --git a/llvm/tools/llvm-libtool-darwin/llvm-libtool-darwin.cpp b/llvm/tools/llvm-libtool-darwin/llvm-libtool-darwin.cpp index 871a8036dab07b..5e0c356985db15 100644 --- a/llvm/tools/llvm-libtool-darwin/llvm-libtool-darwin.cpp +++ b/llvm/tools/llvm-libtool-darwin/llvm-libtool-darwin.cpp @@ -13,11 +13,13 @@ #include "llvm/BinaryFormat/Magic.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/MachO.h" +#include "llvm/Object/MachOUniversal.h" #include "llvm/Object/ObjectFile.h" 
#include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/WithColor.h" +#include "llvm/TextAPI/MachO/Architecture.h" using namespace llvm; using namespace llvm::object; @@ -33,6 +35,10 @@ static cl::list InputFiles(cl::Positional, cl::ZeroOrMore, cl::cat(LibtoolCategory)); +static cl::opt ArchType( + "arch_only", cl::desc("Specify architecture type for output library"), + cl::value_desc("arch_type"), cl::ZeroOrMore, cl::cat(LibtoolCategory)); + enum class Operation { Static }; static cl::opt LibraryOperation( @@ -92,7 +98,51 @@ static Error processFileList() { return Error::success(); } -static Error verifyMachOObject(const NewArchiveMember &Member) { +static Error validateArchitectureName(StringRef ArchitectureName) { + if (!MachOObjectFile::isValidArch(ArchitectureName)) { + std::string Buf; + raw_string_ostream OS(Buf); + for (StringRef Arch : MachOObjectFile::getValidArchs()) + OS << Arch << " "; + + return createStringError( + std::errc::invalid_argument, + "invalid architecture '%s': valid architecture names are %s", + ArchitectureName.str().c_str(), OS.str().c_str()); + } + return Error::success(); +} + +// Check that a file's architecture [FileCPUType, FileCPUSubtype] +// matches the architecture specified under -arch_only flag. 
+static bool acceptFileArch(uint32_t FileCPUType, uint32_t FileCPUSubtype) { + uint32_t ArchCPUType, ArchCPUSubtype; + std::tie(ArchCPUType, ArchCPUSubtype) = MachO::getCPUTypeFromArchitecture( + MachO::getArchitectureFromName(ArchType)); + + if (ArchCPUType != FileCPUType) + return false; + + switch (ArchCPUType) { + case MachO::CPU_TYPE_ARM: + case MachO::CPU_TYPE_ARM64_32: + case MachO::CPU_TYPE_X86_64: + return ArchCPUSubtype == FileCPUSubtype; + + case MachO::CPU_TYPE_ARM64: + if (ArchCPUSubtype == MachO::CPU_SUBTYPE_ARM64_ALL) + return FileCPUSubtype == MachO::CPU_SUBTYPE_ARM64_ALL || + FileCPUSubtype == MachO::CPU_SUBTYPE_ARM64_V8; + else + return ArchCPUSubtype == FileCPUSubtype; + + default: + return true; + } +} + +static Error verifyAndAddMachOObject(std::vector &Members, + NewArchiveMember Member) { auto MBRef = Member.Buf->getMemBufferRef(); Expected> ObjOrErr = object::ObjectFile::createObjectFile(MBRef); @@ -107,6 +157,18 @@ static Error verifyMachOObject(const NewArchiveMember &Member) { "'%s': format not supported", Member.MemberName.data()); + auto *O = dyn_cast(ObjOrErr->get()); + uint32_t FileCPUType, FileCPUSubtype; + std::tie(FileCPUType, FileCPUSubtype) = MachO::getCPUTypeFromArchitecture( + MachO::getArchitectureFromName(O->getArchTriple().getArchName())); + + // If -arch_only is specified then skip this file if it doesn't match + // the architecture specified. + if (!ArchType.empty() && !acceptFileArch(FileCPUType, FileCPUSubtype)) { + return Error::success(); + } + + Members.push_back(std::move(Member)); return Error::success(); } @@ -117,18 +179,94 @@ static Error addChildMember(std::vector &Members, if (!NMOrErr) return NMOrErr.takeError(); - // Verify that Member is a Mach-O object file. 
- if (Error E = verifyMachOObject(*NMOrErr)) + if (Error E = verifyAndAddMachOObject(Members, std::move(*NMOrErr))) return E; - Members.push_back(std::move(*NMOrErr)); + return Error::success(); +} + +static Error processArchive(std::vector &Members, + object::Archive &Lib, StringRef FileName, + const Config &C) { + Error Err = Error::success(); + for (const object::Archive::Child &Child : Lib.children(Err)) + if (Error E = addChildMember(Members, Child, C)) + return createFileError(FileName, std::move(E)); + if (Err) + return createFileError(FileName, std::move(Err)); + return Error::success(); } static Error -addMember(std::vector &Members, StringRef FileName, - std::vector> &ArchiveBuffers, - const Config &C) { +addArchiveMembers(std::vector &Members, + std::vector> &ArchiveBuffers, + NewArchiveMember NM, StringRef FileName, const Config &C) { + Expected> LibOrErr = + object::Archive::create(NM.Buf->getMemBufferRef()); + if (!LibOrErr) + return createFileError(FileName, LibOrErr.takeError()); + + if (Error E = processArchive(Members, **LibOrErr, FileName, C)) + return E; + + // Update vector ArchiveBuffers with the MemoryBuffers to transfer + // ownership. 
+ ArchiveBuffers.push_back(std::move(NM.Buf)); + return Error::success(); +} + +static Error addUniversalMembers( + std::vector &Members, + std::vector> &UniversalBuffers, + NewArchiveMember NM, StringRef FileName, const Config &C) { + Expected> BinaryOrErr = + MachOUniversalBinary::create(NM.Buf->getMemBufferRef()); + if (!BinaryOrErr) + return createFileError(FileName, BinaryOrErr.takeError()); + + auto *UO = BinaryOrErr->get(); + for (const MachOUniversalBinary::ObjectForArch &O : UO->objects()) { + + Expected> MachOObjOrErr = + O.getAsObjectFile(); + if (MachOObjOrErr) { + NewArchiveMember NewMember = + NewArchiveMember(MachOObjOrErr->get()->getMemoryBufferRef()); + NewMember.MemberName = sys::path::filename(NewMember.MemberName); + + if (Error E = verifyAndAddMachOObject(Members, std::move(NewMember))) + return E; + continue; + } + + Expected> ArchiveOrError = O.getAsArchive(); + if (ArchiveOrError) { + // A universal file member can either be a MachOObjectFile or an Archive. + // In case we can successfully cast the member as an Archive, it is safe + // to throw away the error generated due to casting the object as a + // MachOObjectFile. + consumeError(MachOObjOrErr.takeError()); + + if (Error E = processArchive(Members, **ArchiveOrError, FileName, C)) + return E; + continue; + } + + Error CombinedError = + joinErrors(ArchiveOrError.takeError(), MachOObjOrErr.takeError()); + return createFileError(FileName, std::move(CombinedError)); + } + + // Update vector UniversalBuffers with the MemoryBuffers to transfer + // ownership. 
+ UniversalBuffers.push_back(std::move(NM.Buf)); + return Error::success(); +} + +static Error addMember(std::vector &Members, + std::vector> &FileBuffers, + StringRef FileName, const Config &C) { Expected NMOrErr = NewArchiveMember::getFile(FileName, C.Deterministic); if (!NMOrErr) @@ -137,43 +275,36 @@ addMember(std::vector &Members, StringRef FileName, // For regular archives, use the basename of the object path for the member // name. NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName); + file_magic Magic = identify_magic(NMOrErr->Buf->getBuffer()); // Flatten archives. - if (identify_magic(NMOrErr->Buf->getBuffer()) == file_magic::archive) { - Expected> LibOrErr = - object::Archive::create(NMOrErr->Buf->getMemBufferRef()); - if (!LibOrErr) - return createFileError(FileName, LibOrErr.takeError()); - object::Archive &Lib = **LibOrErr; - - Error Err = Error::success(); - for (const object::Archive::Child &Child : Lib.children(Err)) - if (Error E = addChildMember(Members, Child, C)) - return createFileError(FileName, std::move(E)); - if (Err) - return createFileError(FileName, std::move(Err)); - - // Update vector ArchiveBuffers with the MemoryBuffers to transfer - // ownership. - ArchiveBuffers.push_back(std::move(NMOrErr->Buf)); - return Error::success(); - } + if (Magic == file_magic::archive) + return addArchiveMembers(Members, FileBuffers, std::move(*NMOrErr), + FileName, C); - // Verify that Member is a Mach-O object file. - if (Error E = verifyMachOObject(*NMOrErr)) - return E; + // Flatten universal files. 
+ if (Magic == file_magic::macho_universal_binary) + return addUniversalMembers(Members, FileBuffers, std::move(*NMOrErr), + FileName, C); - Members.push_back(std::move(*NMOrErr)); + if (Error E = verifyAndAddMachOObject(Members, std::move(*NMOrErr))) + return E; return Error::success(); } static Error createStaticLibrary(const Config &C) { std::vector NewMembers; - std::vector> ArchiveBuffers; - for (StringRef Member : InputFiles) - if (Error E = addMember(NewMembers, Member, ArchiveBuffers, C)) + std::vector> FileBuffers; + for (StringRef FileName : InputFiles) + if (Error E = addMember(NewMembers, FileBuffers, FileName, C)) return E; + if (NewMembers.empty() && !ArchType.empty()) + return createStringError(std::errc::invalid_argument, + "no library created (no object files in input " + "files matching -arch_only %s)", + ArchType.c_str()); + if (Error E = writeArchive(OutputFile, NewMembers, /*WriteSymtab=*/true, @@ -201,6 +332,10 @@ static Expected parseCommandLine(int Argc, char **Argv) { return createStringError(std::errc::invalid_argument, "no input files specified"); + if (ArchType.getNumOccurrences()) + if (Error E = validateArchitectureName(ArchType)) + return std::move(E); + return C; } From adaadbfeac98ab9d5ce34b8bb2ceedddc5dc1fd4 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 12 Aug 2020 17:43:28 -0700 Subject: [PATCH 05/23] [JITLink][MachO] Return an error when MachO TLV relocations are encountered. MachO TLV relocations aren't supported yet. Error out rather than falling through to llvm_unreachable. 
--- llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 134b01f0f6560c..a70b0dcd8f8574 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -339,6 +339,9 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { assert(TargetSymbol && "No target symbol from parsePairRelocation?"); break; } + case PCRel32TLV: + return make_error( + "MachO TLV relocations not yet supported"); default: llvm_unreachable("Special relocation kind should not appear in " "mach-o file"); From e137b550587a85b0d9c9c539edc79de0122b6946 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 12 Aug 2020 20:44:05 -0700 Subject: [PATCH 06/23] [llvm-jitlink] Don't demote unreferenced definitions in -harness mode. Demoting unreferenced externals is unsafe if multiple interdependent test objects are used, including objects loaded from archives. --- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 8e651d903a3b72..49424bf9774607 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -187,7 +187,7 @@ static Error applyHarnessPromotions(Session &S, LinkGraph &G) { // If this graph is part of the test then promote any symbols referenced by // the harness to default scope, remove all symbols that clash with harness - // definitions, demote all other definitions. + // definitions. 
std::vector DefinitionsToRemove; for (auto *Sym : G.defined_symbols()) { @@ -219,10 +219,6 @@ static Error applyHarnessPromotions(Session &S, LinkGraph &G) { } else if (S.HarnessDefinitions.count(Sym->getName())) { LLVM_DEBUG(dbgs() << " Externalizing " << Sym->getName() << "\n"); DefinitionsToRemove.push_back(Sym); - } else { - LLVM_DEBUG(dbgs() << " Demoting " << Sym->getName() << "\n"); - Sym->setScope(Scope::Local); - Sym->setLive(false); } } @@ -521,7 +517,8 @@ Error LLVMJITLinkObjectLinkingLayer::add(JITDylib &JD, return SymFlagsOrErr.takeError(); // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + if ((*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) || + !(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) continue; auto Name = Sym.getName(); @@ -551,10 +548,8 @@ Error LLVMJITLinkObjectLinkingLayer::add(JITDylib &JD, *SymFlags &= ~JITSymbolFlags::Exported; } else if (S.HarnessExternals.count(*Name)) { *SymFlags |= JITSymbolFlags::Exported; - } else { - // Skip symbols that aren't in the HarnessExternals set. 
+ } else if (S.HarnessDefinitions.count(*Name)) continue; - } auto InternedName = S.ES.intern(*Name); SymbolFlags[InternedName] = std::move(*SymFlags); From 2f7adf5ee37934ee5769276644fcafbc9d4dcda3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Thu, 13 Aug 2020 20:19:21 +0200 Subject: [PATCH 07/23] [Diagnostics] Skip var decl of structs for -Wstring-concatenation --- clang/lib/Sema/SemaDecl.cpp | 5 +++-- clang/test/Sema/string-concat.c | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index fee748bf9f9da3..ab14963372109e 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12886,7 +12886,8 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) { AttributeCommonInfo::AS_Pragma)); } - if (var->hasInit() && isa(var->getInit())) { + if (!var->getType()->isStructureType() && var->hasInit() && + isa(var->getInit())) { const auto *ILE = cast(var->getInit()); unsigned NumInits = ILE->getNumInits(); if (NumInits > 2) @@ -12927,7 +12928,7 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) { Diag(SL->getBeginLoc(), diag::note_concatenated_string_literal_silence); } - // Warn just once. + // In any case, stop now. break; } } diff --git a/clang/test/Sema/string-concat.c b/clang/test/Sema/string-concat.c index b6bae9c95b0b09..63abf100c020f0 100644 --- a/clang/test/Sema/string-concat.c +++ b/clang/test/Sema/string-concat.c @@ -148,6 +148,12 @@ const A not_warn6 = A{"", ""}; #endif +static A not_warn7 = {"", + + "" + "", + ""}; + // Do not warn when all the elements in the initializer are concatenated together. 
const char *all_elems_in_init_concatenated[] = {"a" "b" "c"}; From 3944d3df4f062db1e1fb1deab24e4c40bd5c8095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A1vid=20Bolvansk=C3=BD?= Date: Thu, 13 Aug 2020 20:21:19 +0200 Subject: [PATCH 08/23] [Tests] Removed debug copy command --- compiler-rt/test/profile/Linux/counter_promo_for.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/profile/Linux/counter_promo_for.c b/compiler-rt/test/profile/Linux/counter_promo_for.c index 7cab70b08773bc..464c97cb7dd3d8 100644 --- a/compiler-rt/test/profile/Linux/counter_promo_for.c +++ b/compiler-rt/test/profile/Linux/counter_promo_for.c @@ -2,7 +2,7 @@ // RUN: rm -fr %t.nopromo.prof // RUN: %clang_pgogen=%t.promo.prof/ -o %t.promo.gen -O2 %s // RUN: %clang_pgogen=%t.promo.prof/ -o %t.promo.gen.ll -emit-llvm -S -O2 %s -// RUN: cp %t.promo.gen.ll /tmp/d.txt ; cat %t.promo.gen.ll | FileCheck --check-prefix=PROMO %s +// RUN: cat %t.promo.gen.ll | FileCheck --check-prefix=PROMO %s // RUN: %run %t.promo.gen // RUN: llvm-profdata merge -o %t.promo.profdata %t.promo.prof/ // RUN: llvm-profdata show --counts --all-functions %t.promo.profdata > %t.promo.dump From 1a8c9cd1d96e680a3c519e9b3295ba9c4a34736d Mon Sep 17 00:00:00 2001 From: Aditya Kumar <1894981+hiraditya@users.noreply.github.com> Date: Thu, 13 Aug 2020 00:06:22 -0700 Subject: [PATCH 09/23] Fix PR45442: Bail out when MemorySSA information is not available Reviewers: sebpop, uabelho, fhahn Reviewed by: fhahn Differential Revision: https://reviews.llvm.org/D85881 --- llvm/lib/Transforms/Scalar/GVNHoist.cpp | 11 ++++------- llvm/test/Transforms/GVNHoist/pr45442.ll | 14 +++++++------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 07f40de6a1f2f5..74c4a480098a31 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -521,10 +521,6 @@ class GVNHoist { if 
(NewPt == OldPt) return true; - // MemoryUseDef information is not available, bail out. - if (!U) - return false; - const BasicBlock *NewBB = NewPt->getParent(); const BasicBlock *OldBB = OldPt->getParent(); const BasicBlock *UBB = U->getBlock(); @@ -609,9 +605,10 @@ class GVNHoist { if (safeToHoistScalar(BB, Insn->getParent(), NumBBsOnAllPaths)) Safe.push_back(CHI); } else { - MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn); - if (safeToHoistLdSt(BB->getTerminator(), Insn, UD, K, NumBBsOnAllPaths)) - Safe.push_back(CHI); + auto *T = BB->getTerminator(); + if (MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn)) + if (safeToHoistLdSt(T, Insn, UD, K, NumBBsOnAllPaths)) + Safe.push_back(CHI); } } } diff --git a/llvm/test/Transforms/GVNHoist/pr45442.ll b/llvm/test/Transforms/GVNHoist/pr45442.ll index db3ecb2a8d878c..fe6a8be01507df 100644 --- a/llvm/test/Transforms/GVNHoist/pr45442.ll +++ b/llvm/test/Transforms/GVNHoist/pr45442.ll @@ -1,32 +1,32 @@ ; RUN: opt < %s -gvn-hoist -S | FileCheck %s ; gvn-hoist shouldn't crash in this case. -; CHECK-LABEL: @func() +; CHECK-LABEL: @func(i1 %b) ; CHECK: entry: ; CHECK-NEXT: br i1 ; CHECK: bb1: -; CHECK-NEXT: unreachable +; CHECK-NEXT: ret void ; CHECK: bb2: ; CHECK-NEXT: call ; CHECK-NEXT: call -; CHECK-NEXT: unreachable +; CHECK-NEXT: ret void define void @v_1_0() #0 { entry: ret void } -define void @func() { +define void @func(i1 %b) { entry: - br i1 undef, label %bb1, label %bb2 + br i1 %b, label %bb1, label %bb2 bb1: - unreachable + ret void bb2: call void @v_1_0() call void @v_1_0() - unreachable + ret void } attributes #0 = { nounwind readonly } From d25cb5a8a23ec9192e32a318eb565e956b87f553 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 13 Aug 2020 11:22:01 -0700 Subject: [PATCH 10/23] [AMDGPU] Fix misleading SDWA verifier error. NFC. The old error from GFX9 shall be updated to GFX9+. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0cd7acb7a789b9..1221b927b58379 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3695,7 +3695,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } else { // No immediates on GFX9 if (!MO.isReg()) { - ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9"; + ErrInfo = + "Only reg allowed as operands in SDWA instructions on GFX9+"; return false; } } From 0462aef5f359497b29bc811f94e8d6f7c1f2923e Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 13 Aug 2020 11:15:44 -0700 Subject: [PATCH 11/23] [AMDGPU] Inhibit SDWA if target instruction has FI Differential Revision: https://reviews.llvm.org/D85918 --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 10 +++++++ llvm/test/CodeGen/AMDGPU/sdwa-stack.mir | 32 +++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/sdwa-stack.mir diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 87bacc5880ac88..4774041f2b820e 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -992,6 +992,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, if (Opc == AMDGPU::V_CNDMASK_B32_e32) return false; + if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) { + if (!Src0->isReg() && !Src0->isImm()) + return false; + } + + if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) { + if (!Src1->isReg() && !Src1->isImm()) + return false; + } + return true; } diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-stack.mir b/llvm/test/CodeGen/AMDGPU/sdwa-stack.mir new file mode 100644 index 00000000000000..d804605c5d2a04 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sdwa-stack.mir @@ 
-0,0 +1,32 @@ +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +# Do not fold stack objects into SDWA. + +--- +# GCN-LABEL: name: sdwa_stack_object_src0 +# GCN: V_ADD_U32_e64 %stack.0, killed %1 +name: sdwa_stack_object_src0 +stack: + - { id: 0, type: default, offset: 0, size: 32, alignment: 4, stack-id: default } +body: | + bb.0: + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = V_AND_B32_e32 255, %0, implicit $exec + %2:vgpr_32 = V_ADD_U32_e64 %stack.0, killed %1, 0, implicit $exec + S_ENDPGM 0, implicit %2 + +... +--- +name: sdwa_stack_object_src1 +# GCN-LABEL: name: sdwa_stack_object_src1 +# GCN: V_ADD_U32_e64 killed %1, %stack.0 +stack: + - { id: 0, type: default, offset: 0, size: 32, alignment: 4, stack-id: default } +body: | + bb.0: + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = V_AND_B32_e32 255, %0, implicit $exec + %2:vgpr_32 = V_ADD_U32_e64 killed %1, %stack.0, 0, implicit $exec + S_ENDPGM 0, implicit %2 + +... From 759f9a7acdfc33afd5d8bd1c33446c1b4d721388 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 13 Aug 2020 11:44:42 -0700 Subject: [PATCH 12/23] Revert "[llvm-jitlink] Don't demote unreferenced definitions in -harness mode." This reverts commit e137b550587a85b0d9c9c539edc79de0122b6946. This commit broke a test case. Reverting while I investigate. --- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 49424bf9774607..8e651d903a3b72 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -187,7 +187,7 @@ static Error applyHarnessPromotions(Session &S, LinkGraph &G) { // If this graph is part of the test then promote any symbols referenced by // the harness to default scope, remove all symbols that clash with harness - // definitions. 
+ // definitions, demote all other definitions. std::vector DefinitionsToRemove; for (auto *Sym : G.defined_symbols()) { @@ -219,6 +219,10 @@ static Error applyHarnessPromotions(Session &S, LinkGraph &G) { } else if (S.HarnessDefinitions.count(Sym->getName())) { LLVM_DEBUG(dbgs() << " Externalizing " << Sym->getName() << "\n"); DefinitionsToRemove.push_back(Sym); + } else { + LLVM_DEBUG(dbgs() << " Demoting " << Sym->getName() << "\n"); + Sym->setScope(Scope::Local); + Sym->setLive(false); } } @@ -517,8 +521,7 @@ Error LLVMJITLinkObjectLinkingLayer::add(JITDylib &JD, return SymFlagsOrErr.takeError(); // Skip symbols not defined in this object file. - if ((*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) || - !(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) continue; auto Name = Sym.getName(); @@ -548,8 +551,10 @@ Error LLVMJITLinkObjectLinkingLayer::add(JITDylib &JD, *SymFlags &= ~JITSymbolFlags::Exported; } else if (S.HarnessExternals.count(*Name)) { *SymFlags |= JITSymbolFlags::Exported; - } else if (S.HarnessDefinitions.count(*Name)) + } else { + // Skip symbols that aren't in the HarnessExternals set. continue; + } auto InternedName = S.ES.intern(*Name); SymbolFlags[InternedName] = std::move(*SymFlags); From d650cbc349ccc4f477568c2827f1bce650020058 Mon Sep 17 00:00:00 2001 From: Haowei Wu Date: Tue, 11 Aug 2020 11:44:22 -0700 Subject: [PATCH 13/23] [elfabi] Move llvm-elfabi related code to InterfaceStub library This change moves elfabi related code to llvm/InterfaceStub library so it can be shared by multiple llvm tools without causing cyclic dependencies. 
Differential Revision: https://reviews.llvm.org/D85678 --- .../llvm/InterfaceStub}/ELFObjHandler.h | 2 +- .../{TextAPI/ELF => InterfaceStub}/ELFStub.h | 8 ++- .../ELF => InterfaceStub}/TBEHandler.h | 2 +- llvm/lib/CMakeLists.txt | 1 + llvm/lib/InterfaceStub/CMakeLists.txt | 8 +++ .../InterfaceStub}/ELFObjHandler.cpp | 53 +++++++++---------- .../ELF => InterfaceStub}/ELFStub.cpp | 2 +- llvm/lib/InterfaceStub/LLVMBuild.txt | 21 ++++++++ .../ELF => InterfaceStub}/TBEHandler.cpp | 6 +-- llvm/lib/LLVMBuild.txt | 1 + llvm/lib/TextAPI/CMakeLists.txt | 2 - llvm/tools/llvm-elfabi/CMakeLists.txt | 2 +- llvm/tools/llvm-elfabi/LLVMBuild.txt | 2 +- llvm/tools/llvm-elfabi/llvm-elfabi.cpp | 6 +-- llvm/unittests/CMakeLists.txt | 1 + llvm/unittests/InterfaceStub/CMakeLists.txt | 9 ++++ .../ELFYAMLTest.cpp | 4 +- llvm/unittests/TextAPI/CMakeLists.txt | 1 - 18 files changed, 82 insertions(+), 49 deletions(-) rename llvm/{tools/llvm-elfabi => include/llvm/InterfaceStub}/ELFObjHandler.h (96%) rename llvm/include/llvm/{TextAPI/ELF => InterfaceStub}/ELFStub.h (93%) rename llvm/include/llvm/{TextAPI/ELF => InterfaceStub}/TBEHandler.h (100%) create mode 100644 llvm/lib/InterfaceStub/CMakeLists.txt rename llvm/{tools/llvm-elfabi => lib/InterfaceStub}/ELFObjHandler.cpp (91%) rename llvm/lib/{TextAPI/ELF => InterfaceStub}/ELFStub.cpp (95%) create mode 100644 llvm/lib/InterfaceStub/LLVMBuild.txt rename llvm/lib/{TextAPI/ELF => InterfaceStub}/TBEHandler.cpp (98%) create mode 100644 llvm/unittests/InterfaceStub/CMakeLists.txt rename llvm/unittests/{TextAPI => InterfaceStub}/ELFYAMLTest.cpp (98%) diff --git a/llvm/tools/llvm-elfabi/ELFObjHandler.h b/llvm/include/llvm/InterfaceStub/ELFObjHandler.h similarity index 96% rename from llvm/tools/llvm-elfabi/ELFObjHandler.h rename to llvm/include/llvm/InterfaceStub/ELFObjHandler.h index 97f0d68f4d4fbc..1ffd9a614eecd3 100644 --- a/llvm/tools/llvm-elfabi/ELFObjHandler.h +++ b/llvm/include/llvm/InterfaceStub/ELFObjHandler.h @@ -13,9 +13,9 @@ #ifndef 
LLVM_TOOLS_ELFABI_ELFOBJHANDLER_H #define LLVM_TOOLS_ELFABI_ELFOBJHANDLER_H +#include "llvm/InterfaceStub/ELFStub.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ELFTypes.h" -#include "llvm/TextAPI/ELF/ELFStub.h" namespace llvm { diff --git a/llvm/include/llvm/TextAPI/ELF/ELFStub.h b/llvm/include/llvm/InterfaceStub/ELFStub.h similarity index 93% rename from llvm/include/llvm/TextAPI/ELF/ELFStub.h rename to llvm/include/llvm/InterfaceStub/ELFStub.h index 76b2af12166289..7832c1c7413b39 100644 --- a/llvm/include/llvm/TextAPI/ELF/ELFStub.h +++ b/llvm/include/llvm/InterfaceStub/ELFStub.h @@ -16,8 +16,8 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/Support/VersionTuple.h" -#include #include +#include namespace llvm { namespace elfabi { @@ -42,15 +42,13 @@ struct ELFSymbol { bool Undefined; bool Weak; Optional Warning; - bool operator<(const ELFSymbol &RHS) const { - return Name < RHS.Name; - } + bool operator<(const ELFSymbol &RHS) const { return Name < RHS.Name; } }; // A cumulative representation of ELF stubs. // Both textual and binary stubs will read into and write from this object. class ELFStub { -// TODO: Add support for symbol versioning. + // TODO: Add support for symbol versioning. 
public: VersionTuple TbeVersion; Optional SoName; diff --git a/llvm/include/llvm/TextAPI/ELF/TBEHandler.h b/llvm/include/llvm/InterfaceStub/TBEHandler.h similarity index 100% rename from llvm/include/llvm/TextAPI/ELF/TBEHandler.h rename to llvm/include/llvm/InterfaceStub/TBEHandler.h index 76484410987fa5..5c523eba037e7d 100644 --- a/llvm/include/llvm/TextAPI/ELF/TBEHandler.h +++ b/llvm/include/llvm/InterfaceStub/TBEHandler.h @@ -15,8 +15,8 @@ #ifndef LLVM_TEXTAPI_ELF_TBEHANDLER_H #define LLVM_TEXTAPI_ELF_TBEHANDLER_H -#include "llvm/Support/VersionTuple.h" #include "llvm/Support/Error.h" +#include "llvm/Support/VersionTuple.h" #include namespace llvm { diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt index abe3ec59aec1aa..35d204d7d63e27 100644 --- a/llvm/lib/CMakeLists.txt +++ b/llvm/lib/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(IR) add_subdirectory(FuzzMutate) +add_subdirectory(InterfaceStub) add_subdirectory(IRReader) add_subdirectory(CodeGen) add_subdirectory(BinaryFormat) diff --git a/llvm/lib/InterfaceStub/CMakeLists.txt b/llvm/lib/InterfaceStub/CMakeLists.txt new file mode 100644 index 00000000000000..be2529f6d60566 --- /dev/null +++ b/llvm/lib/InterfaceStub/CMakeLists.txt @@ -0,0 +1,8 @@ +add_llvm_component_library(LLVMInterfaceStub + ELFObjHandler.cpp + ELFStub.cpp + TBEHandler.cpp + + ADDITIONAL_HEADER_DIRS + "${LLVM_MAIN_INCLUDE_DIR}/llvm/InterfaceStub" +) diff --git a/llvm/tools/llvm-elfabi/ELFObjHandler.cpp b/llvm/lib/InterfaceStub/ELFObjHandler.cpp similarity index 91% rename from llvm/tools/llvm-elfabi/ELFObjHandler.cpp rename to llvm/lib/InterfaceStub/ELFObjHandler.cpp index 124fffbb9cf6a7..82e7a3c8b1baab 100644 --- a/llvm/tools/llvm-elfabi/ELFObjHandler.cpp +++ b/llvm/lib/InterfaceStub/ELFObjHandler.cpp @@ -6,14 +6,14 @@ // //===-----------------------------------------------------------------------===/ -#include "ELFObjHandler.h" +#include "llvm/InterfaceStub/ELFObjHandler.h" +#include "llvm/InterfaceStub/ELFStub.h" #include 
"llvm/Object/Binary.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ELFTypes.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/TextAPI/ELF/ELFStub.h" using llvm::MemoryBufferRef; using llvm::object::ELFObjectFile; @@ -128,19 +128,17 @@ static Error populateDynamic(DynamicEntries &Dyn, "Couldn't locate dynamic symbol table (no DT_SYMTAB entry)"); } if (Dyn.SONameOffset.hasValue() && *Dyn.SONameOffset >= Dyn.StrSize) { - return createStringError( - object_error::parse_failed, - "DT_SONAME string offset (0x%016" PRIx64 - ") outside of dynamic string table", - *Dyn.SONameOffset); + return createStringError(object_error::parse_failed, + "DT_SONAME string offset (0x%016" PRIx64 + ") outside of dynamic string table", + *Dyn.SONameOffset); } for (uint64_t Offset : Dyn.NeededLibNames) { if (Offset >= Dyn.StrSize) { - return createStringError( - object_error::parse_failed, - "DT_NEEDED string offset (0x%016" PRIx64 - ") outside of dynamic string table", - Offset); + return createStringError(object_error::parse_failed, + "DT_NEEDED string offset (0x%016" PRIx64 + ") outside of dynamic string table", + Offset); } } @@ -212,16 +210,16 @@ static Expected getNumSyms(DynamicEntries &Dyn, static ELFSymbolType convertInfoToType(uint8_t Info) { Info = Info & 0xf; switch (Info) { - case ELF::STT_NOTYPE: - return ELFSymbolType::NoType; - case ELF::STT_OBJECT: - return ELFSymbolType::Object; - case ELF::STT_FUNC: - return ELFSymbolType::Func; - case ELF::STT_TLS: - return ELFSymbolType::TLS; - default: - return ELFSymbolType::Unknown; + case ELF::STT_NOTYPE: + return ELFSymbolType::NoType; + case ELF::STT_OBJECT: + return ELFSymbolType::Object; + case ELF::STT_FUNC: + return ELFSymbolType::Func; + case ELF::STT_TLS: + return ELFSymbolType::TLS; + default: + return ELFSymbolType::Unknown; } } @@ -259,8 +257,8 @@ static ELFSymbol createELFSym(StringRef SymName, /// @param DynStr StringRef to the dynamic 
string table. template static Error populateSymbols(ELFStub &TargetStub, - const typename ELFT::SymRange DynSym, - StringRef DynStr) { + const typename ELFT::SymRange DynSym, + StringRef DynStr) { // Skips the first symbol since it's the NULL symbol. for (auto RawSym : DynSym.drop_front(1)) { // If a symbol does not have global or weak binding, ignore it. @@ -311,7 +309,7 @@ buildStub(const ELFObjectFile &ElfObj) { if (Error Err = populateDynamic(DynEnt, *DynTable)) return std::move(Err); - // Get pointer to in-memory location of .dynstr section. + // Get pointer to in-memory location of .dynstr section. Expected DynStrPtr = ElfFile->toMappedAddr(DynEnt.StrTabAddr); if (!DynStrPtr) @@ -355,9 +353,8 @@ buildStub(const ELFObjectFile &ElfObj) { if (!DynSymPtr) return appendToError(DynSymPtr.takeError(), "when locating .dynsym section contents"); - Elf_Sym_Range DynSyms = - ArrayRef(reinterpret_cast(*DynSymPtr), - *SymCount); + Elf_Sym_Range DynSyms = ArrayRef( + reinterpret_cast(*DynSymPtr), *SymCount); Error SymReadError = populateSymbols(*DestStub, DynSyms, DynStr); if (SymReadError) return appendToError(std::move(SymReadError), diff --git a/llvm/lib/TextAPI/ELF/ELFStub.cpp b/llvm/lib/InterfaceStub/ELFStub.cpp similarity index 95% rename from llvm/lib/TextAPI/ELF/ELFStub.cpp rename to llvm/lib/InterfaceStub/ELFStub.cpp index f8463497093b14..3c637695d8e714 100644 --- a/llvm/lib/TextAPI/ELF/ELFStub.cpp +++ b/llvm/lib/InterfaceStub/ELFStub.cpp @@ -6,7 +6,7 @@ // //===-----------------------------------------------------------------------===/ -#include "llvm/TextAPI/ELF/ELFStub.h" +#include "llvm/InterfaceStub/ELFStub.h" using namespace llvm; using namespace llvm::elfabi; diff --git a/llvm/lib/InterfaceStub/LLVMBuild.txt b/llvm/lib/InterfaceStub/LLVMBuild.txt new file mode 100644 index 00000000000000..e69544d4f5f67f --- /dev/null +++ b/llvm/lib/InterfaceStub/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/InterfaceStub/LLVMBuild.txt ------------------------*- Conf -*--===; 
+; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = InterfaceStub +parent = Libraries +required_libraries = Object Support diff --git a/llvm/lib/TextAPI/ELF/TBEHandler.cpp b/llvm/lib/InterfaceStub/TBEHandler.cpp similarity index 98% rename from llvm/lib/TextAPI/ELF/TBEHandler.cpp rename to llvm/lib/InterfaceStub/TBEHandler.cpp index cb597d8896e811..06a2f9bf2451b1 100644 --- a/llvm/lib/TextAPI/ELF/TBEHandler.cpp +++ b/llvm/lib/InterfaceStub/TBEHandler.cpp @@ -6,12 +6,12 @@ // //===-----------------------------------------------------------------------===/ -#include "llvm/TextAPI/ELF/TBEHandler.h" -#include "llvm/ADT/StringSwitch.h" +#include "llvm/InterfaceStub/TBEHandler.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/InterfaceStub/ELFStub.h" #include "llvm/Support/Error.h" #include "llvm/Support/YAMLTraits.h" -#include "llvm/TextAPI/ELF/ELFStub.h" using namespace llvm; using namespace llvm::elfabi; diff --git a/llvm/lib/LLVMBuild.txt b/llvm/lib/LLVMBuild.txt index 824abd36fc9986..a81c6a1fe3fa46 100644 --- a/llvm/lib/LLVMBuild.txt +++ b/llvm/lib/LLVMBuild.txt @@ -30,6 +30,7 @@ subdirectories = FuzzMutate LineEditor Linker + InterfaceStub IR IRReader LTO diff --git a/llvm/lib/TextAPI/CMakeLists.txt b/llvm/lib/TextAPI/CMakeLists.txt index 36528f0995d8e3..b63bc64b31f14d 100644 --- a/llvm/lib/TextAPI/CMakeLists.txt +++ b/llvm/lib/TextAPI/CMakeLists.txt @@ -1,6 +1,4 @@ 
add_llvm_component_library(LLVMTextAPI - ELF/ELFStub.cpp - ELF/TBEHandler.cpp MachO/Architecture.cpp MachO/ArchitectureSet.cpp MachO/InterfaceFile.cpp diff --git a/llvm/tools/llvm-elfabi/CMakeLists.txt b/llvm/tools/llvm-elfabi/CMakeLists.txt index bd3ec851887aba..43b4b5b5faa963 100644 --- a/llvm/tools/llvm-elfabi/CMakeLists.txt +++ b/llvm/tools/llvm-elfabi/CMakeLists.txt @@ -1,11 +1,11 @@ set(LLVM_LINK_COMPONENTS + InterfaceStub Object Support TextAPI ) add_llvm_tool(llvm-elfabi - ELFObjHandler.cpp ErrorCollector.cpp llvm-elfabi.cpp ) diff --git a/llvm/tools/llvm-elfabi/LLVMBuild.txt b/llvm/tools/llvm-elfabi/LLVMBuild.txt index b1a80e9e4a15f9..4e8021442fb6b4 100644 --- a/llvm/tools/llvm-elfabi/LLVMBuild.txt +++ b/llvm/tools/llvm-elfabi/LLVMBuild.txt @@ -18,4 +18,4 @@ type = Tool name = llvm-elfabi parent = Tools -required_libraries = Object Support TextAPI +required_libraries = InterfaceStub Object Support TextAPI diff --git a/llvm/tools/llvm-elfabi/llvm-elfabi.cpp b/llvm/tools/llvm-elfabi/llvm-elfabi.cpp index 044b5f77c6d71d..8bf2ad4ed53736 100644 --- a/llvm/tools/llvm-elfabi/llvm-elfabi.cpp +++ b/llvm/tools/llvm-elfabi/llvm-elfabi.cpp @@ -6,16 +6,16 @@ // //===-----------------------------------------------------------------------===/ -#include "ELFObjHandler.h" #include "ErrorCollector.h" +#include "llvm/InterfaceStub/ELFObjHandler.h" +#include "llvm/InterfaceStub/TBEHandler.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/WithColor.h" -#include "llvm/TextAPI/ELF/TBEHandler.h" +#include "llvm/Support/raw_ostream.h" #include namespace llvm { diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt index d7dbaeaa32fe87..850bc14b207fd1 100644 --- a/llvm/unittests/CMakeLists.txt +++ b/llvm/unittests/CMakeLists.txt @@ -26,6 +26,7 @@ 
add_subdirectory(Demangle) add_subdirectory(ExecutionEngine) add_subdirectory(Frontend) add_subdirectory(FuzzMutate) +add_subdirectory(InterfaceStub) add_subdirectory(IR) add_subdirectory(LineEditor) add_subdirectory(Linker) diff --git a/llvm/unittests/InterfaceStub/CMakeLists.txt b/llvm/unittests/InterfaceStub/CMakeLists.txt new file mode 100644 index 00000000000000..22ff0af67a3d4b --- /dev/null +++ b/llvm/unittests/InterfaceStub/CMakeLists.txt @@ -0,0 +1,9 @@ +set(LLVM_LINK_COMPONENTS + InterfaceStub +) + +add_llvm_unittest(InterfaceStubTests + ELFYAMLTest.cpp +) + +target_link_libraries(InterfaceStubTests PRIVATE LLVMTestingSupport) diff --git a/llvm/unittests/TextAPI/ELFYAMLTest.cpp b/llvm/unittests/InterfaceStub/ELFYAMLTest.cpp similarity index 98% rename from llvm/unittests/TextAPI/ELFYAMLTest.cpp rename to llvm/unittests/InterfaceStub/ELFYAMLTest.cpp index 8217507b5a5dc5..2b86b89fbbb6ff 100644 --- a/llvm/unittests/TextAPI/ELFYAMLTest.cpp +++ b/llvm/unittests/InterfaceStub/ELFYAMLTest.cpp @@ -7,8 +7,8 @@ //===-----------------------------------------------------------------------===/ #include "llvm/ADT/StringRef.h" -#include "llvm/TextAPI/ELF/ELFStub.h" -#include "llvm/TextAPI/ELF/TBEHandler.h" +#include "llvm/InterfaceStub/ELFStub.h" +#include "llvm/InterfaceStub/TBEHandler.h" #include "llvm/Support/Error.h" #include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" diff --git a/llvm/unittests/TextAPI/CMakeLists.txt b/llvm/unittests/TextAPI/CMakeLists.txt index 775ec2f1d1e886..d575d57610b96e 100644 --- a/llvm/unittests/TextAPI/CMakeLists.txt +++ b/llvm/unittests/TextAPI/CMakeLists.txt @@ -3,7 +3,6 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_unittest(TextAPITests - ELFYAMLTest.cpp TextStubV1Tests.cpp TextStubV2Tests.cpp TextStubV3Tests.cpp From 25bbceb047a3ce85394d510a16bd3fcfd69b8c75 Mon Sep 17 00:00:00 2001 From: shafik Date: Thu, 13 Aug 2020 10:49:40 -0700 Subject: [PATCH 14/23] [LLDB] Fix how ValueObjectChild handles bit-fields stored in a Scalar in 
UpdateValue() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When bit-field data was stored in a Scalar in ValueObjectChild during UpdateValue() it was extracting the bit-field value. Later on in lldb_private::DumpDataExtractor(…) we were again attempting to extract the bit-field. Which would then not obtain the correct value. This will remove the extra extraction in UpdateValue(). We hit this specific case when values are passed in registers, which we could only reproduce in an optimized build. Differential Revision: https://reviews.llvm.org/D85376 --- lldb/source/Core/ValueObjectChild.cpp | 6 +- .../DW_AT_data_bit_offset-DW_OP_stack_value.s | 312 ++++++++++++++++++ 2 files changed, 313 insertions(+), 5 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/DW_AT_data_bit_offset-DW_OP_stack_value.s diff --git a/lldb/source/Core/ValueObjectChild.cpp b/lldb/source/Core/ValueObjectChild.cpp index 28cb49328f34eb..1059c8f34b3b74 100644 --- a/lldb/source/Core/ValueObjectChild.cpp +++ b/lldb/source/Core/ValueObjectChild.cpp @@ -199,11 +199,7 @@ bool ValueObjectChild::UpdateValue() { // try to extract the child value from the parent's scalar value { Scalar scalar(m_value.GetScalar()); - if (m_bitfield_bit_size) - scalar.ExtractBitfield(m_bitfield_bit_size, - m_bitfield_bit_offset); - else - scalar.ExtractBitfield(8 * m_byte_size, 8 * m_byte_offset); + scalar.ExtractBitfield(8 * m_byte_size, 8 * m_byte_offset); m_value.GetScalar() = scalar; } break; diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_data_bit_offset-DW_OP_stack_value.s b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_data_bit_offset-DW_OP_stack_value.s new file mode 100644 index 00000000000000..074da09bc61eed --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_data_bit_offset-DW_OP_stack_value.s @@ -0,0 +1,312 @@ +# RUN: llvm-mc -filetype=obj -o %t -triple x86_64-apple-macosx10.15.0 %s +# RUN: %lldb %t -o "target variable ug" -b | FileCheck %s + +# CHECK: 
(lldb) target variable ug +# CHECK: (U) ug = { +# CHECK: raw = 1688469761 +# CHECK: = (a = 1, b = 1, c = 36, d = 2, e = 36, f = 1) +# CHECK: } + +# We are testing how ValueObject deals with bit-fields when an argument is +# passed by register. Compiling at -O1 allows us to capture this case and +# test it. +# +# typedef union { +# unsigned raw; +# struct { +# unsigned a : 8; +# unsigned b : 8; +# unsigned c : 6; +# unsigned d : 2; +# unsigned e : 6; +# unsigned f : 2; +# }; +# } U; +# +# // This appears first in the debug info and pulls the type definition in... +# static U __attribute__((used)) _type_anchor; +# // ... then our useful variable appears last in the debug info and we can +# // tweak the assembly without needing to edit a lot of offsets by hand. +# static U ug; +# +# extern void f(U); +# +# // Omit debug info for main. +# __attribute__((nodebug)) +# int main() { +# ug.raw = 0x64A40101; +# f(ug); +# f((U)ug.raw); +# } +# +# Compiled as follows: +# +# clang -O1 -gdwarf-4 weird.c -S -o weird.s +# +# Then the DWARF was hand modified to get DW_AT_LOCATION for ug from: +# +# DW_AT_location (DW_OP_addr 0x3f8, DW_OP_deref, DW_OP_constu 0x64a40101, DW_OP_mul, DW_OP_lit0, DW_OP_plus, DW_OP_stack_value) +# +# to this: +# +# DW_AT_location (DW_OP_constu 0x64a40101, DW_OP_stack_value) +# +# to work-around a seperate bug. 
+ +.zerofill __DATA,__bss,__type_anchor,4,2 ## @_type_anchor +.zerofill __DATA,__bss,_ug.0,1,2 ## @ug.0 + .no_dead_strip __type_anchor + .section __DWARF,__debug_str,regular,debug +Linfo_string: + .zero 138 + .asciz "_type_anchor" ## string offset=138 + .asciz "U" ## string offset=151 + .asciz "raw" ## string offset=153 + .asciz "unsigned int" ## string offset=157 + .asciz "a" ## string offset=170 + .asciz "b" ## string offset=172 + .asciz "c" ## string offset=174 + .asciz "d" ## string offset=176 + .asciz "e" ## string offset=178 + .asciz "f" ## string offset=180 + .asciz "ug" ## string offset=182 + .section __DWARF,__debug_abbrev,regular,debug +Lsection_abbrev: + .byte 1 ## Abbreviation Code + .byte 17 ## DW_TAG_compile_unit + .byte 1 ## DW_CHILDREN_yes + .byte 37 ## DW_AT_producer + .byte 14 ## DW_FORM_strp + .byte 19 ## DW_AT_language + .byte 5 ## DW_FORM_data2 + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .ascii "\202|" ## DW_AT_LLVM_sysroot + .byte 14 ## DW_FORM_strp + .ascii "\357\177" ## DW_AT_APPLE_sdk + .byte 14 ## DW_FORM_strp + .byte 16 ## DW_AT_stmt_list + .byte 23 ## DW_FORM_sec_offset + .byte 27 ## DW_AT_comp_dir + .byte 14 ## DW_FORM_strp + .ascii "\341\177" ## DW_AT_APPLE_optimized + .byte 25 ## DW_FORM_flag_present + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 2 ## Abbreviation Code + .byte 52 ## DW_TAG_variable + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 2 ## DW_AT_location + .byte 24 ## DW_FORM_exprloc + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 3 ## Abbreviation Code + .byte 22 ## DW_TAG_typedef + .byte 0 ## DW_CHILDREN_no + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## 
DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 4 ## Abbreviation Code + .byte 23 ## DW_TAG_union_type + .byte 1 ## DW_CHILDREN_yes + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 5 ## Abbreviation Code + .byte 13 ## DW_TAG_member + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 56 ## DW_AT_data_member_location + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 6 ## Abbreviation Code + .byte 13 ## DW_TAG_member + .byte 0 ## DW_CHILDREN_no + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 56 ## DW_AT_data_member_location + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 7 ## Abbreviation Code + .byte 19 ## DW_TAG_structure_type + .byte 1 ## DW_CHILDREN_yes + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 8 ## Abbreviation Code + .byte 13 ## DW_TAG_member + .byte 0 ## DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 73 ## DW_AT_type + .byte 19 ## DW_FORM_ref4 + .byte 58 ## DW_AT_decl_file + .byte 11 ## DW_FORM_data1 + .byte 59 ## DW_AT_decl_line + .byte 11 ## DW_FORM_data1 + .byte 13 ## DW_AT_bit_size + .byte 11 ## DW_FORM_data1 + .byte 107 ## DW_AT_data_bit_offset + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 9 ## Abbreviation Code + .byte 36 ## DW_TAG_base_type + .byte 0 ## 
DW_CHILDREN_no + .byte 3 ## DW_AT_name + .byte 14 ## DW_FORM_strp + .byte 62 ## DW_AT_encoding + .byte 11 ## DW_FORM_data1 + .byte 11 ## DW_AT_byte_size + .byte 11 ## DW_FORM_data1 + .byte 0 ## EOM(1) + .byte 0 ## EOM(2) + .byte 0 ## EOM(3) + .section __DWARF,__debug_info,regular,debug +Lsection_info: +Lcu_begin0: +.set Lset0, Ldebug_info_end0-Ldebug_info_start0 ## Length of Unit + .long Lset0 +Ldebug_info_start0: + .short 4 ## DWARF version number +.set Lset1, Lsection_abbrev-Lsection_abbrev ## Offset Into Abbrev. Section + .long Lset1 + .byte 8 ## Address Size (in bytes) + .byte 1 ## Abbrev [1] 0xb:0xd0 DW_TAG_compile_unit + .long 0 ## DW_AT_producer + .short 12 ## DW_AT_language + .long 47 ## DW_AT_name + .long 60 ## DW_AT_LLVM_sysroot + .long 117 ## DW_AT_APPLE_sdk + .long 0 ## DW_AT_stmt_list + .long 133 ## DW_AT_comp_dir + ## DW_AT_APPLE_optimized + .byte 2 ## Abbrev [2] 0x26:0x15 DW_TAG_variable + .long 138 ## DW_AT_name + .long 59 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 14 ## DW_AT_decl_line + .byte 9 ## DW_AT_location + .byte 3 + .quad __type_anchor + .byte 3 ## Abbrev [3] 0x3b:0xb DW_TAG_typedef + .long 70 ## DW_AT_type + .long 151 ## DW_AT_name + .byte 1 ## DW_AT_decl_file + .byte 11 ## DW_AT_decl_line + .byte 4 ## Abbrev [4] 0x46:0x6c DW_TAG_union_type + .byte 4 ## DW_AT_byte_size + .byte 1 ## DW_AT_decl_file + .byte 1 ## DW_AT_decl_line + .byte 5 ## Abbrev [5] 0x4a:0xc DW_TAG_member + .long 153 ## DW_AT_name + .long 178 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 2 ## DW_AT_decl_line + .byte 0 ## DW_AT_data_member_location + .byte 6 ## Abbrev [6] 0x56:0x8 DW_TAG_member + .long 94 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 3 ## DW_AT_decl_line + .byte 0 ## DW_AT_data_member_location + .byte 7 ## Abbrev [7] 0x5e:0x53 DW_TAG_structure_type + .byte 4 ## DW_AT_byte_size + .byte 1 ## DW_AT_decl_file + .byte 3 ## DW_AT_decl_line + .byte 8 ## Abbrev [8] 0x62:0xd DW_TAG_member + .long 170 ## DW_AT_name + .long 178 ## DW_AT_type + .byte 
1 ## DW_AT_decl_file + .byte 4 ## DW_AT_decl_line + .byte 8 ## DW_AT_bit_size + .byte 0 ## DW_AT_data_bit_offset + .byte 8 ## Abbrev [8] 0x6f:0xd DW_TAG_member + .long 172 ## DW_AT_name + .long 178 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 5 ## DW_AT_decl_line + .byte 8 ## DW_AT_bit_size + .byte 8 ## DW_AT_data_bit_offset + .byte 8 ## Abbrev [8] 0x7c:0xd DW_TAG_member + .long 174 ## DW_AT_name + .long 178 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 6 ## DW_AT_decl_line + .byte 6 ## DW_AT_bit_size + .byte 16 ## DW_AT_data_bit_offset + .byte 8 ## Abbrev [8] 0x89:0xd DW_TAG_member + .long 176 ## DW_AT_name + .long 178 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 7 ## DW_AT_decl_line + .byte 2 ## DW_AT_bit_size + .byte 22 ## DW_AT_data_bit_offset + .byte 8 ## Abbrev [8] 0x96:0xd DW_TAG_member + .long 178 ## DW_AT_name + .long 178 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 8 ## DW_AT_decl_line + .byte 6 ## DW_AT_bit_size + .byte 24 ## DW_AT_data_bit_offset + .byte 8 ## Abbrev [8] 0xa3:0xd DW_TAG_member + .long 180 ## DW_AT_name + .long 178 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 9 ## DW_AT_decl_line + .byte 2 ## DW_AT_bit_size + .byte 30 ## DW_AT_data_bit_offset + .byte 0 ## End Of Children Mark + .byte 0 ## End Of Children Mark + .byte 9 ## Abbrev [9] 0xb2:0x7 DW_TAG_base_type + .long 157 ## DW_AT_name + .byte 7 ## DW_AT_encoding + .byte 4 ## DW_AT_byte_size + .byte 2 ## Abbrev [2] 0xb9:0x21 DW_TAG_variable + .long 182 ## DW_AT_name + .long 59 ## DW_AT_type + .byte 1 ## DW_AT_decl_file + .byte 17 ## DW_AT_decl_line + .byte 7 ## DW_AT_location + .byte 16 + .ascii "\201\202\220\245\006" + .byte 159 + .byte 0 ## End Of Children Mark +Ldebug_info_end0: From 34a5669ccd8b8c4edd35488a5ece407f0ed77601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Sun, 2 Aug 2020 14:36:02 +0200 Subject: [PATCH 15/23] [ORC] Fix SymbolLookupSet::containsDuplicates() --- llvm/include/llvm/ExecutionEngine/Orc/Core.h | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index a117acefd2d361..101017f89def17 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -287,7 +287,7 @@ class SymbolLookupSet { for (UnderlyingVector::size_type I = 1; I != Symbols.size(); ++I) if (Symbols[I].first == Symbols[I - 1].first) return true; - return true; + return false; } #endif From f12db8cf750bb16515ba635143ca34b0c012968a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Sun, 2 Aug 2020 14:37:33 +0200 Subject: [PATCH 16/23] [ORC] cloneToNewContext() can work with a const-ref to ThreadSafeModule --- llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h | 2 +- llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h b/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h index 58c96737e58076..82f2b7464953f1 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ThreadSafeModule.h @@ -162,7 +162,7 @@ using GVModifier = std::function; /// Clones the given module on to a new context. 
ThreadSafeModule -cloneToNewContext(ThreadSafeModule &TSMW, +cloneToNewContext(const ThreadSafeModule &TSMW, GVPredicate ShouldCloneDef = GVPredicate(), GVModifier UpdateClonedDefSource = GVModifier()); diff --git a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp index 1f4e6f1321150d..2e128dd2374439 100644 --- a/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ThreadSafeModule.cpp @@ -15,7 +15,7 @@ namespace llvm { namespace orc { -ThreadSafeModule cloneToNewContext(ThreadSafeModule &TSM, +ThreadSafeModule cloneToNewContext(const ThreadSafeModule &TSM, GVPredicate ShouldCloneDef, GVModifier UpdateClonedDefSource) { assert(TSM && "Can not clone null module"); From fa4b3147e3368f63e27b86ef66cd35f484ceb6d6 Mon Sep 17 00:00:00 2001 From: River Riddle Date: Thu, 13 Aug 2020 12:04:57 -0700 Subject: [PATCH 17/23] [mlir][DialectConversion] Update the documentation for dialect conversion This revision updates the documentation for dialect conversion, as many concepts have changed/evolved over time. Differential Revision: https://reviews.llvm.org/D85167 --- mlir/docs/DialectConversion.md | 345 +++++++++++++++++++++++---------- 1 file changed, 242 insertions(+), 103 deletions(-) diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md index c7174147b72eb0..8a308dd6788224 100644 --- a/mlir/docs/DialectConversion.md +++ b/mlir/docs/DialectConversion.md @@ -7,7 +7,7 @@ of pattern-based operation rewriting patterns. 
[TOC] -To utilize the framework, a few things must be provided: +The dialect conversion framework consists of the following components: * A [Conversion Target](#conversion-target) * A set of [Rewrite Patterns](#rewrite-pattern-specification) @@ -15,41 +15,44 @@ To utilize the framework, a few things must be provided: ## Modes of Conversion -When applying a conversion to a set of operations, there are several conversion -modes that can be selected from: +When applying a conversion to a set of operations, there are several different +conversion modes that may be selected from: * Partial Conversion - A partial conversion will legalize as many operations to the target as possible, but will allow pre-existing operations that were not - explicitly marked as `illegal` to remain unconverted. This allows for - partially lowering parts of the module in the presence of unknown + explicitly marked as "illegal" to remain unconverted. This allows for + partially lowering parts of the input in the presence of unknown operations. - A partial conversion can be applied via `applyPartialConversion`. * Full Conversion - - A full conversion is only successful if all operations are properly - legalized to the given conversion target. This ensures that only known - operations will exist after the conversion process. + - A full conversion legalizes all input operations, and is only successful + if all operations are properly legalized to the given conversion target. + This ensures that only known operations will exist after the conversion + process. - A full conversion can be applied via `applyFullConversion`. * Analysis Conversion - An analysis conversion will analyze which operations are legalizable to - the given conversion target if a conversion were to be applied. Note - that no rewrites, or transformations, are actually applied to the input + the given conversion target if a conversion were to be applied. 
This is + done by performing a 'partial' conversion and recording which operations + would have been successfully converted if successful. Note that no + rewrites, or transformations, are actually applied to the input operations. - An analysis conversion can be applied via `applyAnalysisConversion`. ## Conversion Target -The conversion target is the formal definition of what is considered to be legal +The conversion target is a formal definition of what is considered to be legal during the conversion process. The final operations generated by the conversion framework must be marked as legal on the `ConversionTarget` for the rewrite to -be a success. Existing operations need not always be legal, though; see the -different conversion modes for why. Operations and dialects may be marked with -any of the provided legality actions below: +be a success. Depending on the conversion mode, existing operations need not +always be legal. Operations and dialects may be marked with any of the provided +legality actions below: * Legal @@ -68,7 +71,7 @@ any of the provided legality actions below: * Illegal - This action signals that no instance of a given operation is legal. - Operations marked as `illegal` must always be converted for the + Operations marked as "illegal" must always be converted for the conversion to be successful. This action also allows for selectively marking specific operations as illegal in an otherwise legal dialect. @@ -123,13 +126,12 @@ struct MyTarget : public ConversionTarget { ### Recursive Legality -In some cases, it may be desirable to mark entire regions of operations as -legal. This provides an additional granularity of context to the concept of -"legal". The `ConversionTarget` supports marking operations, that were -previously added as `Legal` or `Dynamic`, as `recursively` legal. Recursive -legality means that if an operation instance is legal, either statically or -dynamically, all of the operations nested within are also considered legal. 
An -operation can be marked via `markOpRecursivelyLegal<>`: +In some cases, it may be desirable to mark entire regions as legal. This +provides an additional granularity of context to the concept of "legal". If an +operation is marked recursively legal, either statically or dynamically, then +all of the operations nested within are also considered legal even if they would +otherwise be considered "illegal". An operation can be marked via +`markOpRecursivelyLegal<>`: ```c++ ConversionTarget &target = ...; @@ -149,14 +151,12 @@ target.markOpRecursivelyLegal([](MyOp op) { ... }); ## Rewrite Pattern Specification After the conversion target has been defined, a set of legalization patterns -must be provided to transform illegal operations into legal ones. The patterns -supplied here, that do not [require type changes](#conversion-patterns), are the -same as those described in the -[quickstart rewrites guide](Tutorials/QuickstartRewrites.md#adding-patterns), but have a -few additional [restrictions](#restrictions). The patterns provided do not need -to generate operations that are directly legal on the target. The framework will -automatically build a graph of conversions to convert non-legal operations into -a set of legal ones. +must be provided to transform illegal operations into legal ones. The structure +of the patterns supplied here is the same as those described in the +[quickstart rewrites guide](Tutorials/QuickstartRewrites.md#adding-patterns). +The patterns provided do not need to generate operations that are directly legal +on the target. The framework will automatically build a graph of conversions to +convert non-legal operations into a set of legal ones. As an example, say you define a target that supports one operation: `foo.add`. 
When providing the following patterns: [`bar.add` -> `baz.add`, `baz.add` -> @@ -165,38 +165,139 @@ When providing the following patterns: [`bar.add` -> `baz.add`, `baz.add` -> means that you don’t have to define a direct legalization pattern for `bar.add` -> `foo.add`. -### Restrictions +### Conversion Patterns + +Along with the general `RewritePattern` classes, the conversion framework +provides a special type of rewrite pattern that can be used when a pattern +relies on interacting with constructs specific to the conversion process, the +`ConversionPattern`. For example, the conversion process does not necessarily +update operations in-place and instead creates a mapping of events such as +replacements and erasures, and only applies them when the entire conversion +process is successful. Certain classes of patterns rely on using the +updated/remapped operands of an operation, such as when the types of results +defined by an operation have changed. The general Rewrite Patterns can no longer +be used in these situations, as the types of the operands of the operation being +matched will not correspond with those expected by the user. This pattern +provides, as an additional argument to the `matchAndRewrite` and `rewrite` +methods, the list of operands that the operation should use after conversion. If +an operand was the result of a non-converted operation, for example if it was +already legal, the original operand is used. This means that the operands +provided always have a 1-1 non-null correspondence with the operands on the +operation. The original operands of the operation are still intact and may be +inspected as normal. These patterns also utilize a special `PatternRewriter`, +`ConversionPatternRewriter`, that provides special hooks for use with the +conversion infrastructure. -The framework processes operations in topological order, trying to legalize them -individually. 
As such, patterns used in the conversion framework have a few -additional restrictions: +```c++ +struct MyConversionPattern : public ConversionPattern { + /// The `matchAndRewrite` hooks on ConversionPatterns take an additional + /// `operands` parameter, containing the remapped operands of the original + /// operation. + virtual LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const; +}; +``` -1. If a pattern matches, it must erase or replace the op it matched on. - Operations can *not* be updated in place. -2. Match criteria should not be based on the IR outside of the op itself. The - preceding ops will already have been processed by the framework (although it - may not update uses), and the subsequent IR will not yet be processed. This - can create confusion if a pattern attempts to match against a sequence of - ops (e.g. rewrite A + B -> C). That sort of rewrite should be performed in a - separate pass. +#### Type Safety + +The types of the remapped operands provided to a conversion pattern must be of a +type expected by the pattern. The expected types of a pattern are determined by +a provided [TypeConverter](#type-converter). If no type converter is provided, +the types of the remapped operands are expected to match the types of the +original operands. If a type converter is provided, the types of the remapped +operands are expected to be legal as determined by the converter. If the +remapped operand types are not of an expected type, and a materialization to the +expected type could not be performed, the pattern fails application before the +`matchAndRewrite` hook is invoked. This ensures that patterns do not have to +explicitly ensure type safety, or sanitize the types of the incoming remapped +operands. More information on type conversion is detailed in the +[dedicated section](#type-conversion) below. 
## Type Conversion It is sometimes necessary as part of a conversion to convert the set types of being operated on. In these cases, a `TypeConverter` object may be defined that -details how types should be converted. The `TypeConverter` is used by patterns -and by the general conversion infrastructure to convert the signatures of blocks -and regions. +details how types should be converted when interfacing with a pattern. A +`TypeConverter` may be used to convert the signatures of block arguments and +regions, to define the expected inputs types of the pattern, and to reconcile +type differences in general. ### Type Converter -As stated above, the `TypeConverter` contains several hooks for detailing how to -convert types. Several of these hooks are detailed below: +The `TypeConverter` contains several hooks for detailing how to convert types, +and how to materialize conversions between types in various situations. The two +main aspects of the `TypeConverter` are conversion and materialization. + +A `conversion` describes how a given illegal source `Type` should be converted +to N target types. If the source type is already "legal", it should convert to +itself. Type conversions are specified via the `addConversion` method described +below. + +A `materialization` describes how a set of values should be converted to a +single value of a desired type. An important distinction with a `conversion` is +that a `materialization` can produce IR, whereas a `conversion` cannot. These +materializations are used by the conversion framework to ensure type safety +during the conversion process. There are several types of materializations +depending on the situation. + +* Argument Materialization + + - An argument materialization is used when converting the type of a block + argument during a [signature conversion](#region-signature-conversion). 
+ +* Source Materialization + + - A source materialization converts from a value with a "legal" target + type, back to a specific source type. This is used when an operation is + "legal" during the conversion process, but contains a use of an illegal + type. This may happen during a conversion where some operations are + converted to those with different resultant types, but still retain + users of the original type system. + - This materialization is used in the following situations: + * When a block argument has been converted to a different type, but + the original argument still has users that will remain live after + the conversion process has finished. + * When the result type of an operation has been converted to a + different type, but the original result still has users that will + remain live after the conversion process is finished. + +* Target Materialization + + - A target materialization converts from a value with an "illegal" source + type, to a value of a "legal" type. This is used when a pattern expects + the remapped operands to be of a certain set of types, but the original + input operands have not been converted. This may happen during a + conversion where some operations are converted to those with different + resultant types, but still retain uses of the original type system. + - This materialization is used in the following situations: + * When the remapped operands of a + [conversion pattern](#conversion-patterns) are not legal for the + type conversion provided by the pattern. + +If a converted value is used by an operation that isn't converted, it needs a +conversion back to the `source` type, hence source materialization; if an +unconverted value is used by an operation that is being converted, it needs +conversion to the `target` type, hence target materialization. + +As noted above, the conversion process guarantees that the type contract of the +IR is preserved during the conversion. 
This means that the types of value uses +will not implicitly change during the conversion process. When the type of a +value definition, either block argument or operation result, is being changed, +the users of that definition must also be updated during the conversion process. +If they aren't, a type conversion must be materialized to ensure that a value of +the expected type is still present within the IR. If a target materialization is +required, but cannot be performed, the pattern application fails. If a source +materialization is required, but cannot be performed, the entire conversion +process fails. + +Several of the available hooks are detailed below: ```c++ class TypeConverter { public: - /// Register a conversion function. A conversion function must be convertible + /// Register a conversion function. A conversion function defines how a given + /// source type should be converted. A conversion function must be convertible /// to any of the following forms(where `T` is a class derived from `Type`: /// * Optional(T) /// - This form represents a 1-1 type conversion. It should return nullptr @@ -210,56 +311,53 @@ class TypeConverter { /// existing value are expected to be removed during conversion. If /// `llvm::None` is returned, the converter is allowed to try another /// conversion function to perform the conversion. - /// - /// When attempting to convert a type, e.g. via `convertType`, the - /// `TypeConverter` will invoke each of the converters starting with the one - /// most recently registered. - template - void addConversion(ConversionFnT &&callback); - - /// Register a materialization function, which must be convertibe to the - /// following form - /// `Optional(PatternRewriter &, T, ValueRange, Location)`, - /// where `T` is any subclass of `Type`. This function is responsible for - /// creating an operation, using the PatternRewriter and Location provided, - /// that "casts" a range of values into a single value of the given type `T`. 
- /// It must return a Value of the converted type on success, an `llvm::None` - /// if it failed but other materialization can be attempted, and `nullptr` on - /// unrecoverable failure. It will only be called for (sub)types of `T`. - /// Materialization functions must be provided when a type conversion - /// results in more than one type, or if a type conversion may persist after - /// the conversion has finished. - template - void addMaterialization(FnT &&callback); -}; -``` - -### Conversion Patterns - -When type conversion comes into play, the general Rewrite Patterns can no longer -be used. This is due to the fact that the operands of the operation being -matched will not correspond with the operands of the correct type as determined -by `TypeConverter`. The operation rewrites on type boundaries must thus use a -special pattern, the `ConversionPattern`. This pattern provides, as an -additional argument to the `matchAndRewrite` and `rewrite` methods, the set of -remapped operands corresponding to the desired type. These patterns also utilize -a special `PatternRewriter`, `ConversionPatternRewriter`, that provides special -hooks for use with the conversion infrastructure. + /// Note: When attempting to convert a type, e.g. via 'convertType', the + /// most recently added conversions will be invoked first. + template ::template arg_t<0>> + void addConversion(FnT &&callback) { + registerConversion(wrapCallback(std::forward(callback))); + } -```c++ -struct MyConversionPattern : public ConversionPattern { - /// The `matchAndRewrite` hooks on ConversionPatterns take an additional - /// `operands` parameter, containing the remapped operands of the original - /// operation. 
- virtual LogicalResult - matchAndRewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const; + /// Register a materialization function, which must be convertible to the + /// following form: + /// `Optional (OpBuilder &, T, ValueRange, Location)`, + /// where `T` is any subclass of `Type`. + /// This function is responsible for creating an operation, using the + /// OpBuilder and Location provided, that "converts" a range of values into a + /// single value of the given type `T`. It must return a Value of the + /// converted type on success, an `llvm::None` if it failed but other + /// materialization can be attempted, and `nullptr` on unrecoverable failure. + /// It will only be called for (sub)types of `T`. + /// + /// This method registers a materialization that will be called when + /// converting an illegal block argument type, to a legal type. + template ::template arg_t<1>> + void addArgumentMaterialization(FnT &&callback) { + argumentMaterializations.emplace_back( + wrapMaterialization(std::forward(callback))); + } + /// This method registers a materialization that will be called when + /// converting a legal type to an illegal source type. This is used when + /// conversions to an illegal type must persist beyond the main conversion. + template ::template arg_t<1>> + void addSourceMaterialization(FnT &&callback) { + sourceMaterializations.emplace_back( + wrapMaterialization(std::forward(callback))); + } + /// This method registers a materialization that will be called when + /// converting type from an illegal, or source, type to a legal type. + template ::template arg_t<1>> + void addTargetMaterialization(FnT &&callback) { + targetMaterializations.emplace_back( + wrapMaterialization(std::forward(callback))); + } }; ``` -These patterns have the same [restrictions](#restrictions) as the basic rewrite -patterns used in dialect conversion. 
- ### Region Signature Conversion From the perspective of type conversion, the types of block arguments are a bit @@ -268,15 +366,16 @@ different operations. Given this, the conversion of the types for blocks must be done explicitly via a conversion pattern. To convert the types of block arguments within a Region, a custom hook on the `ConversionPatternRewriter` must be invoked; `convertRegionTypes`. This hook uses a provided type converter to -apply type conversions to all blocks within the region, and all blocks that move -into that region. This hook also takes an optional -`TypeConverter::SignatureConversion` parameter that applies a custom conversion -to the entry block of the region. The types of the entry block arguments are -often tied semantically to details on the operation, e.g. FuncOp, AffineForOp, -etc. To convert the signature of just the region entry block, and not any other -blocks within the region, the `applySignatureConversion` hook may be used -instead. A signature conversion, `TypeConverter::SignatureConversion`, can be -built programmatically: +apply type conversions to all blocks within a given region, and all blocks that +move into that region. As noted above, the conversions performed by this method +use the argument materialization hook on the `TypeConverter`. This hook also +takes an optional `TypeConverter::SignatureConversion` parameter that applies a +custom conversion to the entry block of the region. The types of the entry block +arguments are often tied semantically to details on the operation, e.g. FuncOp, +AffineForOp, etc. To convert the signature of just the region entry block, and +not any other blocks within the region, the `applySignatureConversion` hook may +be used instead. 
A signature conversion, `TypeConverter::SignatureConversion`, +can be built programmatically: ```c++ class SignatureConversion { @@ -303,3 +402,43 @@ public: The `TypeConverter` provides several default utilities for signature conversion and legality checking: `convertSignatureArgs`/`convertBlockSignature`/`isLegal(Region *|Type)`. + +## Debugging + +To debug the execution of the dialect conversion framework, +`-debug-only=dialect-conversion` may be used. This command line flag activates +LLVM's debug logging infrastructure solely for the conversion framework. The +output is formatted as a tree structure, mirroring the structure of the +conversion process. This output contains all of the actions performed by the +rewriter, how generated operations get legalized, and why they fail. + +Example output is shown below: + +``` +//===-------------------------------------------===// +Legalizing operation : 'std.return'(0x608000002e20) { + "std.return"() : () -> () + + * Fold { + } -> FAILURE : unable to fold + + * Pattern : 'std.return -> ()' { + ** Insert : 'spv.Return'(0x6070000453e0) + ** Replace : 'std.return'(0x608000002e20) + + //===-------------------------------------------===// + Legalizing operation : 'spv.Return'(0x6070000453e0) { + "spv.Return"() : () -> () + + } -> SUCCESS : operation marked legal by the target + //===-------------------------------------------===// + } -> SUCCESS : pattern applied successfully +} -> SUCCESS +//===-------------------------------------------===// +``` + +This output is describing the legalization of an `std.return` operation. We +first try to legalize by folding the operation, but that is unsuccessful for +`std.return`. From there, a pattern is applied that replaces the `std.return` +with a `spv.Return`. The newly generated `spv.Return` is then processed for +legalization, but is found to already be legal as per the target. 
From f7a13479b809cdeb9d63d0daa0d6ab61f04d5f7a Mon Sep 17 00:00:00 2001 From: River Riddle Date: Thu, 13 Aug 2020 12:05:04 -0700 Subject: [PATCH 18/23] [mlir][docs] Update/Add documentation for MLIRs Pattern Rewrite infrastructure This infrastructure has evolved a lot over the course of MLIRs lifetime, and has never truly been documented outside of rationale or proposals. This revision aims to document the infrastructure and user facing API, with the rationale specific portions moved to the Rationale folder and updated. Differential Revision: https://reviews.llvm.org/D85260 --- mlir/docs/DialectConversion.md | 12 +- mlir/docs/GenericDAGRewriter.md | 415 ------------------ mlir/docs/PatternRewriter.md | 256 +++++++++++ mlir/docs/Rationale/MLIRForGraphAlgorithms.md | 2 +- .../Rationale/RationaleGenericDAGRewriter.md | 286 ++++++++++++ mlir/docs/Tutorials/Toy/Ch-3.md | 2 +- 6 files changed, 550 insertions(+), 423 deletions(-) delete mode 100644 mlir/docs/GenericDAGRewriter.md create mode 100644 mlir/docs/PatternRewriter.md create mode 100644 mlir/docs/Rationale/RationaleGenericDAGRewriter.md diff --git a/mlir/docs/DialectConversion.md b/mlir/docs/DialectConversion.md index 8a308dd6788224..4d3be5ed2a98cd 100644 --- a/mlir/docs/DialectConversion.md +++ b/mlir/docs/DialectConversion.md @@ -151,12 +151,12 @@ target.markOpRecursivelyLegal([](MyOp op) { ... }); ## Rewrite Pattern Specification After the conversion target has been defined, a set of legalization patterns -must be provided to transform illegal operations into legal ones. The structure -of the patterns supplied here is the same as those described in the -[quickstart rewrites guide](Tutorials/QuickstartRewrites.md#adding-patterns). -The patterns provided do not need to generate operations that are directly legal -on the target. The framework will automatically build a graph of conversions to -convert non-legal operations into a set of legal ones. +must be provided to transform illegal operations into legal ones. 
The patterns +supplied here have the same structure and restrictions as those described in the +main [Pattern](PatternRewriter.md) documentation. The patterns provided do not +need to generate operations that are directly legal on the target. The framework +will automatically build a graph of conversions to convert non-legal operations +into a set of legal ones. As an example, say you define a target that supports one operation: `foo.add`. When providing the following patterns: [`bar.add` -> `baz.add`, `baz.add` -> diff --git a/mlir/docs/GenericDAGRewriter.md b/mlir/docs/GenericDAGRewriter.md deleted file mode 100644 index a187c989889016..00000000000000 --- a/mlir/docs/GenericDAGRewriter.md +++ /dev/null @@ -1,415 +0,0 @@ -# Generic DAG Rewriter Infrastructure - -## Introduction and Motivation - -The goal of a compiler IR is to represent code - at various levels of -abstraction which pose different sets of tradeoffs in terms of representational -capabilities and ease of transformation. However, the ability to represent code -is not itself very useful - you also need to be able to implement those -transformations. - -There are many different sorts of compiler transformations, but this document -focuses on a particularly important class of transformation that comes up -repeatedly at scale, and is important for the immediate goals of MLIR: that of -pattern matching on a set of operations and replacing with another set. This is -the key algorithm required to implement the "op fission" algorithm used by the -tf2xla bridge, pattern matching rewrites from TF ops to TF/Lite, peephole -optimizations like "eliminate identity nodes" or "replace x+0 with x", as well -as a useful abstraction to implement optimization algorithms for MLIR graphs at -all levels. 
- -A particular strength of MLIR (and a major difference vs other compiler -infrastructures like LLVM, GCC, XLA, TensorFlow, etc) is that it uses a single -compiler IR to represent code at multiple levels of abstraction: an MLIR -operation can be a "TensorFlow operation", an "XLA HLO", a "TF Lite -FlatBufferModel op", a TPU LLO instruction, an LLVM IR instruction (transitively -including X86, Lanai, CUDA, and other target specific instructions), or anything -else that the MLIR type system can reasonably express. Because MLIR spans such a -wide range of different problems, a single infrastructure for performing -graph-to-graph rewrites can help solve many diverse domain challenges, including -TensorFlow graph level down to the machine code level. - -[Static single assignment](https://en.wikipedia.org/wiki/Static_single_assignment_form) -(SSA) representations like MLIR make it easy to access the operands and "users" -of an operation. As such, a natural abstraction for these graph-to-graph -rewrites is that of DAG pattern matching: clients define DAG tile patterns, and -each pattern includes a result DAG to produce and the cost of the result (or, -inversely, the benefit of doing the replacement). A common infrastructure -efficiently finds and perform the rewrites. - -While this concept is simple, the details are more nuanced. This proposal -defines and explores a set of abstractions that we feel can solve a wide range -of different problems, and can be applied to many different sorts of problems -that MLIR is - and is expected to - face over time. We do this by separating the -pattern definition and matching algorithm from the "driver" of the computation -loop, and make space for the patterns to be defined declaratively in the future. - -## Related Work - -There is a huge amount of related work to consider, given that pretty much every -compiler in existence has to solve this problem many times over. 
Here are a few -graph rewrite systems we have used, along with the pros and cons of this related -work. One unifying problem with all of these is that these systems are only -trying to solve one particular and usually narrow problem: our proposal would -like to solve many of these problems with a single infrastructure. Of these, the -most similar design to our proposal is the LLVM DAG-to-DAG instruction selection -algorithm at the end. - -### Constant folding - -A degenerate but pervasive case of DAG-to-DAG pattern matching is constant -folding: given an operation whose operands contain constants can often be folded -to a result constant value. - -MLIR already has constant folding routines which provide a simpler API than a -general DAG-to-DAG pattern matcher, and we expect it to remain because the -simpler contract makes it applicable in some cases that a generic matcher would -not. For example, a DAG-rewrite can remove arbitrary nodes in the current -function, which could invalidate iterators. Constant folding as an API does not -remove any nodes, it just provides a (list of) constant values and allows the -clients to update their data structures as necessary. - -### AST-Level Pattern Matchers - -The literature is full of source-to-source translators which transform -identities in order to improve performance (e.g. transforming `X*0` into `0`). -One large example that I'm aware of is the GCC `fold` function, which performs -[many optimizations](https://github.com/gcc-mirror/gcc/blob/master/gcc/fold-const.c) -on ASTs. Clang has -[similar routines](http://releases.llvm.org/3.5.0/tools/clang/docs/InternalsManual.html#constant-folding-in-the-clang-ast) -for simple constant folding of expressions (as required by the C++ standard) but -doesn't perform general optimizations on its ASTs. - -The primary downside of tree optimizers is that you can't see across operations -that have multiple uses. 
It is -[well known in literature](https://llvm.org/pubs/2008-06-LCTES-ISelUsingSSAGraphs.pdf) -that DAG pattern matching is more powerful than tree pattern matching, but OTOH, -DAG pattern matching can lead to duplication of computation which needs to be -checked for. - -### "Combiners" and other peephole optimizers - -Compilers end up with a lot of peephole optimizers for various things, e.g. the -GCC -["combine" routines](https://github.com/gcc-mirror/gcc/blob/master/gcc/combine.c) -(which try to merge two machine instructions into a single one), the LLVM -[Inst Combine](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/) -[pass](https://llvm.org/docs/Passes.html#instcombine-combine-redundant-instructions), -LLVM's -[DAG Combiner](https://github.com/llvm-mirror/llvm/blob/master/lib/CodeGen/SelectionDAG/DAGCombiner.cpp), -the Swift compiler's -[SIL Combiner](https://github.com/apple/swift/tree/master/lib/SILOptimizer/SILCombiner), -etc. These generally match one or more operations and produce zero or more -operations as a result. The LLVM -[Legalization](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/) -infrastructure has a different outer loop but otherwise works the same way. - -These passes have a lot of diversity, but also have a unifying structure: they -mostly have a worklist outer loop which visits operations. They then use the C++ -visitor pattern (or equivalent) to switch over the class of operation and -dispatch to a method. That method contains a long list of hand-written C++ code -that pattern-matches various special cases. LLVM introduced a "match" function -that allows writing patterns in a somewhat more declarative style using template -metaprogramming (MLIR has similar facilities). 
Here's a simple example: - -```c++ - // Y - (X + 1) --> ~X + Y - if (match(Op1, m_OneUse(m_Add(m_Value(X), m_One())))) - return BinaryOperator::CreateAdd(Builder.CreateNot(X), Op0); -``` - -Here is a somewhat more complicated one (this is not the biggest or most -complicated :) - -```c++ - // C2 is ODD - // LHS = XOR(Y,C1), Y = AND(Z,C2), C1==(C2+1) => LHS == NEG(OR(Z, ~C2)) - // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2)) - if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1)))) - if (C1->countTrailingZeros() == 0) - if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) { - Value NewOr = Builder.CreateOr(Z, ~(*C2)); - return Builder.CreateSub(RHS, NewOr, "sub"); - } -``` - -These systems are simple to set up, and pattern matching templates have some -advantages (they are extensible for new sorts of sub-patterns, look compact at -point of use). OTOH, they have lots of well known problems, for example: - -* These patterns are very error prone to write, and contain lots of - redundancies. -* The IR being matched often has identities (e.g. when matching commutative - operators) and the C++ code has to handle it manually - take a look at - [the full code](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/InstCombine/InstCombineAddSub.cpp?view=markup#l775) - for checkForNegativeOperand that defines the second pattern). -* The matching code compiles slowly, both because it generates tons of code - and because the templates instantiate slowly. -* Adding new patterns (e.g. for count leading zeros in the example above) is - awkward and doesn't often happen. -* The cost model for these patterns is not really defined - it is emergent - based on the order the patterns are matched in code. -* They are non-extensible without rebuilding the compiler. -* It isn't practical to apply theorem provers and other tools to these - patterns - they cannot be reused for other purposes. 
- -In addition to structured "combiners" like these, there are lots of ad-hoc -systems like the -[LLVM Machine code peephole optimizer](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp?view=markup) -which are related. - -### LLVM's DAG-to-DAG Instruction Selection Infrastructure - -The instruction selection subsystem in LLVM is the result of many years worth of -iteration and discovery, driven by the need for LLVM to support code generation -for lots of targets, the complexity of code generators for modern instruction -sets (e.g. X86), and the fanatical pursuit of reusing code across targets. Eli -wrote a -[nice short overview](https://eli.thegreenplace.net/2013/02/25/a-deeper-look-into-the-llvm-code-generator-part-1) -of how this works, and the -[LLVM documentation](https://llvm.org/docs/CodeGenerator.html#select-instructions-from-dag) -describes it in more depth including its advantages and limitations. It allows -writing patterns like this. - -``` -def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), - (BLCI64rr GR64:$src)>; -``` - -This example defines a matcher for the -["blci" instruction](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_\(Trailing_Bit_Manipulation\)) -in the -[X86 target description](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.td?view=markup), -there are many others in that file (look for `Pat<>` patterns, since they aren't -entangled in details of the compiler like assembler/disassembler generation -logic). - -For our purposes, there is much to like about this system, for example: - -* It is defined in a declarative format. -* It is extensible to target-defined operations. -* It automates matching across identities, like commutative patterns. -* It allows custom abstractions and intense factoring of target-specific - commonalities. -* It generates compact code - it compiles into a state machine, which is - interpreted. 
-* It allows the instruction patterns to be defined and reused for multiple - purposes. -* The patterns are "type checked" at compile time, detecting lots of bugs - early and eliminating redundancy from the pattern specifications. -* It allows the use of general C++ code for weird/complex cases. - -While there is a lot that is good here, there is also a lot of bad things: - -* All of this machinery is only applicable to instruction selection. Even - directly adjacent problems like the DAGCombiner and Legalizer can't use it. -* This isn't extensible at compiler runtime, you have to rebuild the compiler - to extend it. -* The error messages when failing to match a pattern - [are not exactly optimal](https://www.google.com/search?q=llvm+cannot+select). -* It has lots of implementation problems and limitations (e.g. can't write a - pattern for a multi-result operation) as a result of working with the - awkward SelectionDAG representation and being designed and implemented - lazily. -* This stuff all grew organically over time and has lots of sharp edges. - -### Summary - -MLIR will face a wide range of pattern matching and graph rewrite problems, and -one of the major advantages of having a common representation for code at -multiple levels that it allows us to invest in - and highly leverage - a single -infra for doing this sort of work. - -## Goals - -This proposal includes support for defining pattern matching and rewrite -algorithms on MLIR. We'd like these algorithms to encompass many problems in the -MLIR space, including 1-to-N expansions (e.g. as seen in the TF/XLA bridge when -lowering a "tf.AddN" to multiple "add" HLOs), M-to-1 patterns (as seen in -Grappler optimization passes, e.g. that convert multiple/add into a single -muladd op), as well as general M-to-N patterns (e.g. instruction selection for -target instructions). 
Patterns should have a cost associated with them, and the -common infrastructure should be responsible for sorting out the lowest cost -match for a given application. - -We separate the task of picking a particular locally optimal pattern from a -given root node, the algorithm used to rewrite an entire graph given a -particular set of goals, and the definition of the patterns themselves. We do -this because DAG tile pattern matching is NP complete, which means that there -are no known polynomial time algorithms to optimally solve this problem. -Additionally, we would like to support iterative rewrite algorithms that -progressively transform the input program through multiple steps. Furthermore, -we would like to support many different sorts of clients across the MLIR stack, -and they may have different tolerances for compile time cost, different demands -for optimality, and other algorithmic goals or constraints. - -We aim for MLIR transformations to be easy to implement and reduce the -likelihood for compiler bugs. We expect there to be a very very large number of -patterns that are defined over time, and we believe that these sorts of patterns -will have a very large number of legality/validity constraints - many of which -are difficult to reason about in a consistent way, may be target specific, and -whose implementation may be particularly bug-prone. As such, we aim to design the -API around pattern definition to be simple, resilient to programmer errors, and -allow separation of concerns between the legality of the nodes generated from -the idea of the pattern being defined. - -Finally, error handling is a topmost concern: in addition to allowing patterns -to be defined in a target-independent way that may not apply for all hardware, -we also want failure for any pattern to match to be diagnosable in a reasonable -way. 
To be clear, this is not a solvable problem in general - the space of -malfunction is too great to be fully enumerated and handled optimally, but there -are better and worse ways to handle the situation. MLIR is already designed to -represent the provenance of an operation well. This project aims to propagate -that provenance information precisely, as well as diagnose pattern match -failures with the rationale for why a set of patterns do not apply. - -### Non goals - -This proposal doesn't aim to solve all compiler problems, it is simply a -DAG-to-DAG pattern matching system, starting with a greedy driver algorithm. -Compiler algorithms that require global dataflow analysis (e.g. common -subexpression elimination, conditional constant propagation, and many many -others) will not be directly solved by this infrastructure. - -This proposal is limited to DAG patterns, which (by definition) prevent the -patterns from seeing across cycles in a graph. In an SSA-based IR like MLIR, -this means that these patterns don't see across PHI nodes / basic block -arguments. We consider this acceptable given the set of problems we are trying -to solve - we don't know of any other system that attempts to do so, and -consider the payoff of worrying about this to be low. - -This design includes the ability for DAG patterns to have associated costs -(benefits), but those costs are defined in terms of magic numbers (typically -equal to the number of nodes being replaced). For any given application, the -units of magic numbers will have to be defined. - -## Overall design - -We decompose the problem into four major pieces: - -1. the code that is used to define patterns to match, cost, and their - replacement actions -1. the driver logic to pick the best match for a given root node -1. the client that is implementing some transformation (e.g. a combiner) -1. (future) the subsystem that allows patterns to be described with a - declarative syntax, which sugars step #1. 
- -We sketch the first three of these pieces, each in turn. This is not intended to -be a concrete API proposal, merely to describe the design - -### Defining Patterns - -Each pattern will be an instance of a mlir::Pattern class, whose subclasses -implement methods like this. Note that this API is meant for exposition, the -actual details are different for efficiency and coding standards reasons (e.g. -the memory management of `PatternState` is not specified below, etc): - -```c++ -class Pattern { - /// Return the benefit (the inverse of "cost") of matching this pattern. The - /// benefit of a Pattern is always static - rewrites that may have dynamic - /// benefit can be instantiated multiple times (different Pattern instances) - /// for each benefit that they may return, and be guarded by different match - /// condition predicates. - PatternBenefit getBenefit() const { return benefit; } - - /// Return the root node that this pattern matches. Patterns that can - /// match multiple root types are instantiated once per root. - OperationName getRootKind() const { return rootKind; } - - /// Attempt to match against code rooted at the specified operation, - /// which is the same operation code as getRootKind(). On failure, this - /// returns a None value. On success it a (possibly null) pattern-specific - /// state wrapped in a Some. This state is passed back into its rewrite - /// function if this match is selected. - virtual Optional match(Operation *op) const = 0; - - /// Rewrite the IR rooted at the specified operation with the result of - /// this pattern, generating any new operations with the specified - /// rewriter. If an unexpected error is encountered (an internal - /// compiler error), it is emitted through the normal MLIR diagnostic - /// hooks and the IR is left in a valid state. 
- virtual void rewrite(Operation *op, PatternState *state, - PatternRewriter &rewriter) const; -}; -``` - -In practice, the first patterns we implement will directly subclass and -implement this stuff, but we will define some helpers to reduce boilerplate. -When we have a declarative way to describe patterns, this should be -automatically generated from the description. - -Instances of `Pattern` have a benefit that is static upon construction of the -pattern instance, but may be computed dynamically at pattern initialization -time, e.g. allowing the benefit to be derived from domain specific information, -like the target architecture). This limitation allows us MLIR to (eventually) -perform pattern fusion and compile patterns into an efficient state machine, and -[Thier, Ertl, and Krall](https://dl.acm.org/citation.cfm?id=3179501) have shown -that match predicates eliminate the need for dynamically computed costs in -almost all cases: you can simply instantiate the same pattern one time for each -possible cost and use the predicate to guard the match. - -The two-phase nature of this API (match separate from rewrite) is important for -two reasons: 1) some clients may want to explore different ways to tile the -graph, and only rewrite after committing to one tiling. 2) We want to support -runtime extensibility of the pattern sets, but want to be able to statically -compile the bulk of known patterns into a state machine at "compiler compile -time". Both of these reasons lead to us needing to match multiple patterns -before committing to an answer. - -### Picking and performing a replacement - -In the short term, this API can be very simple, something like this can work and -will be useful for many clients: - -```c++ -class PatternMatcher { - // Create a pattern matcher with a bunch of patterns. This constructor - // looks across all of the specified patterns, and builds an internal - // data structure that allows efficient matching. 
- PatternMatcher(ArrayRef patterns); - - // Given a specific operation, see if there is some rewrite that is - // interesting. If so, return success and return the list of new - // operations that were created. If not, return failure. - bool matchAndRewrite(Operation *op, - SmallVectorImpl &newlyCreatedOps); -}; -``` - -In practice the interesting part of this class is the acceleration structure it -builds internally. It buckets up the patterns by root operation, and sorts them -by their static benefit. When performing a match, it tests any dynamic patterns, -then tests statically known patterns from highest to lowest benefit. - -### First Client: A Greedy Worklist Combiner - -We expect that there will be lots of clients for this, but a simple greedy -worklist-driven combiner should be powerful enough to serve many important ones, -including the -[TF2XLA op expansion logic](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/tf2xla/kernels), -many of the pattern substitution passes of the -[TOCO compiler](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/toco) -for TF-Lite, many -[Grappler](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/grappler) -passes, and other general performance optimizations for applying identities. - -The structure of this algorithm is straight-forward, here is pseudo code: - -* Walk a function in preorder, adding each operation to a worklist. -* While the worklist is non-empty, pull something off the back (processing - things generally in postorder) - * Perform matchAndRewrite on the operation. If failed, continue to the - next operation. - * On success, add the newly created ops to the worklist and continue. - -## Future directions - -It is important to get implementation and usage experience with this, and many -patterns can be defined using this sort of framework. Over time, we can look to -make it easier to declare patterns in a declarative form (e.g. 
with the LLVM -tblgen tool or something newer/better). Once we have that, we can define an -internal abstraction for describing the patterns to match, allowing better high -level optimization of patterns (including fusion of the matching logic across -patterns, which the LLVM instruction selector does) and allow the patterns to be -defined without rebuilding the compiler itself. diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md new file mode 100644 index 00000000000000..2a2c30d98e04a7 --- /dev/null +++ b/mlir/docs/PatternRewriter.md @@ -0,0 +1,256 @@ +# Pattern Rewriting : Generic DAG-to-DAG Rewriting + +[TOC] + +This document details the design and API of the pattern rewriting infrastructure +present in MLIR, a general DAG-to-DAG transformation framework. This framework +is widely used throughout MLIR for canonicalization, conversion, and general +transformation. + +For an introduction to DAG-to-DAG transformation, and the rationale behind this +framework please take a look at the +[Generic DAG Rewriter Rationale](Rationale/RationaleGenericDAGRewriter.md). + +## Introduction + +The pattern rewriting framework can largely be decomposed into two parts: +Pattern Definition and Pattern Application. + +## Defining Patterns + +Patterns are defined by inheriting from the `RewritePattern` class. This class +represents the base class of all rewrite patterns within MLIR, and is comprised +of the following components: + +### Benefit + +This is the expected benefit of applying a given pattern. This benefit is static +upon construction of the pattern, but may be computed dynamically at pattern +initialization time, e.g. allowing the benefit to be derived from domain +specific information (like the target architecture). 
This limitation allows for +performing pattern fusion and compiling patterns into an efficient state +machine, and +[Thier, Ertl, and Krall](https://dl.acm.org/citation.cfm?id=3179501) have shown +that match predicates eliminate the need for dynamically computed costs in +almost all cases: you can simply instantiate the same pattern one time for each +possible cost and use the predicate to guard the match. + +### Root Operation Name (Optional) + +The name of the root operation that this pattern matches against. If specified, +only operations with the given root name will be provided to the `match` and +`rewrite` implementation. If not specified, any operation type may be provided. +The root operation name should be provided whenever possible, because it +simplifies the analysis of patterns when applying a cost model. To match any +operation type, a special tag must be provided to make the intent explicit: +`MatchAnyOpTypeTag`. + +### `match` and `rewrite` implementation + +This is the chunk of code that matches a given root `Operation` and performs a +rewrite of the IR. A `RewritePattern` can specify this implementation either via +separate `match` and `rewrite` methods, or via a combined `matchAndRewrite` +method. When using the combined `matchAndRewrite` method, no IR mutation should +take place before the match is deemed successful. The combined `matchAndRewrite` +is useful when non-trivially recomputable information is required by the +matching and rewriting phase. See below for examples: + +```c++ +class MyPattern : public RewritePattern { +public: + /// This overload constructs a pattern that only matches operations with the + /// root name of `MyOp`. + MyPattern(PatternBenefit benefit, MLIRContext *context) + : RewritePattern(MyOp::getOperationName(), benefit, context) {} + /// This overload constructs a pattern that matches any operation type. 
+ MyPattern(PatternBenefit benefit) + : RewritePattern(benefit, MatchAnyOpTypeTag()) {} + + /// In this section, the `match` and `rewrite` implementation is specified + /// using the separate hooks. + LogicalResult match(Operation *op) const override { + // The `match` method returns `success()` if the pattern is a match, failure + // otherwise. + // ... + } + void rewrite(Operation *op, PatternRewriter &rewriter) { + // The `rewrite` method performs mutations on the IR rooted at `op` using + // the provided rewriter. All mutations must go through the provided + // rewriter. + } + + /// In this section, the `match` and `rewrite` implementation is specified + /// using a single hook. + LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) { + // The `matchAndRewrite` method performs both the matching and the mutation. + // Note that the match must reach a successful point before IR mutation may + // take place. + } +}; +``` + +#### Restrictions + +Within the `match` section of a pattern, the following constraints apply: + +* No mutation of the IR is allowed. + +Within the `rewrite` section of a pattern, the following constraints apply: + +* All IR mutations, including creation, *must* be performed by the given + `PatternRewriter`. This class provides hooks for performing all of the + possible mutations that may take place within a pattern. For example, this + means that an operation should not be erased via its `erase` method. To + erase an operation, the appropriate `PatternRewriter` hook (in this case + `eraseOp`) should be used instead. +* The root operation is required to either be: updated in-place, replaced, or + erased. + +### Pattern Rewriter + +A `PatternRewriter` is a special class that allows for a pattern to communicate +with the driver of pattern application. As noted above, *all* IR mutations, +including creations, are required to be performed via the `PatternRewriter` +class. 
This is required because the underlying pattern driver may have state +that would be invalidated when a mutation takes place. Examples of some of the +more prevalent `PatternRewriter` API is shown below, please refer to the +[class documentation](https://github.com/llvm/llvm-project/blob/master/mlir/include/mlir/IR/PatternMatch.h#L235) +for a more up-to-date listing of the available API: + +* Erase an Operation : `eraseOp` + +This method erases an operation that either has no results, or whose results are +all known to have no uses. + +* Notify why a `match` failed : `notifyMatchFailure` + +This method allows for providing a diagnostic message within a `matchAndRewrite` +as to why a pattern failed to match. How this message is displayed back to the +user is determined by the specific pattern driver. + +* Replace an Operation : `replaceOp`/`replaceOpWithNewOp` + +This method replaces an operation's results with a set of provided values, and +erases the operation. + +* Update an Operation in-place : `(start|cancel|finalize)RootUpdate` + +This is a collection of methods that provide a transaction-like API for updating +the attributes, location, operands, or successors of an operation in-place +within a pattern. An in-place update transaction is started with +`startRootUpdate`, and may either be canceled or finalized with +`cancelRootUpdate` and `finalizeRootUpdate` respectively. A convenience wrapper, +`updateRootInPlace`, is provided that wraps a `start` and `finalize` around a +callback. + +* OpBuilder API + +The `PatternRewriter` inherits from the `OpBuilder` class, and thus provides all +of the same functionality present within an `OpBuilder`. This includes operation +creation, as well as many useful attribute and type construction methods. + +## Pattern Application + +After a set of patterns have been defined, they are collected and provided to a +specific driver for application. 
A driver consists of several high-level parts:
+
+* Input `OwningRewritePatternList`
+
+The input patterns to a driver are provided in the form of an
+`OwningRewritePatternList`. This class provides a simplified API for building a
+list of patterns.
+
+* Driver-specific `PatternRewriter`
+
+To ensure that the driver state does not become invalidated by IR mutations
+within the pattern rewriters, a driver must provide a `PatternRewriter` instance
+with the necessary hooks overridden. If a driver does not need to hook into
+certain mutations, a default implementation is provided that will perform the
+mutation directly.
+
+* Pattern Application and Cost Model
+
+Each driver is responsible for defining its own operation visitation order as
+well as pattern cost model, but the final application is performed via a
+`PatternApplicator` class. This class takes as input the
+`OwningRewritePatternList` and transforms the patterns based upon a provided
+cost model. This cost model computes a final benefit for a given rewrite
+pattern, using whatever driver specific information necessary. After a cost
+model has been computed, the driver may begin to match patterns against
+operations using `PatternApplicator::matchAndRewrite`.
+
+An example is shown below:
+
+```c++
+class MyPattern : public RewritePattern {
+public:
+  MyPattern(PatternBenefit benefit, MLIRContext *context)
+      : RewritePattern(MyOp::getOperationName(), benefit, context) {}
+};
+
+/// Populate the pattern list.
+void collectMyPatterns(OwningRewritePatternList &patterns, MLIRContext *ctx) {
+  patterns.insert<MyPattern>(/*benefit=*/1, ctx);
+}
+
+/// Define a custom PatternRewriter for use by the driver.
+class MyPatternRewriter : public PatternRewriter {
+public:
+  MyPatternRewriter(MLIRContext *ctx) : PatternRewriter(ctx) {}
+
+  /// Override the necessary PatternRewriter hooks here.
+};
+
+/// Apply the custom driver to `op`. 
+void applyMyPatternDriver(Operation *op,
+                          const OwningRewritePatternList &patterns) {
+  // Initialize the custom PatternRewriter.
+  MyPatternRewriter rewriter(op->getContext());
+
+  // Create the applicator and apply our cost model.
+  PatternApplicator applicator(patterns);
+  applicator.applyCostModel([](const RewritePattern &pattern) {
+    // Apply a default cost model.
+    // Note: This is just for demonstration, if the default cost model is truly
+    //       desired `applicator.applyDefaultCostModel()` should be used
+    //       instead.
+    return pattern.getBenefit();
+  });
+
+  // Try to match and apply a pattern.
+  LogicalResult result = applicator.matchAndRewrite(op, rewriter);
+  if (failed(result)) {
+    // ... No patterns were applied.
+  }
+  // ... A pattern was successfully applied.
+}
+```
+
+## Common Pattern Drivers
+
+MLIR provides several common pattern drivers that serve a variety of different
+use cases.
+
+### Dialect Conversion Driver
+
+This driver provides a framework in which to perform operation conversions
+between, and within dialects using a concept of "legality". This framework
+allows for transforming illegal operations to those supported by a provided
+conversion target, via a set of pattern-based operation rewriting patterns. This
+framework also provides support for type conversions. More information on this
+driver can be found [here](DialectConversion.md).
+
+### Greedy Pattern Rewrite Driver
+
+This driver performs a post order traversal over the provided operations and
+greedily applies the patterns that locally have the most benefit. The benefit of
+a pattern is decided solely by the benefit specified on the pattern, and the
+relative order of the pattern within the pattern list (when two patterns have
+the same local benefit). Patterns are iteratively applied to operations until a
+fixed point is reached, at which point the driver finishes. This driver may be
+used via the following: `applyPatternsAndFoldGreedily` and
+`applyOpPatternsAndFold`. 
The latter of which only applies patterns to the +provided operation, and will not traverse the IR. + +Note: This driver is the one used by the [canonicalization](Canonicalization.md) +[pass](Passes.md#-canonicalize-canonicalize-operations) in MLIR. diff --git a/mlir/docs/Rationale/MLIRForGraphAlgorithms.md b/mlir/docs/Rationale/MLIRForGraphAlgorithms.md index ac26e5beb9b938..8bd2d9ce8f3543 100644 --- a/mlir/docs/Rationale/MLIRForGraphAlgorithms.md +++ b/mlir/docs/Rationale/MLIRForGraphAlgorithms.md @@ -254,7 +254,7 @@ and the API is easier to work with from an ergonomics perspective. ### Unified Graph Rewriting Infrastructure This is still a work in progress, but we have sightlines towards a -[general rewriting infrastructure](GenericDAGRewriter.md) for transforming DAG +[general rewriting infrastructure](RationaleGenericDAGRewriter.md) for transforming DAG tiles into other DAG tiles, using a declarative pattern format. DAG to DAG rewriting is a generalized solution for many common compiler optimizations, lowerings, and other rewrites and having an IR enables us to invest in building diff --git a/mlir/docs/Rationale/RationaleGenericDAGRewriter.md b/mlir/docs/Rationale/RationaleGenericDAGRewriter.md new file mode 100644 index 00000000000000..289750bdb4abde --- /dev/null +++ b/mlir/docs/Rationale/RationaleGenericDAGRewriter.md @@ -0,0 +1,286 @@ +# Generic DAG Rewriter Infrastructure Rationale + +This document details the rationale behind a general DAG-to-DAG rewrite +infrastructure for MLIR. For up-to-date documentation on the user facing API, +please look at the main [Pattern Rewriting document](../PatternRewriter.md). + +## Introduction and Motivation + +The goal of a compiler IR is to represent code - at various levels of +abstraction which pose different sets of tradeoffs in terms of representational +capabilities and ease of transformation. 
However, the ability to represent code
+is not itself very useful - you also need to be able to implement those
+transformations.
+
+There are many different types of compiler transformations, but this document
+focuses on a particularly important class of transformation that comes up
+repeatedly at scale, and is important for the goals of MLIR: matching one DAG of
+operations, and replacing with another. This is an integral part of many
+compilers and necessary for peephole optimizations like "eliminate identity
+nodes" or "replace x+0 with x", a generalized canonicalization framework (e.g.
+Instruction Combiner in LLVM), as well as a useful abstraction to implement
+optimization algorithms for IR at multiple levels.
+
+A particular strength of MLIR (and a major difference vs other compiler
+infrastructures like LLVM, GCC, XLA, TensorFlow, etc) is that it uses a single
+compiler IR to represent code at multiple levels of abstraction: an MLIR
+operation can be a "TensorFlow operation", an "XLA HLO", an Affine Loop Nest, an
+LLVM IR instruction (transitively including X86, Lanai, PTX, and other target
+specific instructions), or anything else that the MLIR operation system can
+reasonably express. Given that MLIR spans such a wide range of different problem
+scopes, a single infrastructure for performing graph-to-graph rewrites can help
+solve many diverse domain challenges.
+
+[Static single assignment](https://en.wikipedia.org/wiki/Static_single_assignment_form)
+(SSA) representations like MLIR make it easy to access the operands and "users"
+of an operation. As such, a natural abstraction for these graph-to-graph
+rewrites is that of DAG pattern matching: clients define DAG tile patterns
+(where a tile is a sequence of operations defining a subgraph of the DAG), and
+each pattern includes a result DAG to produce and the cost of the result (or,
+inversely, the benefit of doing the replacement). 
A common infrastructure +efficiently finds and performs the rewrites. + +While this concept is simple, the details are more nuanced. This document +defines and explores a set of abstractions that can solve a wide range of +different problems, and be applied to many different sorts of problems that MLIR +is - and is expected to - face over time. We do this by separating the pattern +application algorithm from the "driver" of the computation loop, and make space +for the patterns to be defined declaratively. + +### Constant folding + +A degenerate but pervasive case of DAG-to-DAG pattern matching is constant +folding: an operation whose operands contain constants can often be folded to a +result constant value. + +MLIR operations may override a +[`fold`](../Canonicalization.md/#canonicalizing-with-fold) routine, which +exposes a simpler API compared to a general DAG-to-DAG pattern matcher, and +allows for it to be applicable in cases that a generic matcher would not. For +example, a DAG-rewrite can remove arbitrary nodes in the current function, which +could invalidate iterators. Constant folding as an API does not remove any +nodes, it just provides a (list of) constant values and allows the clients to +update their data structures as necessary. + +## Related Work + +There is a huge amount of related work to consider, given that nearly every +compiler in existence has to solve this problem many times over. One unifying +problem is that all of these systems are designed to solve one particular, and +usually, narrow problem: MLIR on the other hand would like to solve many of +these problems within a single infrastructure. Here are a few related graph +rewrite systems, along with the pros and cons of their work (The most similar +design to the infrastructure present in MLIR is the LLVM DAG-to-DAG instruction +selection algorithm). 
+ +### AST-Level Pattern Matchers + +The literature is full of source-to-source translators which transform +identities in order to improve performance (e.g. transforming `X*0` into `0`). +One large example is the GCC `fold` function, which performs +[many optimizations](https://github.com/gcc-mirror/gcc/blob/master/gcc/fold-const.c) +on ASTs. Clang has +[similar routines](https://clang.llvm.org/docs/InternalsManual.html#constant-folding-in-the-clang-ast) +for simple constant folding of expressions (as required by the C++ standard) but +doesn't perform general optimizations on its ASTs. + +The primary downside of AST optimizers is that you can't see across operations +that have multiple uses. It is +[well known in literature](https://llvm.org/pubs/2008-06-LCTES-ISelUsingSSAGraphs.pdf) +that DAG pattern matching is more powerful than tree pattern matching, but on +the other hand, DAG pattern matching can lead to duplication of computation +which needs to be checked for. + +### "Combiners" and other peephole optimizers + +Compilers end up with a lot of peephole optimizers for various things, e.g. the +GCC +["combine" routines](https://github.com/gcc-mirror/gcc/blob/master/gcc/combine.c) +(which try to merge two machine instructions into a single one), the LLVM +[Inst Combine](https://github.com/llvm/llvm-project/tree/master/llvm/lib/Transforms/InstCombine) +[pass](https://llvm.org/docs/Passes.html#instcombine-combine-redundant-instructions), +LLVM's +[DAG Combiner](https://github.com/llvm-mirror/llvm/blob/master/lib/CodeGen/SelectionDAG/DAGCombiner.cpp), +the Swift compiler's +[SIL Combiner](https://github.com/apple/swift/tree/master/lib/SILOptimizer/SILCombiner), +etc. These generally match one or more operations and produce zero or more +operations as a result. The LLVM +[Legalization](https://github.com/llvm/llvm-project/tree/master/llvm/lib/CodeGen/SelectionDAG) +infrastructure has a different outer loop but otherwise works the same way. 
+ +These passes have a lot of diversity, but also have a unifying structure: they +mostly have a worklist outer loop which visits operations. They then use a +visitor pattern (or equivalent) to switch over the class of operation and +dispatch to a method. That method contains a long list of hand-written C++ code +that pattern-matches various special cases. LLVM introduced a "match" function +that allows writing patterns in a somewhat more declarative style using template +metaprogramming (MLIR has similar facilities). Here's a simple example: + +```c++ + // Y - (X + 1) --> ~X + Y + if (match(Op1, m_OneUse(m_Add(m_Value(X), m_One())))) + return BinaryOperator::CreateAdd(Builder.CreateNot(X), Op0); +``` + +Here is a somewhat more complicated one (this is not the biggest or most +complicated :) + +```c++ + // C2 is ODD + // LHS = XOR(Y,C1), Y = AND(Z,C2), C1==(C2+1) => LHS == NEG(OR(Z, ~C2)) + // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2)) + if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1)))) + if (C1->countTrailingZeros() == 0) + if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) { + Value NewOr = Builder.CreateOr(Z, ~(*C2)); + return Builder.CreateSub(RHS, NewOr, "sub"); + } +``` + +These systems are simple to set up, and pattern matching templates have some +advantages (they are extensible for new sorts of sub-patterns, look compact at +point of use). On the other hand, they have lots of well known problems, for +example: + +* These patterns are very error prone to write, and contain lots of + redundancies. +* The IR being matched often has identities (e.g. when matching commutative + operators) and the C++ code has to handle it manually - take a look at + [the full code](https://github.com/llvm/llvm-project/blob/c0b5000bd848303320c03f80fbf84d71e74518c9/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp#L767) + for `checkForNegativeOperand` that defines the second pattern). 
+* The matching code compiles slowly, both because it generates tons of code + and because the templates instantiate slowly. +* Adding new patterns (e.g. for count leading zeros in the example above) is + awkward and doesn't often happen. +* The cost model for these patterns is not really defined - it is emergent + based on the order the patterns are matched in code. +* They are non-extensible without rebuilding the compiler. +* It isn't practical to apply theorem provers and other tools to these + patterns - they cannot be reused for other purposes. + +In addition to structured "combiners" like these, there are lots of ad-hoc +systems like the +[LLVM Machine code peephole optimizer](http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp?view=markup) +which are related. + +### LLVM's DAG-to-DAG Instruction Selection Infrastructure + +The instruction selection subsystem in LLVM is the result of many years worth of +iteration and discovery, driven by the need for LLVM to support code generation +for lots of targets, the complexity of code generators for modern instruction +sets (e.g. X86), and the fanatical pursuit of reusing code across targets. Eli +Bendersky wrote a +[nice short overview](https://eli.thegreenplace.net/2013/02/25/a-deeper-look-into-the-llvm-code-generator-part-1) +of how this works, and the +[LLVM documentation](https://llvm.org/docs/CodeGenerator.html#select-instructions-from-dag) +describes it in more depth including its advantages and limitations. It allows +writing patterns like this. 
+ +``` +def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), + (BLCI64rr GR64:$src)>; +``` + +This example defines a matcher for the +["blci" instruction](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#TBM_\(Trailing_Bit_Manipulation\)) +in the +[X86 target description](https://github.com/llvm/llvm-project/blob/master/llvm/lib/Target/X86/X86InstrInfo.td), +there are many others in that file (look for `Pat<>` patterns, since they aren't +entangled in details of the compiler like assembler/disassembler generation +logic). + +For the purposes of MLIR, there is much to like about this system, for example: + +* It is defined in a declarative format. +* It is extensible to target-defined operations. +* It automates matching across identities, like commutative patterns. +* It allows custom abstractions and intense factoring of target-specific + commonalities. +* It generates compact code - it compiles into a state machine, which is + interpreted. +* It allows the instruction patterns to be defined and reused for multiple + purposes. +* The patterns are "type checked" at compile time, detecting lots of bugs + early and eliminating redundancy from the pattern specifications. +* It allows the use of general C++ code for weird/complex cases. + +While there is a lot that is good here, there are also a few undesirable bits: + +* The representation is specifically designed and only applicable for + instruction selection, meaning that the directly adjacent problems like the + DAGCombiner and Legalizer can't use it. +* This isn't extensible at compiler runtime, you have to rebuild the compiler + to extend it. +* The error messages when failing to match a pattern + [are not exactly optimal](https://www.google.com/search?q=llvm+cannot+select). +* It has lots of implementation problems and limitations (e.g. 
can't write a
+  pattern for a multi-result operation) as a result of working with the
+  awkward SelectionDAG representation and being designed and implemented on
+  demand.
+* Organic growth over time has left lots of sharp edges.
+
+### Summary
+
+MLIR faces a wide range of pattern matching and graph rewrite problems, and one
+of the major advantages of having a common representation for code at multiple
+levels is that it allows for investing in - and highly leveraging - a single
+infrastructure for doing this sort of work.
+
+## Goals
+
+We'd like this to encompass many problems in the MLIR space, including 1-to-N
+expansions (e.g. such as in type legalization during instruction selection when
+an add of one bit width may be split into multiple adds of a smaller bit width),
+M-to-1 patterns (e.g. when converting a multiply+add into a single muladd
+operation), as well as general M-to-N patterns (e.g. instruction selection for
+target instructions). Patterns have a benefit associated with them, and the
+common infrastructure should be responsible for sorting out the highest benefit
+match for a given application.
+
+We separate the task of picking a particular optimal pattern from a given root
+node, the algorithm used to rewrite an entire graph given a particular set of
+goals, and the definition of the patterns themselves. We do this because DAG
+tile pattern matching is NP complete. Additionally, we would like to support
+iterative rewrite algorithms that progressively transform the input program
+through multiple steps. Furthermore, we would like to support many different
+sorts of clients across the MLIR stack, and they may have different tolerances
+for compile time cost, different demands for optimality, and other algorithmic
+goals or constraints.
+
+We aim for MLIR transformations to be easy to implement and reduce the
+likelihood for compiler bugs. 
We expect there to be a very large number of +patterns that are defined over time, and we believe that these sorts of patterns +will have a very large number of legality/validity constraints - many of which +are difficult to reason about in a consistent way, may be target specific, and +whose implementation may be particularly bug-prone. As such, we aim to design +the API around pattern definition to be simple, resilient to programmer errors, +and allow separation of concerns between the legality of the nodes generated +from the idea of the pattern being defined. + +Finally, error handling is a topmost concern, we want pattern match failures to +be diagnosable in a reasonable way. This is a difficult problem in general, as +the space of malfunction is too great to be fully enumerated and handled +optimally, but MLIR is already designed to represent the provenance of an +operation well. The aim of the pattern rewriting infrastructure is simply to +propagate that provenance information precisely, as well as diagnose pattern +match failures with the rationale for why a set of patterns do not apply. + +### Non goals + +The pattern infrastructure does not aim to solve all compiler problems, it is +simply a DAG-to-DAG pattern matching system. Compiler algorithms that require +global dataflow analysis (e.g. common subexpression elimination, conditional +constant propagation, and many many others) will not be directly solved by this +infrastructure. + +This infrastructure is limited to DAG patterns, which (by definition) prevent +the patterns from seeing across cycles in a graph. In an SSA-based IR like MLIR, +this means that these patterns don't see across basic block arguments. We +consider this acceptable given the set of problems we are trying to solve - we +don't know of any other system that attempts to do so, and consider the payoff +of worrying about this to be low. 
+ +This design includes the ability for DAG patterns to have associated benefits, +but those benefits are defined in terms of magic numbers (typically equal to the +number of nodes being replaced). For any given application, the units of magic +numbers will have to be defined. diff --git a/mlir/docs/Tutorials/Toy/Ch-3.md b/mlir/docs/Tutorials/Toy/Ch-3.md index 5353b58acddf8d..7976d7c30db599 100644 --- a/mlir/docs/Tutorials/Toy/Ch-3.md +++ b/mlir/docs/Tutorials/Toy/Ch-3.md @@ -13,7 +13,7 @@ We divide compiler transformations into two categories: local and global. In this chapter, we focus on how to leverage the Toy Dialect and its high-level semantics to perform local pattern-match transformations that would be difficult in LLVM. For this, we use MLIR's -[Generic DAG Rewriter](../../GenericDAGRewriter.md). +[Generic DAG Rewriter](../../PatternRewriter.md). There are two methods that can be used to implement pattern-match transformations: 1. Imperative, C++ pattern-match and rewrite 2. Declarative, From c2807b2e56c05080354818c221ed4a35abd8a5c8 Mon Sep 17 00:00:00 2001 From: Alex Lorenz Date: Thu, 13 Aug 2020 12:05:57 -0700 Subject: [PATCH 19/23] [darwin][driver] fix isMacosxVersionLT minimum supported OS version check The previous Driver's triple check only worked for -target, but not for -arch -mmacosx-version-min invocations --- clang/lib/Driver/ToolChains/Darwin.h | 6 +++++- .../macos-apple-silicon-slice-link-libs-darwin-only.cpp | 6 ++++++ clang/test/Driver/macos-apple-silicon-slice-link-libs.cpp | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 clang/test/Driver/macos-apple-silicon-slice-link-libs-darwin-only.cpp diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h index 64c252efea7df1..e67b2c5c87cd75 100644 --- a/clang/lib/Driver/ToolChains/Darwin.h +++ b/clang/lib/Driver/ToolChains/Darwin.h @@ -436,7 +436,11 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public MachO { bool isMacosxVersionLT(unsigned V0, 
unsigned V1 = 0, unsigned V2 = 0) const { assert(isTargetMacOS() && getTriple().isMacOSX() && "Unexpected call for non OS X target!"); - VersionTuple MinVers = getTriple().getMinimumSupportedOSVersion(); + // The effective triple might not be initialized yet, so construct a + // pseudo-effective triple to get the minimum supported OS version. + VersionTuple MinVers = + llvm::Triple(getTriple().getArchName(), "apple", "macos") + .getMinimumSupportedOSVersion(); return (!MinVers.empty() && MinVers > TargetVersion ? MinVers : TargetVersion) < VersionTuple(V0, V1, V2); diff --git a/clang/test/Driver/macos-apple-silicon-slice-link-libs-darwin-only.cpp b/clang/test/Driver/macos-apple-silicon-slice-link-libs-darwin-only.cpp new file mode 100644 index 00000000000000..ec3b710c4da8cc --- /dev/null +++ b/clang/test/Driver/macos-apple-silicon-slice-link-libs-darwin-only.cpp @@ -0,0 +1,6 @@ +// RUN: %clang -### -arch arm64 -mmacosx-version-min=10.7 %s 2>&1 | FileCheck -check-prefix=ARM64-10_7 %s +// RUN: %clang -### -arch x86_64 -mmacosx-version-min=10.7 %s 2>&1 | FileCheck -check-prefix=x86_64-10_7 %s +// REQUIRES: system-darwin + +// ARM64-10_7-NOT: -lcrt1.10.6.o +// x86_64-10_7: -lcrt1.10.6.o diff --git a/clang/test/Driver/macos-apple-silicon-slice-link-libs.cpp b/clang/test/Driver/macos-apple-silicon-slice-link-libs.cpp index 522fda34987e9d..4a2a029c736fc9 100644 --- a/clang/test/Driver/macos-apple-silicon-slice-link-libs.cpp +++ b/clang/test/Driver/macos-apple-silicon-slice-link-libs.cpp @@ -1,5 +1,6 @@ // RUN: %clang -### -target arm64-apple-macos10.7 %s 2>&1 | FileCheck -check-prefix=ARM64-10_7 %s // RUN: %clang -### -target x86_64-apple-macos10.7 %s 2>&1 | FileCheck -check-prefix=x86_64-10_7 %s +// RUN: %clang -### -target arm64-apple-darwin6 %s 2>&1 | FileCheck -check-prefix=ARM64-10_7 %s // RUN: %clang -### -target arm64-apple-macos10.5 %s 2>&1 | FileCheck -check-prefix=ARM64-10_5 %s // RUN: %clang -### -target x86_64-apple-macos10.5 %s 2>&1 | FileCheck 
-check-prefix=x86_64-10_5 %s From 5bcd32b7449482d6079d968c08d07a3890c86912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 11 Aug 2020 13:39:54 +0200 Subject: [PATCH 20/23] [ORC][NFC] Fix typo in comment --- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index ffe156dcdd62cc..531a71d50b9eca 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -928,7 +928,7 @@ LLJIT::PlatformSupport::~PlatformSupport() {} Error LLJITBuilderState::prepareForConstruction() { - LLVM_DEBUG(dbgs() << "Preparing to create LLIT instance...\n"); + LLVM_DEBUG(dbgs() << "Preparing to create LLJIT instance...\n"); if (!JTMB) { LLVM_DEBUG({ From 5092039644eac33bdd9e494045a9f99654f792b3 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 13 Aug 2020 15:43:43 -0400 Subject: [PATCH 21/23] [gn build] (manually) port d650cbc349ccc4 --- .../gn/secondary/llvm/lib/InterfaceStub/BUILD.gn | 13 +++++++++++++ llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn | 2 -- .../gn/secondary/llvm/tools/llvm-elfabi/BUILD.gn | 2 +- llvm/utils/gn/secondary/llvm/unittests/BUILD.gn | 1 + .../secondary/llvm/unittests/InterfaceStub/BUILD.gn | 9 +++++++++ .../gn/secondary/llvm/unittests/TextAPI/BUILD.gn | 1 - 6 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn create mode 100644 llvm/utils/gn/secondary/llvm/unittests/InterfaceStub/BUILD.gn diff --git a/llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn new file mode 100644 index 00000000000000..0d157bdd6751f0 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn @@ -0,0 +1,13 @@ +static_library("InterfaceStub") { + output_name = "LLVMInterfaceStub" + deps = [ + "//llvm/lib/Bitstream/Reader", + 
"//llvm/lib/Support", + ] + + sources = [ + "ELFObjHandler.cpp", + "ELFStub.cpp", + "TBEHandler.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn index aa695da4bf30d1..6df06f941d4a99 100644 --- a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn @@ -6,8 +6,6 @@ static_library("TextAPI") { ] include_dirs = [ "." ] sources = [ - "ELF/ELFStub.cpp", - "ELF/TBEHandler.cpp", "MachO/Architecture.cpp", "MachO/ArchitectureSet.cpp", "MachO/InterfaceFile.cpp", diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-elfabi/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-elfabi/BUILD.gn index dd12e20a924e30..0b3cb1ec41ba2d 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-elfabi/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-elfabi/BUILD.gn @@ -1,11 +1,11 @@ executable("llvm-elfabi") { deps = [ + "//llvm/lib/InterfaceStub", "//llvm/lib/Object", "//llvm/lib/Support", "//llvm/lib/TextAPI", ] sources = [ - "ELFObjHandler.cpp", "ErrorCollector.cpp", "llvm-elfabi.cpp", ] diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn index 3d960d501e4255..5a235671e5c853 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn @@ -23,6 +23,7 @@ group("unittests") { "Frontend:LLVMFrontendTests", "FuzzMutate:FuzzMutateTests", "IR:IRTests", + "InterfaceStub:InterfaceStubTests", "LineEditor:LineEditorTests", "Linker:LinkerTests", "MC:MCTests", diff --git a/llvm/utils/gn/secondary/llvm/unittests/InterfaceStub/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/InterfaceStub/BUILD.gn new file mode 100644 index 00000000000000..cb508882778c09 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/unittests/InterfaceStub/BUILD.gn @@ -0,0 +1,9 @@ +import("//llvm/utils/unittest/unittest.gni") + +unittest("InterfaceStubTests") { + deps = [ + 
"//llvm/lib/InterfaceStub", + "//llvm/lib/Testing/Support", + ] + sources = [ "ELFYAMLTest.cpp" ] +} diff --git a/llvm/utils/gn/secondary/llvm/unittests/TextAPI/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/TextAPI/BUILD.gn index 8e51943b7e2631..0ebeb4f773bbc3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/TextAPI/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/TextAPI/BUILD.gn @@ -6,7 +6,6 @@ unittest("TextAPITests") { "//llvm/lib/TextAPI", ] sources = [ - "ELFYAMLTest.cpp", "TextStubV1Tests.cpp", "TextStubV2Tests.cpp", "TextStubV3Tests.cpp", From 661d83aa386fd2c2df769a2509f7c1bd5ba99a7b Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 13 Aug 2020 15:48:03 -0400 Subject: [PATCH 22/23] [gn build] (manually) port d650cbc349ccc4 better --- llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn index 0d157bdd6751f0..4778cb590bfa68 100644 --- a/llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/InterfaceStub/BUILD.gn @@ -1,7 +1,7 @@ static_library("InterfaceStub") { output_name = "LLVMInterfaceStub" deps = [ - "//llvm/lib/Bitstream/Reader", + "//llvm/lib/Object", "//llvm/lib/Support", ] From 21810b0e14287a7b885a7822c6e19609e3b902c8 Mon Sep 17 00:00:00 2001 From: Cameron McInally Date: Thu, 13 Aug 2020 14:47:34 -0500 Subject: [PATCH 23/23] [SVE] Lower fixed length vector integer UMIN/UMAX Differential Revision: https://reviews.llvm.org/D85926 --- .../Target/AArch64/AArch64ISelLowering.cpp | 12 +- .../AArch64/sve-fixed-length-int-minmax.ll | 796 ++++++++++++++++++ 2 files changed, 806 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6f3475e5f97dfe..50da2bf1b71337 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1001,6 +1001,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::v2i64, Custom); setOperationAction(ISD::SMIN, MVT::v1i64, Custom); setOperationAction(ISD::SMIN, MVT::v2i64, Custom); + setOperationAction(ISD::UMAX, MVT::v1i64, Custom); + setOperationAction(ISD::UMAX, MVT::v2i64, Custom); + setOperationAction(ISD::UMIN, MVT::v1i64, Custom); + setOperationAction(ISD::UMIN, MVT::v2i64, Custom); } } @@ -1121,6 +1125,8 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::STORE, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::UMAX, VT, Custom); + setOperationAction(ISD::UMIN, VT, Custom); setOperationAction(ISD::XOR, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); } @@ -3634,12 +3640,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED, /*OverrideNEON=*/true); case ISD::UMIN: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED, + /*OverrideNEON=*/true); case ISD::SMAX: return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED, /*OverrideNEON=*/true); case ISD::UMAX: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED); + return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED, + /*OverrideNEON=*/true); case ISD::SRA: case ISD::SRL: case ISD::SHL: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll index 6c377786250743..cc9e172de5f889 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-minmax.ll @@ -765,6 +765,751 @@ define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { ret void } +; +; UMAX +; + +; Don't use SVE for 64-bit vectors. 
+define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: umax_v8i8: +; CHECK: umax v0.8b, v0.8b, v1.8b +; CHECK: ret + %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2) + ret <8 x i8> %res +} + +; Don't use SVE for 128-bit vectors. +define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: umax_v16i8: +; CHECK: umax v0.16b, v0.16b, v1.16b +; CHECK: ret + %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2) + ret <16 x i8> %res +} + +define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: umax_v32i8: +; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 +; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; CHECK-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2) + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @umax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: umax_v64i8: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 +; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret +; +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 +; VBITS_EQ_256-DAG: mov w[[A:[0-9]+]], #32 +; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A]]] +; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1, x[[A]]] +; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b +; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b +; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0, x[[A]]] +; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0] +; VBITS_EQ_256-NEXT: ret + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %op1, <64 x i8> %op2) + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; CHECK-LABEL: umax_v128i8: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128 +; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %res = call <128 x i8> @llvm.umax.v128i8(<128 x i8> %op1, <128 x i8> %op2) + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; CHECK-LABEL: umax_v256i8: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256 +; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <256 x 
i8>, <256 x i8>* %a + %op2 = load <256 x i8>, <256 x i8>* %b + %res = call <256 x i8> @llvm.umax.v256i8(<256 x i8> %op1, <256 x i8> %op2) + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: umax_v4i16: +; CHECK: umax v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2) + ret <4 x i16> %res +} + +; Don't use SVE for 128-bit vectors. +define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: umax_v8i16: +; CHECK: umax v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2) + ret <8 x i16> %res +} + +define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: umax_v16i16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2) + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @umax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: umax_v32i16: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h +; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h +; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %op1, <32 x i16> %op2) + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; CHECK-LABEL: umax_v64i16: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %res = call <64 x i16> @llvm.umax.v64i16(<64 x i16> %op1, <64 x i16> %op2) + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; CHECK-LABEL: umax_v128i16: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_2048-NEXT: st1h { 
[[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %res = call <128 x i16> @llvm.umax.v128i16(<128 x i16> %op1, <128 x i16> %op2) + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: umax_v2i32: +; CHECK: umax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2) + ret <2 x i32> %res +} + +; Don't use SVE for 128-bit vectors. +define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: umax_v4i32: +; CHECK: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2) + ret <4 x i32> %res +} + +define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: umax_v8i32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 +; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; CHECK-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2) + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @umax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: umax_v16i32: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s +; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %op1, <16 x i32> %op2) + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; CHECK-LABEL: umax_v32i32: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %res = call <32 x i32> @llvm.umax.v32i32(<32 x i32> %op1, <32 x i32> %op2) + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; CHECK-LABEL: umax_v64i32: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, 
[[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %res = call <64 x i32> @llvm.umax.v64i32(<64 x i32> %op1, <64 x i32> %op2) + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +; Vector i64 max are not legal for NEON so use SVE when available. +define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: umax_v1i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 +; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2) + ret <1 x i64> %res +} + +; Vector i64 max are not legal for NEON so use SVE when available. +define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: umax_v2i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 +; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2) + ret <2 x i64> %res +} + +define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: umax_v4i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; CHECK-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2) + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @umax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: umax_v8i64: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret + +; Ensure 
sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d +; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %op1, <8 x i64> %op2) + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; CHECK-LABEL: umax_v16i64: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <16 x i64>, <16 x i64>* %a + %op2 = load <16 x i64>, <16 x i64>* %b + %res = call <16 x i64> @llvm.umax.v16i64(<16 x i64> %op1, <16 x i64> %op2) + store <16 x i64> %res, <16 x i64>* %a + ret void +} + +define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; CHECK-LABEL: umax_v32i64: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_2048-NEXT: 
st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %res = call <32 x i64> @llvm.umax.v32i64(<32 x i64> %op1, <32 x i64> %op2) + store <32 x i64> %res, <32 x i64>* %a + ret void +} + +; +; UMIN +; + +; Don't use SVE for 64-bit vectors. +define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { +; CHECK-LABEL: umin_v8i8: +; CHECK: umin v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2) + ret <8 x i8> %res +} + +; Don't use SVE for 128-bit vectors. +define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { +; CHECK-LABEL: umin_v16i8: +; CHECK: umin v0.16b, v0.16b, v1.16b +; CHECK: ret + %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2) + ret <16 x i8> %res +} + +define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { +; CHECK-LABEL: umin_v32i8: +; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 +; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; CHECK-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <32 x i8>, <32 x i8>* %a + %op2 = load <32 x i8>, <32 x i8>* %b + %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2) + store <32 x i8> %res, <32 x i8>* %a + ret void +} + +define void @umin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { +; CHECK-LABEL: umin_v64i8: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 +; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret +; +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 +; VBITS_EQ_256-DAG: mov w[[A:[0-9]+]], #32 +; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A]]] +; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1, x[[A]]] +; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b +; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b +; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0, x[[A]]] +; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0] + %op1 = load <64 x i8>, <64 x i8>* %a + %op2 = load <64 x i8>, <64 x i8>* %b + %res = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %op1, <64 x i8> %op2) + store <64 x i8> %res, <64 x i8>* %a + ret void +} + +define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { +; CHECK-LABEL: umin_v128i8: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128 +; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <128 x i8>, <128 x i8>* %a + %op2 = load <128 x i8>, <128 x i8>* %b + %res = call <128 x i8> @llvm.umin.v128i8(<128 x i8> %op1, <128 x i8> %op2) + store <128 x i8> %res, <128 x i8>* %a + ret void +} + +define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { +; CHECK-LABEL: umin_v256i8: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256 +; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <256 x i8>, <256 x i8>* %a + %op2 
= load <256 x i8>, <256 x i8>* %b + %res = call <256 x i8> @llvm.umin.v256i8(<256 x i8> %op1, <256 x i8> %op2) + store <256 x i8> %res, <256 x i8>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { +; CHECK-LABEL: umin_v4i16: +; CHECK: umin v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2) + ret <4 x i16> %res +} + +; Don't use SVE for 128-bit vectors. +define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { +; CHECK-LABEL: umin_v8i16: +; CHECK: umin v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2) + ret <8 x i16> %res +} + +define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { +; CHECK-LABEL: umin_v16i16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <16 x i16>, <16 x i16>* %a + %op2 = load <16 x i16>, <16 x i16>* %b + %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2) + store <16 x i16> %res, <16 x i16>* %a + ret void +} + +define void @umin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { +; CHECK-LABEL: umin_v32i16: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h +; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h +; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <32 x i16>, <32 x i16>* %a + %op2 = load <32 x i16>, <32 x i16>* %b + %res = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %op1, <32 x i16> %op2) + store <32 x i16> %res, <32 x i16>* %a + ret void +} + +define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { +; CHECK-LABEL: umin_v64i16: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <64 x i16>, <64 x i16>* %a + %op2 = load <64 x i16>, <64 x i16>* %b + %res = call <64 x i16> @llvm.umin.v64i16(<64 x i16> %op1, <64 x i16> %op2) + store <64 x i16> %res, <64 x i16>* %a + ret void +} + +define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { +; CHECK-LABEL: umin_v128i16: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; VBITS_GE_2048-NEXT: st1h { 
[[RES]].h }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <128 x i16>, <128 x i16>* %a + %op2 = load <128 x i16>, <128 x i16>* %b + %res = call <128 x i16> @llvm.umin.v128i16(<128 x i16> %op1, <128 x i16> %op2) + store <128 x i16> %res, <128 x i16>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { +; CHECK-LABEL: umin_v2i32: +; CHECK: umin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2) + ret <2 x i32> %res +} + +; Don't use SVE for 128-bit vectors. +define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { +; CHECK-LABEL: umin_v4i32: +; CHECK: umin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2) + ret <4 x i32> %res +} + +define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { +; CHECK-LABEL: umin_v8i32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 +; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; CHECK-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <8 x i32>, <8 x i32>* %a + %op2 = load <8 x i32>, <8 x i32>* %b + %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2) + store <8 x i32> %res, <8 x i32>* %a + ret void +} + +define void @umin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { +; CHECK-LABEL: umin_v16i32: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation. 
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s +; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s +; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <16 x i32>, <16 x i32>* %a + %op2 = load <16 x i32>, <16 x i32>* %b + %res = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %op1, <16 x i32> %op2) + store <16 x i32> %res, <16 x i32>* %a + ret void +} + +define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { +; CHECK-LABEL: umin_v32i32: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <32 x i32>, <32 x i32>* %a + %op2 = load <32 x i32>, <32 x i32>* %b + %res = call <32 x i32> @llvm.umin.v32i32(<32 x i32> %op1, <32 x i32> %op2) + store <32 x i32> %res, <32 x i32>* %a + ret void +} + +define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { +; CHECK-LABEL: umin_v64i32: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, 
[[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <64 x i32>, <64 x i32>* %a + %op2 = load <64 x i32>, <64 x i32>* %b + %res = call <64 x i32> @llvm.umin.v64i32(<64 x i32> %op1, <64 x i32> %op2) + store <64 x i32> %res, <64 x i32>* %a + ret void +} + +; Vector i64 min are not legal for NEON so use SVE when available. +define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { +; CHECK-LABEL: umin_v1i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 +; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d +; CHECK-NEXT: ret + %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2) + ret <1 x i64> %res +} + +; Vector i64 min are not legal for NEON so use SVE when available. +define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { +; CHECK-LABEL: umin_v2i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 +; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d +; CHECK-NEXT: ret + %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2) + ret <2 x i64> %res +} + +define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { +; CHECK-LABEL: umin_v4i64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; CHECK-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK-NEXT: ret + %op1 = load <4 x i64>, <4 x i64>* %a + %op2 = load <4 x i64>, <4 x i64>* %b + %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2) + store <4 x i64> %res, <4 x i64>* %a + ret void +} + +define void @umin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { +; CHECK-LABEL: umin_v8i64: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512-NEXT: ret + +; Ensure 
sensible type legalisation. +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d +; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d +; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] +; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <8 x i64>, <8 x i64>* %a + %op2 = load <8 x i64>, <8 x i64>* %b + %res = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %op1, <8 x i64> %op2) + store <8 x i64> %res, <8 x i64>* %a + ret void +} + +define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { +; CHECK-LABEL: umin_v16i64: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: ret + %op1 = load <16 x i64>, <16 x i64>* %a + %op2 = load <16 x i64>, <16 x i64>* %b + %res = call <16 x i64> @llvm.umin.v16i64(<16 x i64> %op1, <16 x i64> %op2) + store <16 x i64> %res, <16 x i64>* %a + ret void +} + +define void @umin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { +; CHECK-LABEL: umin_v32i64: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; VBITS_GE_2048-NEXT: 
st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: ret + %op1 = load <32 x i64>, <32 x i64>* %a + %op2 = load <32 x i64>, <32 x i64>* %b + %res = call <32 x i64> @llvm.umin.v32i64(<32 x i64> %op1, <32 x i64> %op2) + store <32 x i64> %res, <32 x i64>* %a + ret void +} + attributes #0 = { "target-features"="+sve" } declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) @@ -816,3 +1561,54 @@ declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>) declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>) declare <16 x i64> @llvm.smax.v16i64(<16 x i64>, <16 x i64>) declare <32 x i64> @llvm.smax.v32i64(<32 x i64>, <32 x i64>) + +declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>) +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>) +declare <128 x i8> @llvm.umin.v128i8(<128 x i8>, <128 x i8>) +declare <256 x i8> @llvm.umin.v256i8(<256 x i8>, <256 x i8>) +declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>) +declare <64 x i16> @llvm.umin.v64i16(<64 x i16>, <64 x i16>) +declare <128 x i16> @llvm.umin.v128i16(<128 x i16>, <128 x i16>) +declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>) +declare <32 x i32> @llvm.umin.v32i32(<32 x i32>, <32 x i32>) +declare <64 x i32> @llvm.umin.v64i32(<64 x i32>, <64 x i32>) +declare <1 x i64> @llvm.umin.v1i64(<1 x i64>, <1 x i64>) +declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>) +declare <16 x i64> @llvm.umin.v16i64(<16 x i64>, <16 
x i64>) +declare <32 x i64> @llvm.umin.v32i64(<32 x i64>, <32 x i64>) + +declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>) +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) +declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>) +declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>) +declare <128 x i8> @llvm.umax.v128i8(<128 x i8>, <128 x i8>) +declare <256 x i8> @llvm.umax.v256i8(<256 x i8>, <256 x i8>) +declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>) +declare <64 x i16> @llvm.umax.v64i16(<64 x i16>, <64 x i16>) +declare <128 x i16> @llvm.umax.v128i16(<128 x i16>, <128 x i16>) +declare <2 x i32> @llvm.umax.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>) +declare <32 x i32> @llvm.umax.v32i32(<32 x i32>, <32 x i32>) +declare <64 x i32> @llvm.umax.v64i32(<64 x i32>, <64 x i32>) +declare <1 x i64> @llvm.umax.v1i64(<1 x i64>, <1 x i64>) +declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>) +declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>) +declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>) +declare <16 x i64> @llvm.umax.v16i64(<16 x i64>, <16 x i64>) +declare <32 x i64> @llvm.umax.v32i64(<32 x i64>, <32 x i64>) +