Description
The following reduced IR is derived from https://github.com/dtcxzyw/llvm-opt-benchmark/blob/314b5d859bb1d19cb93c259e57edafaf11d4fc80/bench/abseil-cpp/original/bounded_utf8_length_sequence_test.ll#L8889
The reduced code is still a bit long, but we've pruned out extraneous code as much as possible. We may be able to reduce it further once we know some of the root causes. (Further reduced versions have been placed in the comments.)
missed optimization: %66 = getelementptr inbounds nuw [2 x i64], ptr %0, i64 0, i64 %65
-> %66 = getelementptr inbounds nuw [2 x i64], ptr %0, i64 0, i64 0
https://godbolt.org/z/G3xf9rh8W
reduced code:
; Reduced reproducer for the missed fold of the GEP index.
; %1 is clamped to at most 63 and then divided by 32, so the quotient is 0 or 1.
; The block that indexes the array is only reached when the quotient is nonzero,
; so the index (quotient - 1) is always 0 there.
; Note: value numbers are non-contiguous because of reduction, and the "preds"
; annotations were carried over from the unreduced function; some of them name
; blocks (%3, %27, %30) that do not appear in this reduced body.
define i64 @src(ptr noundef nonnull align 8 dereferenceable(16) %0, i32 noundef %1) {
; Stack slot holding the (possibly clamped) copy of %1.
%5 = alloca i32, align 4
; Stack slot holding the quotient %1 / 32.
%16 = alloca i32, align 4
store i32 %1, ptr %5, align 4
%21 = load i32, ptr %5, align 4
; Clamp: values >= 64 are replaced by 63 in the block below.
%22 = icmp uge i32 %21, 64
br i1 %22, label %23, label %24
23: ; preds = %3
store i32 63, ptr %5, align 4
br label %24
24: ; preds = %23, %3
br label %31
31: ; preds = %30, %27
; The value loaded here is at most 63, so the quotient is 0 or 1.
%32 = load i32, ptr %5, align 4
%33 = udiv i32 %32, 32
store i32 %33, ptr %16, align 4
br label %57
57: ; preds = %31
%58 = load i32, ptr %16, align 4
; Quotient == 0 returns the constant 1; quotient > 0 goes on to load from %0.
%59 = icmp ugt i32 %58, 0
br i1 %59, label %61, label %60
60: ; preds = %57
ret i64 1
61: ; preds = %57
; On this path the quotient is nonzero (hence 1), so %64 and %65 are 0.
%63 = load i32, ptr %16, align 4
%64 = sub i32 %63, 1
%65 = zext i32 %64 to i64
; Missed optimization: the index %65 could be replaced by the constant 0.
%66 = getelementptr inbounds nuw [2 x i64], ptr %0, i64 0, i64 %65
%67 = load i64, ptr %66, align 8
ret i64 %67
}
clang-trunk:
; Current clang-trunk output for @src.
; The stack slots are gone and the early exit is expressed as %1 < 32, but the
; array index is still computed at run time as (umin(%1, 63) >> 5) - 1.
define i64 @src(ptr noundef nonnull readonly align 8 captures(none) dereferenceable(16) %0, i32 noundef %1) local_unnamed_addr #0 {
; %1 < 32 corresponds to the "quotient == 0" case and returns the constant 1.
%.not = icmp ult i32 %1, 32
br i1 %.not, label %common.ret, label %3
common.ret: ; preds = %2, %3
%common.ret.op = phi i64 [ %8, %3 ], [ 1, %2 ]
ret i64 %common.ret.op
3: ; preds = %2
; Only reached when %1 >= 32, so umin(%1, 63) is in [32, 63], the shift by 5
; yields 1, and %5 / %6 are 0 -- yet the index is not folded to a constant.
%spec.store.select = tail call i32 @llvm.umin.i32(i32 %1, i32 63)
%4 = lshr i32 %spec.store.select, 5
%5 = add nsw i32 %4, -1
%6 = zext nneg i32 %5 to i64
%7 = getelementptr inbounds nuw [2 x i64], ptr %0, i64 0, i64 %6
%8 = load i64, ptr %7, align 8
br label %common.ret
}
expected code:
; Desired output: with the GEP index folded to 0, the load reads directly from
; %0 and the branch collapses into a select.
define i64 @src_optimized(ptr noundef nonnull readonly align 8 captures(none) dereferenceable(16) %0, i32 noundef %1) local_unnamed_addr #0 {
common.ret:
%.not = icmp ult i32 %1, 32
; The load is performed unconditionally; %0 is declared align 8 and
; dereferenceable(16), which covers this 8-byte access.
%2 = load i64, ptr %0, align 8
; Returns 1 when %1 < 32, otherwise the first i64 element at %0.
%spec.select = select i1 %.not, i64 1, i64 %2
ret i64 %spec.select
}
Alive2 timed out, but `opt -O3` produces the same IR for @src_optimized and @tgt, which suggests that the code before and after the desired optimization is equivalent. https://godbolt.org/z/G3xf9rh8W
(@src_optimized is obtained after @src is optimized by clang; @tgt is obtained after replacing `%66 = getelementptr inbounds nuw [2 x i64], ptr %0, i64 0, i64 %65` with `%66 = getelementptr inbounds nuw [2 x i64], ptr %0, i64 0, i64 0`.)