Code with goto is 3x faster than code with continue

While debugging https://github.com/JuliaLang/julia/issues/47542, I managed to reproduce the same slowdown in C: https://godbolt.org/z/789GdG3bE

The code is:
```c
/*
 * Walk arr starting at index 0, stepping by 1 after an element equal to 0
 * and by 2 after any other element, and return the first index reached that
 * is not less than len.
 *
 * This is the `goto` variant: after a zero element it jumps to a label placed
 * immediately before the `while`, so the loop condition is evaluated again
 * and the trailing `i += 2` is skipped for that iteration.
 *
 * The label shares the function's name; labels have their own name space in
 * C, so this does not clash with the function identifier.
 *
 * noinline keeps the function from being inlined into its caller
 * (presumably so it can be inspected and timed on its own -- confirm against
 * the benchmark harness).
 */
__attribute__((noinline)) size_t foo(int64_t *arr, size_t len) {
    size_t i = 0;
    foo:
    while (i < len) {
        if (arr[i] == 0) {
            /* Zero element: advance by one and restart at the loop test. */
            i += 1;
            goto foo;
        }
        /* Non-zero element: advance by two. */
        i += 2;
    }
    return i;
}

/*
 * Walk arr starting at index 0, stepping by 1 after an element equal to 0
 * and by 2 after any other element, and return the first index reached that
 * is not less than len.
 *
 * This is the `continue` variant: after a zero element it uses `continue` to
 * go back to the loop condition, skipping the trailing `i += 2` for that
 * iteration.
 *
 * noinline keeps the function from being inlined into its caller
 * (presumably so it can be inspected and timed on its own -- confirm against
 * the benchmark harness).
 */
__attribute__((noinline)) size_t bar(int64_t *arr, size_t len) {
    size_t i = 0;
    while (i < len) {
        if (arr[i] == 0) {
            /* Zero element: advance by one and re-test the loop condition. */
            i += 1;
            continue;
        }
        /* Non-zero element: advance by two. */
        i += 2;
    }
    return i;
}
```

The difference is that the code with `continue` gets compiled to a `select`, whereas the `goto` version keeps the branch. I imagine this is a choice made in SimplifyCFG, but I suspect that branch prediction/speculation on this loop is good enough that the conditional increment produced for the `continue` case is much worse. The `goto` version was faster on everything I tested (2 AArch64 CPUs and 2 x86 CPUs).

```llvm
; IR for foo (the goto version). The zero and non-zero cases remain two
; separate basic blocks (%9 and %12) reached through a conditional branch.
; %0 is the pointer argument and %1 is the i64 argument.
define dso_local i64 @foo(ptr nocapture noundef readonly %0, i64 noundef %1) local_unnamed_addr #0 {
  ; If %1 is 0, bypass the loop and go straight to the exit block.
  %3 = icmp eq i64 %1, 0
  br i1 %3, label %17, label %4

; Loop header: %5 is the current index (0 on entry, %16 on the back edge).
4:
  %5 = phi i64 [ %16, %15 ], [ 0, %2 ]
  %6 = getelementptr inbounds nuw i64, ptr %0, i64 %5
  %7 = load i64, ptr %6, align 8
  %8 = icmp eq i64 %7, 0
  ; Branch on whether the loaded element is zero.
  br i1 %8, label %9, label %12

; Loaded element was zero: index + 1, then test the new index against %1.
9:
  %10 = add nuw i64 %5, 1
  %11 = icmp ult i64 %10, %1
  br i1 %11, label %15, label %17

; Loaded element was non-zero: index + 2, then test the new index against %1.
12:
  %13 = add i64 %5, 2
  %14 = icmp ult i64 %13, %1
  br i1 %14, label %15, label %17

; Back edge: merge the two possible updated indices and return to the header.
15:
  %16 = phi i64 [ %13, %12 ], [ %10, %9 ]
  br label %4

; Exit: returns 0 when the loop was never entered, otherwise the updated
; index that failed the unsigned less-than test against %1.
17:
  %18 = phi i64 [ 0, %2 ], [ %10, %9 ], [ %13, %12 ]
  ret i64 %18
}

; IR for bar (the continue version). The loop is a single basic block: the
; step (1 or 2) is picked with a select instead of a conditional branch.
; %0 is the pointer argument and %1 is the i64 argument.
define dso_local i64 @bar(ptr nocapture noundef readonly %0, i64 noundef %1) local_unnamed_addr #0 {
  ; If %1 is 0, bypass the loop and go straight to the exit block.
  %3 = icmp eq i64 %1, 0
  br i1 %3, label %12, label %4

; Loop body: %5 is the current index (0 on entry, %10 on the back edge).
4:
  %5 = phi i64 [ %10, %4 ], [ 0, %2 ]
  %6 = getelementptr inbounds nuw i64, ptr %0, i64 %5
  %7 = load i64, ptr %6, align 8
  %8 = icmp eq i64 %7, 0
  ; Step is 1 when the loaded element is zero, 2 otherwise.
  %9 = select i1 %8, i64 1, i64 2
  ; The next index depends on the select result, and therefore on the load.
  %10 = add i64 %5, %9
  %11 = icmp ult i64 %10, %1
  br i1 %11, label %4, label %12

; Exit: returns 0 when the loop was never entered, otherwise the updated
; index that failed the unsigned less-than test against %1.
12:
  %13 = phi i64 [ 0, %2 ], [ %10, %4 ]
  ret i64 %13
}

```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Code with goto is 3x faster than code with continue #141638

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Code with goto is 3x faster than code with continue #141638

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions