-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
Description
After codegen_rewrite2, codegen is not able to reuse GC frame slots in the same basic block anymore, for both jlcall buffer and (especially) temporaries.
Testing with the following simple functions
julia> @noinline g1(a, b) = nothing
g1 (generic function with 1 method)
julia> @noinline g2(a...) = nothing
g2 (generic function with 1 method)
julia> function f1()
g1(Ref(1), Ref(2))
g1(Ref(1), Ref(2))
end
f1 (generic function with 1 method)
julia> function f2()
g2(Ref(1), Ref(2))
g2(Ref(1), Ref(2))
end
f2 (generic function with 1 method)
On 2bb94d6 (juliabox version, before jn/codegen_rewrite2 and after jb/functions so that the jlcall convention is the same with current master). f1()
and f2()
both generate only 2 gc frame slots (note that we emit the temporary directly into the jlcall frame in f2()
and the jlcall frames are always reused).
define void @julia_f1_23552() #0 {
top:
%0 = alloca [4 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64
0, i64 0
%1 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 2
%2 = bitcast [4 x %jl_value_t*]* %0 to i64*
store i64 4, i64* %2, align 8
%3 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 1
%4 = load i64, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8
%5 = bitcast %jl_value_t** %3 to i64*
store i64 %4, i64* %5, align 8
store %jl_value_t** %.sub, %jl_value_t*** bitcast ([80040 x i8*]* @jl_tls_state
s to %jl_value_t***), align 8
store %jl_value_t* null, %jl_value_t** %1, align 8
%6 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 3
store %jl_value_t* null, %jl_value_t** %6, align 8
%7 = call %jl_value_t* @jl_gc_alloc_1w()
%8 = getelementptr inbounds %jl_value_t, %jl_value_t* %7, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %8, align 8
store %jl_value_t* %7, %jl_value_t** %1, align 8
%9 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128
%10 = bitcast %jl_value_t* %7 to i64*
store i64 %9, i64* %10, align 16
store %jl_value_t* %7, %jl_value_t** %1, align 8
%11 = call %jl_value_t* @jl_gc_alloc_1w()
%12 = getelementptr inbounds %jl_value_t, %jl_value_t* %11, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %12, align 8
store %jl_value_t* %11, %jl_value_t** %6, align 8
%13 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16
%14 = bitcast %jl_value_t* %11 to i64*
store i64 %13, i64* %14, align 16
store %jl_value_t* %11, %jl_value_t** %6, align 8
call void @julia_g1_23553(%jl_value_t* %7, %jl_value_t* %11) #0
%15 = call %jl_value_t* @jl_gc_alloc_1w()
%16 = getelementptr inbounds %jl_value_t, %jl_value_t* %15, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %16, align 8
store %jl_value_t* %15, %jl_value_t** %1, align 8
%17 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128
%18 = bitcast %jl_value_t* %15 to i64*
store i64 %17, i64* %18, align 16
store %jl_value_t* %15, %jl_value_t** %1, align 8
%19 = call %jl_value_t* @jl_gc_alloc_1w()
%20 = getelementptr inbounds %jl_value_t, %jl_value_t* %19, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %20, align 8
store %jl_value_t* %19, %jl_value_t** %6, align 8
%21 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16
%22 = bitcast %jl_value_t* %19 to i64*
store i64 %21, i64* %22, align 16
store %jl_value_t* %19, %jl_value_t** %6, align 8
call void @julia_g1_23553(%jl_value_t* %15, %jl_value_t* %19) #0
%23 = load i64, i64* %5, align 8
store i64 %23, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8
ret void
}
define void @julia_f2_23557() #0 {
top:
%0 = alloca [4 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64
0, i64 0
%1 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 2
%2 = bitcast [4 x %jl_value_t*]* %0 to i64*
store i64 4, i64* %2, align 8
%3 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 1
%4 = load i64, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8
%5 = bitcast %jl_value_t** %3 to i64*
store i64 %4, i64* %5, align 8
store %jl_value_t** %.sub, %jl_value_t*** bitcast ([80040 x i8*]* @jl_tls_state
s to %jl_value_t***), align 8
store %jl_value_t* null, %jl_value_t** %1, align 8
%6 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 3
store %jl_value_t* null, %jl_value_t** %6, align 8
%7 = call %jl_value_t* @jl_gc_alloc_1w()
%8 = getelementptr inbounds %jl_value_t, %jl_value_t* %7, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %8, align 8
store %jl_value_t* %7, %jl_value_t** %1, align 8
%9 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128
%10 = bitcast %jl_value_t* %7 to i64*
store i64 %9, i64* %10, align 16
store %jl_value_t* %7, %jl_value_t** %1, align 8
%11 = call %jl_value_t* @jl_gc_alloc_1w()
%12 = getelementptr inbounds %jl_value_t, %jl_value_t* %11, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %12, align 8
store %jl_value_t* %11, %jl_value_t** %6, align 8
%13 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16
%14 = bitcast %jl_value_t* %11 to i64*
store i64 %13, i64* %14, align 16
store %jl_value_t* %11, %jl_value_t** %6, align 8
%15 = call %jl_value_t* @julia_g2_23558(%jl_value_t* inttoptr (i64 139617449836
992 to %jl_value_t*), %jl_value_t** %1, i32 2)
%16 = call %jl_value_t* @jl_gc_alloc_1w()
%17 = getelementptr inbounds %jl_value_t, %jl_value_t* %16, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %17, align 8
store %jl_value_t* %16, %jl_value_t** %1, align 8
%18 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128
%19 = bitcast %jl_value_t* %16 to i64*
store i64 %18, i64* %19, align 16
store %jl_value_t* %16, %jl_value_t** %1, align 8
%20 = call %jl_value_t* @jl_gc_alloc_1w()
%21 = getelementptr inbounds %jl_value_t, %jl_value_t* %20, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %21, align 8
store %jl_value_t* %20, %jl_value_t** %6, align 8
%22 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16
%23 = bitcast %jl_value_t* %20 to i64*
store i64 %22, i64* %23, align 16
store %jl_value_t* %20, %jl_value_t** %6, align 8
%24 = call %jl_value_t* @julia_g2_23558(%jl_value_t* inttoptr (i64 139617449836
992 to %jl_value_t*), %jl_value_t** %1, i32 2)
%25 = load i64, i64* %5, align 8
store i64 %25, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8
ret void
}
And on current master it generates 4 (4 temporaries) and 8 (4 temporaries and 2 * 2 jlcall frames without any reuse...)
define void @julia_f1_22963() #0 {
top:
%0 = alloca [6 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 0
%1 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 2
store %jl_value_t* null, %jl_value_t** %1, align 8
%2 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 3
store %jl_value_t* null, %jl_value_t** %2, align 8
%3 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 4
store %jl_value_t* null, %jl_value_t** %3, align 8
%4 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 5
store %jl_value_t* null, %jl_value_t** %4, align 8
%5 = bitcast [6 x %jl_value_t*]* %0 to i64*
store i64 8, i64* %5, align 8
%6 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 1
%7 = load i64, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
%8 = bitcast %jl_value_t** %6 to i64*
store i64 %7, i64* %8, align 8
store %jl_value_t** %.sub, %jl_value_t*** @jl_tls_states, align 8
%9 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %9, %jl_value_t** %1, align 8
%10 = getelementptr inbounds %jl_value_t, %jl_value_t* %9, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %10, align 8
%11 = bitcast %jl_value_t* %9 to i64*
store i64 1, i64* %11, align 16
%12 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %12, %jl_value_t** %2, align 8
%13 = getelementptr inbounds %jl_value_t, %jl_value_t* %12, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %13, align 8
%14 = bitcast %jl_value_t* %12 to i64*
store i64 2, i64* %14, align 16
call void @julia_g1_22964(%jl_value_t* %9, %jl_value_t* %12) #0
%15 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %15, %jl_value_t** %3, align 8
%16 = getelementptr inbounds %jl_value_t, %jl_value_t* %15, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %16, align 8
%17 = bitcast %jl_value_t* %15 to i64*
store i64 1, i64* %17, align 16
%18 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %18, %jl_value_t** %4, align 8
%19 = getelementptr inbounds %jl_value_t, %jl_value_t* %18, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %19, align 8
%20 = bitcast %jl_value_t* %18 to i64*
store i64 2, i64* %20, align 16
call void @julia_g1_22964(%jl_value_t* %15, %jl_value_t* %18) #0
%21 = load i64, i64* %8, align 8
store i64 %21, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
ret void
}
julia> @code_llvm f2()
define void @julia_f2_22968() #0 {
top:
%0 = alloca [10 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 0
%1 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 2
%2 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 6
%3 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 8
store %jl_value_t* null, %jl_value_t** %1, align 8
%4 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 3
store %jl_value_t* null, %jl_value_t** %4, align 8
%5 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 4
store %jl_value_t* null, %jl_value_t** %5, align 8
%6 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 5
store %jl_value_t* null, %jl_value_t** %6, align 8
store %jl_value_t* null, %jl_value_t** %2, align 8
%7 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 7
store %jl_value_t* null, %jl_value_t** %7, align 8
store %jl_value_t* null, %jl_value_t** %3, align 8
%8 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 9
store %jl_value_t* null, %jl_value_t** %8, align 8
%9 = bitcast [10 x %jl_value_t*]* %0 to i64*
store i64 16, i64* %9, align 8
%10 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 1
%11 = load i64, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
%12 = bitcast %jl_value_t** %10 to i64*
store i64 %11, i64* %12, align 8
store %jl_value_t** %.sub, %jl_value_t*** @jl_tls_states, align 8
%13 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %13, %jl_value_t** %1, align 8
%14 = getelementptr inbounds %jl_value_t, %jl_value_t* %13, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %14, align 8
%15 = bitcast %jl_value_t* %13 to i64*
store i64 1, i64* %15, align 16
%16 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %16, %jl_value_t** %4, align 8
%17 = getelementptr inbounds %jl_value_t, %jl_value_t* %16, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %17, align 8
%18 = bitcast %jl_value_t* %16 to i64*
store i64 2, i64* %18, align 16
store %jl_value_t* %13, %jl_value_t** %3, align 8
store %jl_value_t* %16, %jl_value_t** %8, align 8
%19 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %3, i32 2)
%20 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %20, %jl_value_t** %5, align 8
%21 = getelementptr inbounds %jl_value_t, %jl_value_t* %20, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %21, align 8
%22 = bitcast %jl_value_t* %20 to i64*
store i64 1, i64* %22, align 16
%23 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %23, %jl_value_t** %6, align 8
%24 = getelementptr inbounds %jl_value_t, %jl_value_t* %23, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %24, align 8
%25 = bitcast %jl_value_t* %23 to i64*
store i64 2, i64* %25, align 16
store %jl_value_t* %20, %jl_value_t** %2, align 8
store %jl_value_t* %23, %jl_value_t** %7, align 8
%26 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %2, i32 2)
%27 = load i64, i64* %12, align 8
store i64 %27, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
ret void
}
Adding a useless branch that can be constant folded by llvm (but not type inference) causes the jlcall buffer to be reused but not the temporaries.
julia> function f3()
g2(Ref(1), Ref(2))
1 == 1 ? 1 : 1
g2(Ref(1), Ref(2))
end
f3 (generic function with 1 method)
define void @julia_f3_22983() #0 {
top:
%0 = alloca [8 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 0
%1 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 2
%2 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 6
store %jl_value_t* null, %jl_value_t** %1, align 8
%3 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 3
store %jl_value_t* null, %jl_value_t** %3, align 8
%4 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 4
store %jl_value_t* null, %jl_value_t** %4, align 8
%5 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 5
store %jl_value_t* null, %jl_value_t** %5, align 8
store %jl_value_t* null, %jl_value_t** %2, align 8
%6 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 7
store %jl_value_t* null, %jl_value_t** %6, align 8
%7 = bitcast [8 x %jl_value_t*]* %0 to i64*
store i64 12, i64* %7, align 8
%8 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 1
%9 = load i64, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
%10 = bitcast %jl_value_t** %8 to i64*
store i64 %9, i64* %10, align 8
store %jl_value_t** %.sub, %jl_value_t*** @jl_tls_states, align 8
%11 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %11, %jl_value_t** %1, align 8
%12 = getelementptr inbounds %jl_value_t, %jl_value_t* %11, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %12, align 8
%13 = bitcast %jl_value_t* %11 to i64*
store i64 1, i64* %13, align 16
%14 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %14, %jl_value_t** %3, align 8
%15 = getelementptr inbounds %jl_value_t, %jl_value_t* %14, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %15, align 8
%16 = bitcast %jl_value_t* %14 to i64*
store i64 2, i64* %16, align 16
store %jl_value_t* %11, %jl_value_t** %2, align 8
store %jl_value_t* %14, %jl_value_t** %6, align 8
%17 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %2, i32 2)
%18 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %18, %jl_value_t** %4, align 8
%19 = getelementptr inbounds %jl_value_t, %jl_value_t* %18, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %19, align 8
%20 = bitcast %jl_value_t* %18 to i64*
store i64 1, i64* %20, align 16
%21 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %21, %jl_value_t** %5, align 8
%22 = getelementptr inbounds %jl_value_t, %jl_value_t* %21, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %22, align 8
%23 = bitcast %jl_value_t* %21 to i64*
store i64 2, i64* %23, align 16
store %jl_value_t* %18, %jl_value_t** %2, align 8
store %jl_value_t* %21, %jl_value_t** %6, align 8
%24 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %2, i32 2)
%25 = load i64, i64* %10, align 8
store i64 %25, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
ret void
}