Skip to content

Very inefficient GC frame generation #15369

@yuyichao

Description

@yuyichao

After codegen_rewrite2, codegen is not able to reuse GC frame slots in the same basic block anymore, for both jlcall buffer and (especially) temporaries.

Testing with the following simple functions

julia> @noinline g1(a, b) = nothing
g1 (generic function with 1 method)

julia> @noinline g2(a...) = nothing
g2 (generic function with 1 method)

julia> function f1()
           g1(Ref(1), Ref(2))
           g1(Ref(1), Ref(2))
       end
f1 (generic function with 1 method)

julia> function f2()
           g2(Ref(1), Ref(2))
           g2(Ref(1), Ref(2))
       end
f2 (generic function with 1 method)

On 2bb94d6 (juliabox version, before jn/codegen_rewrite2 and after jb/functions so that the jlcall convention is the same with current master). f1() and f2() both generate only 2 gc frame slots (note that we emit the temporary directly into the jlcall frame in f2() and the jlcall frames are always reused).

define void @julia_f1_23552() #0 {
top:                                                                             
  %0 = alloca [4 x %jl_value_t*], align 8                                        
  %.sub = getelementptr inbounds [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 
0, i64 0                                                                         
  %1 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 2    
  %2 = bitcast [4 x %jl_value_t*]* %0 to i64*                                    
  store i64 4, i64* %2, align 8                                                  
  %3 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 1    
  %4 = load i64, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8   
  %5 = bitcast %jl_value_t** %3 to i64*                                          
  store i64 %4, i64* %5, align 8                                                 
  store %jl_value_t** %.sub, %jl_value_t*** bitcast ([80040 x i8*]* @jl_tls_state
s to %jl_value_t***), align 8                                                    
  store %jl_value_t* null, %jl_value_t** %1, align 8                             
  %6 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 3    
  store %jl_value_t* null, %jl_value_t** %6, align 8                             
  %7 = call %jl_value_t* @jl_gc_alloc_1w()                                       
  %8 = getelementptr inbounds %jl_value_t, %jl_value_t* %7, i64 -1, i32 0        
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %8, align 8                                                                    
  store %jl_value_t* %7, %jl_value_t** %1, align 8                               
  %9 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128          
  %10 = bitcast %jl_value_t* %7 to i64*                                          
  store i64 %9, i64* %10, align 16                                               
  store %jl_value_t* %7, %jl_value_t** %1, align 8                               
  %11 = call %jl_value_t* @jl_gc_alloc_1w()                                      
  %12 = getelementptr inbounds %jl_value_t, %jl_value_t* %11, i64 -1, i32 0      
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %12, align 8                                                                   
  store %jl_value_t* %11, %jl_value_t** %6, align 8                              
  %13 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16          
  %14 = bitcast %jl_value_t* %11 to i64*                                         
  store i64 %13, i64* %14, align 16                                              
  store %jl_value_t* %11, %jl_value_t** %6, align 8                              
  call void @julia_g1_23553(%jl_value_t* %7, %jl_value_t* %11) #0                
  %15 = call %jl_value_t* @jl_gc_alloc_1w()                                      
  %16 = getelementptr inbounds %jl_value_t, %jl_value_t* %15, i64 -1, i32 0      
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %16, align 8                                                                   
  store %jl_value_t* %15, %jl_value_t** %1, align 8                              
  %17 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128         
  %18 = bitcast %jl_value_t* %15 to i64*                                         
  store i64 %17, i64* %18, align 16                                              
  store %jl_value_t* %15, %jl_value_t** %1, align 8                              
  %19 = call %jl_value_t* @jl_gc_alloc_1w()                                      
  %20 = getelementptr inbounds %jl_value_t, %jl_value_t* %19, i64 -1, i32 0      
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %20, align 8                                                                   
  store %jl_value_t* %19, %jl_value_t** %6, align 8                              
  %21 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16          
  %22 = bitcast %jl_value_t* %19 to i64*                                         
  store i64 %21, i64* %22, align 16                                              
  store %jl_value_t* %19, %jl_value_t** %6, align 8                              
  call void @julia_g1_23553(%jl_value_t* %15, %jl_value_t* %19) #0               
  %23 = load i64, i64* %5, align 8                                               
  store i64 %23, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8   
  ret void                                                                       
}                                                                                

define void @julia_f2_23557() #0 {                                             
top:                                                                             
  %0 = alloca [4 x %jl_value_t*], align 8                                        
  %.sub = getelementptr inbounds [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 
0, i64 0                                                                         
  %1 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 2    
  %2 = bitcast [4 x %jl_value_t*]* %0 to i64*                                    
  store i64 4, i64* %2, align 8                                                  
  %3 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 1    
  %4 = load i64, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8   
  %5 = bitcast %jl_value_t** %3 to i64*                                          
  store i64 %4, i64* %5, align 8                                                 
  store %jl_value_t** %.sub, %jl_value_t*** bitcast ([80040 x i8*]* @jl_tls_state
s to %jl_value_t***), align 8                                                    
  store %jl_value_t* null, %jl_value_t** %1, align 8                             
  %6 = getelementptr [4 x %jl_value_t*], [4 x %jl_value_t*]* %0, i64 0, i64 3    
  store %jl_value_t* null, %jl_value_t** %6, align 8                             
  %7 = call %jl_value_t* @jl_gc_alloc_1w()                                       
  %8 = getelementptr inbounds %jl_value_t, %jl_value_t* %7, i64 -1, i32 0        
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %8, align 8                                                                    
  store %jl_value_t* %7, %jl_value_t** %1, align 8                               
  %9 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128          
  %10 = bitcast %jl_value_t* %7 to i64*                                          
  store i64 %9, i64* %10, align 16                                               
  store %jl_value_t* %7, %jl_value_t** %1, align 8                               
  %11 = call %jl_value_t* @jl_gc_alloc_1w()                                      
  %12 = getelementptr inbounds %jl_value_t, %jl_value_t* %11, i64 -1, i32 0      
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %12, align 8                                                                   
  store %jl_value_t* %11, %jl_value_t** %6, align 8                              
  %13 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16          
  %14 = bitcast %jl_value_t* %11 to i64*                                         
  store i64 %13, i64* %14, align 16                                              
  store %jl_value_t* %11, %jl_value_t** %6, align 8                              
  %15 = call %jl_value_t* @julia_g2_23558(%jl_value_t* inttoptr (i64 139617449836
992 to %jl_value_t*), %jl_value_t** %1, i32 2)                                   
  %16 = call %jl_value_t* @jl_gc_alloc_1w()                                      
  %17 = getelementptr inbounds %jl_value_t, %jl_value_t* %16, i64 -1, i32 0      
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %17, align 8                                                                   
  store %jl_value_t* %16, %jl_value_t** %1, align 8                              
  %18 = load i64, i64* inttoptr (i64 139617449926784 to i64*), align 128         
  %19 = bitcast %jl_value_t* %16 to i64*                                         
  store i64 %18, i64* %19, align 16                                              
  store %jl_value_t* %16, %jl_value_t** %1, align 8                              
  %20 = call %jl_value_t* @jl_gc_alloc_1w()                                      
  %21 = getelementptr inbounds %jl_value_t, %jl_value_t* %20, i64 -1, i32 0      
  store %jl_value_t* inttoptr (i64 139617450557648 to %jl_value_t*), %jl_value_t*
* %21, align 8                                                                   
  store %jl_value_t* %20, %jl_value_t** %6, align 8                              
  %22 = load i64, i64* inttoptr (i64 139617449926832 to i64*), align 16          
  %23 = bitcast %jl_value_t* %20 to i64*                                         
  store i64 %22, i64* %23, align 16                                              
  store %jl_value_t* %20, %jl_value_t** %6, align 8                              
  %24 = call %jl_value_t* @julia_g2_23558(%jl_value_t* inttoptr (i64 139617449836
992 to %jl_value_t*), %jl_value_t** %1, i32 2)                                   
  %25 = load i64, i64* %5, align 8                                               
  store i64 %25, i64* bitcast ([80040 x i8*]* @jl_tls_states to i64*), align 8   
  ret void                                                                       
}

And on current master it generates 4 (4 temporaries) and 8 (4 temporaries and 2 * 2 jlcall frames without any reuse...)

define void @julia_f1_22963() #0 {
top:
  %0 = alloca [6 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 0
  %1 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 2
  store %jl_value_t* null, %jl_value_t** %1, align 8
  %2 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 3
  store %jl_value_t* null, %jl_value_t** %2, align 8
  %3 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 4
  store %jl_value_t* null, %jl_value_t** %3, align 8
  %4 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 5
  store %jl_value_t* null, %jl_value_t** %4, align 8
  %5 = bitcast [6 x %jl_value_t*]* %0 to i64*
  store i64 8, i64* %5, align 8
  %6 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %0, i64 0, i64 1
  %7 = load i64, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
  %8 = bitcast %jl_value_t** %6 to i64*
  store i64 %7, i64* %8, align 8
  store %jl_value_t** %.sub, %jl_value_t*** @jl_tls_states, align 8
  %9 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %9, %jl_value_t** %1, align 8
  %10 = getelementptr inbounds %jl_value_t, %jl_value_t* %9, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %10, align 8
  %11 = bitcast %jl_value_t* %9 to i64*
  store i64 1, i64* %11, align 16
  %12 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %12, %jl_value_t** %2, align 8
  %13 = getelementptr inbounds %jl_value_t, %jl_value_t* %12, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %13, align 8
  %14 = bitcast %jl_value_t* %12 to i64*
  store i64 2, i64* %14, align 16
  call void @julia_g1_22964(%jl_value_t* %9, %jl_value_t* %12) #0
  %15 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %15, %jl_value_t** %3, align 8
  %16 = getelementptr inbounds %jl_value_t, %jl_value_t* %15, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %16, align 8
  %17 = bitcast %jl_value_t* %15 to i64*
  store i64 1, i64* %17, align 16
  %18 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %18, %jl_value_t** %4, align 8
  %19 = getelementptr inbounds %jl_value_t, %jl_value_t* %18, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %19, align 8
  %20 = bitcast %jl_value_t* %18 to i64*
  store i64 2, i64* %20, align 16
  call void @julia_g1_22964(%jl_value_t* %15, %jl_value_t* %18) #0
  %21 = load i64, i64* %8, align 8
  store i64 %21, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
  ret void
}

julia> @code_llvm f2()

define void @julia_f2_22968() #0 {
top:
  %0 = alloca [10 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 0
  %1 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 2
  %2 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 6
  %3 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 8
  store %jl_value_t* null, %jl_value_t** %1, align 8
  %4 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 3
  store %jl_value_t* null, %jl_value_t** %4, align 8
  %5 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 4
  store %jl_value_t* null, %jl_value_t** %5, align 8
  %6 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 5
  store %jl_value_t* null, %jl_value_t** %6, align 8
  store %jl_value_t* null, %jl_value_t** %2, align 8
  %7 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 7
  store %jl_value_t* null, %jl_value_t** %7, align 8
  store %jl_value_t* null, %jl_value_t** %3, align 8
  %8 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 9
  store %jl_value_t* null, %jl_value_t** %8, align 8
  %9 = bitcast [10 x %jl_value_t*]* %0 to i64*
  store i64 16, i64* %9, align 8
  %10 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %0, i64 0, i64 1
  %11 = load i64, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
  %12 = bitcast %jl_value_t** %10 to i64*
  store i64 %11, i64* %12, align 8
  store %jl_value_t** %.sub, %jl_value_t*** @jl_tls_states, align 8
  %13 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %13, %jl_value_t** %1, align 8
  %14 = getelementptr inbounds %jl_value_t, %jl_value_t* %13, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %14, align 8
  %15 = bitcast %jl_value_t* %13 to i64*
  store i64 1, i64* %15, align 16
  %16 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %16, %jl_value_t** %4, align 8
  %17 = getelementptr inbounds %jl_value_t, %jl_value_t* %16, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %17, align 8
  %18 = bitcast %jl_value_t* %16 to i64*
  store i64 2, i64* %18, align 16
  store %jl_value_t* %13, %jl_value_t** %3, align 8
  store %jl_value_t* %16, %jl_value_t** %8, align 8
  %19 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %3, i32 2)
  %20 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %20, %jl_value_t** %5, align 8
  %21 = getelementptr inbounds %jl_value_t, %jl_value_t* %20, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %21, align 8
  %22 = bitcast %jl_value_t* %20 to i64*
  store i64 1, i64* %22, align 16
  %23 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %23, %jl_value_t** %6, align 8
  %24 = getelementptr inbounds %jl_value_t, %jl_value_t* %23, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %24, align 8
  %25 = bitcast %jl_value_t* %23 to i64*
  store i64 2, i64* %25, align 16
  store %jl_value_t* %20, %jl_value_t** %2, align 8
  store %jl_value_t* %23, %jl_value_t** %7, align 8
  %26 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %2, i32 2)
  %27 = load i64, i64* %12, align 8
  store i64 %27, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
  ret void
}

Adding a useless branch that can be constant folded by llvm (but not type inference) causes the jlcall buffer to be reused but not the temporaries.

julia> function f3()
           g2(Ref(1), Ref(2))
           1 == 1 ? 1 : 1
           g2(Ref(1), Ref(2))
       end
f3 (generic function with 1 method)
define void @julia_f3_22983() #0 {
top:
  %0 = alloca [8 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 0
  %1 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 2
  %2 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 6
  store %jl_value_t* null, %jl_value_t** %1, align 8
  %3 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 3
  store %jl_value_t* null, %jl_value_t** %3, align 8
  %4 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 4
  store %jl_value_t* null, %jl_value_t** %4, align 8
  %5 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 5
  store %jl_value_t* null, %jl_value_t** %5, align 8
  store %jl_value_t* null, %jl_value_t** %2, align 8
  %6 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 7
  store %jl_value_t* null, %jl_value_t** %6, align 8
  %7 = bitcast [8 x %jl_value_t*]* %0 to i64*
  store i64 12, i64* %7, align 8
  %8 = getelementptr [8 x %jl_value_t*], [8 x %jl_value_t*]* %0, i64 0, i64 1
  %9 = load i64, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
  %10 = bitcast %jl_value_t** %8 to i64*
  store i64 %9, i64* %10, align 8
  store %jl_value_t** %.sub, %jl_value_t*** @jl_tls_states, align 8
  %11 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %11, %jl_value_t** %1, align 8
  %12 = getelementptr inbounds %jl_value_t, %jl_value_t* %11, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %12, align 8
  %13 = bitcast %jl_value_t* %11 to i64*
  store i64 1, i64* %13, align 16
  %14 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %14, %jl_value_t** %3, align 8
  %15 = getelementptr inbounds %jl_value_t, %jl_value_t* %14, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %15, align 8
  %16 = bitcast %jl_value_t* %14 to i64*
  store i64 2, i64* %16, align 16
  store %jl_value_t* %11, %jl_value_t** %2, align 8
  store %jl_value_t* %14, %jl_value_t** %6, align 8
  %17 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %2, i32 2)
  %18 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %18, %jl_value_t** %4, align 8
  %19 = getelementptr inbounds %jl_value_t, %jl_value_t* %18, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %19, align 8
  %20 = bitcast %jl_value_t* %18 to i64*
  store i64 1, i64* %20, align 16
  %21 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %21, %jl_value_t** %5, align 8
  %22 = getelementptr inbounds %jl_value_t, %jl_value_t* %21, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 139780294998144 to %jl_value_t*), %jl_value_t** %22, align 8
  %23 = bitcast %jl_value_t* %21 to i64*
  store i64 2, i64* %23, align 16
  store %jl_value_t* %18, %jl_value_t** %2, align 8
  store %jl_value_t* %21, %jl_value_t** %6, align 8
  %24 = call %jl_value_t* @julia_g2_22969(%jl_value_t* inttoptr (i64 139780292051200 to %jl_value_t*), %jl_value_t** %2, i32 2)
  %25 = load i64, i64* %10, align 8
  store i64 %25, i64* bitcast (%jl_value_t*** @jl_tls_states to i64*), align 8
  ret void
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    GCGarbage collectorcompiler:codegenGeneration of LLVM IR and native codeperformanceMust go fasterregressionRegression in behavior compared to a previous version

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions