Skip to content

Commit ab94fad

Browse files
authored
Add fallback if we have make a weird GC decision. (#50682)
If something odd happens during GC (the PC goes to sleep) or a very big transient the heuristics might make a bad decision. What this PR implements is if we try to make our target more than double the one we had before we fallback to a more conservative method. This fixes the new issue @vtjnash found in #40644 for me.
1 parent 67d600c commit ab94fad

File tree

5 files changed

+70
-28
lines changed

5 files changed

+70
-28
lines changed

Make.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1503,7 +1503,7 @@ endef
15031503
WINE ?= wine
15041504

15051505
ifeq ($(BINARY),32)
1506-
HEAPLIM := --heap-size-hint=500M
1506+
HEAPLIM := --heap-size-hint=1000M
15071507
else
15081508
HEAPLIM :=
15091509
endif

src/gc-debug.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,18 +1224,18 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect
12241224
if (!gc_logging_enabled) {
12251225
return;
12261226
}
1227-
jl_safe_printf("GC: pause %.2fms. collected %fMB. %s %s\n",
1227+
jl_safe_printf("\nGC: pause %.2fms. collected %fMB. %s %s\n",
12281228
pause/1e6, freed/(double)(1<<20),
12291229
full ? "full" : "incr",
12301230
recollect ? "recollect" : ""
12311231
);
12321232

1233-
jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_resident %.2f MB, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n, Fragmentation %.3f",
1233+
jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_resident %.2f MB,\nheap_size %.2f MB, heap_target %.2f MB, Fragmentation %.3f\n",
12341234
jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20),
12351235
jl_atomic_load_relaxed(&gc_heap_stats.bytes_resident)/(double)(1<<20),
1236+
// live_bytes/(double)(1<<20), live byes tracking is not accurate.
12361237
jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20),
12371238
jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20),
1238-
live_bytes/(double)(1<<20),
12391239
(double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size)
12401240
);
12411241
// Should fragmentation use bytes_resident instead of heap_size?

src/gc.c

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// This file is a part of Julia. License is MIT: https://julialang.org/license
22

33
#include "gc.h"
4+
#include "julia.h"
45
#include "julia_gcext.h"
56
#include "julia_assert.h"
67
#ifdef __GLIBC__
@@ -696,8 +697,8 @@ static uint64_t old_heap_size = 0;
696697
static uint64_t old_alloc_diff = 0;
697698
static uint64_t old_freed_diff = 0;
698699
static uint64_t gc_end_time = 0;
699-
700-
700+
static int thrash_counter = 0;
701+
static int thrashing = 0;
701702
// global variables for GC stats
702703

703704
// Resetting the object to a young object, this is used when marking the
@@ -1170,7 +1171,10 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
11701171
dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
11711172
uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
11721173
uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
1174+
dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
11731175
jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc - free_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size));
1176+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
1177+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
11741178
}
11751179
}
11761180
}
@@ -3266,9 +3270,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
32663270
// If the live data outgrows the suggested max_total_memory
32673271
// we keep going with minimum intervals and full gcs until
32683272
// we either free some space or get an OOM error.
3269-
if (live_bytes > max_total_memory) {
3270-
sweep_full = 1;
3271-
}
32723273
if (gc_sweep_always_full) {
32733274
sweep_full = 1;
32743275
}
@@ -3320,7 +3321,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
33203321
gc_num.last_incremental_sweep = gc_end_time;
33213322
}
33223323

3323-
int thrashing = 0; // maybe we should report this to the user or error out?
33243324
size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size);
33253325
double target_allocs = 0.0;
33263326
double min_interval = default_collect_interval;
@@ -3331,24 +3331,32 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
33313331
double collect_smooth_factor = 0.5;
33323332
double tuning_factor = 0.03;
33333333
double alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor);
3334-
double alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor);
3334+
double alloc_time = jl_gc_smooth(old_mut_time, mutator_time + sweep_time, alloc_smooth_factor); // Charge sweeping to the mutator
33353335
double gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collect_smooth_factor);
3336-
double gc_time = jl_gc_smooth(old_pause_time, pause, collect_smooth_factor);
3336+
double gc_time = jl_gc_smooth(old_pause_time, pause - sweep_time, collect_smooth_factor);
33373337
old_alloc_diff = alloc_diff;
33383338
old_mut_time = mutator_time;
33393339
old_freed_diff = freed_diff;
33403340
old_pause_time = pause;
3341-
old_heap_size = heap_size;
3342-
thrashing = gc_time > mutator_time * 98 ? 1 : 0;
3341+
old_heap_size = heap_size; // TODO: Update these values dynamically instead of just during the GC
3342+
if (gc_time > alloc_time * 95 && !(thrash_counter < 4))
3343+
thrash_counter += 1;
3344+
else if (thrash_counter > 0)
3345+
thrash_counter -= 1;
33433346
if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0 ) {
33443347
double alloc_rate = alloc_mem/alloc_time;
33453348
double gc_rate = gc_mem/gc_time;
33463349
target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval
33473350
}
33483351
}
3349-
if (target_allocs == 0.0 || thrashing) // If we are thrashing go back to default
3350-
target_allocs = 2*sqrt((double)heap_size/min_interval);
3352+
if (thrashing == 0 && thrash_counter >= 3)
3353+
thrashing = 1;
3354+
else if (thrashing == 1 && thrash_counter <= 2)
3355+
thrashing = 0; // maybe we should report this to the user or error out?
33513356

3357+
int bad_result = (target_allocs*min_interval + heap_size) > 2 * jl_atomic_load_relaxed(&gc_heap_stats.heap_target); // Don't follow through on a bad decision
3358+
if (target_allocs == 0.0 || thrashing || bad_result) // If we are thrashing go back to default
3359+
target_allocs = 2*sqrt((double)heap_size/min_interval);
33523360
uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size;
33533361
if (target_heap > max_total_memory && !thrashing) // Allow it to go over if we are thrashing if we die we die
33543362
target_heap = max_total_memory;
@@ -3612,10 +3620,10 @@ void jl_gc_init(void)
36123620
total_mem = uv_get_total_memory();
36133621
uint64_t constrained_mem = uv_get_constrained_memory();
36143622
if (constrained_mem > 0 && constrained_mem < total_mem)
3615-
total_mem = constrained_mem;
3623+
jl_gc_set_max_memory(constrained_mem - 250*1024*1024); // LLVM + other libraries need some amount of memory
36163624
#endif
36173625
if (jl_options.heap_size_hint)
3618-
jl_gc_set_max_memory(jl_options.heap_size_hint);
3626+
jl_gc_set_max_memory(jl_options.heap_size_hint - 250*1024*1024);
36193627

36203628
t_start = jl_hrtime();
36213629
}
@@ -3718,7 +3726,26 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
37183726
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
37193727
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
37203728
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
3721-
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old);
3729+
3730+
int64_t diff = sz - old;
3731+
if (diff < 0) {
3732+
uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
3733+
if (free_acc + diff < 16*1024)
3734+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, free_acc + (-diff));
3735+
else {
3736+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_acc + (-diff)));
3737+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
3738+
}
3739+
}
3740+
else {
3741+
uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
3742+
if (alloc_acc + diff < 16*1024)
3743+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + diff);
3744+
else {
3745+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + diff);
3746+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
3747+
}
3748+
}
37223749
}
37233750
return realloc(p, sz);
37243751
}
@@ -3835,7 +3862,27 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
38353862
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
38363863
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
38373864
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
3838-
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz);
3865+
3866+
int64_t diff = allocsz - oldsz;
3867+
if (diff < 0) {
3868+
uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
3869+
if (free_acc + diff < 16*1024)
3870+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, free_acc + (-diff));
3871+
else {
3872+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_acc + (-diff)));
3873+
jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
3874+
}
3875+
}
3876+
else {
3877+
uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
3878+
if (alloc_acc + diff < 16*1024)
3879+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + diff);
3880+
else {
3881+
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + diff);
3882+
jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
3883+
}
3884+
}
3885+
38393886
int last_errno = errno;
38403887
#ifdef _OS_WINDOWS_
38413888
DWORD last_error = GetLastError();

test/cmdlineargs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,6 @@ end
974974
@test lines[3] == "foo"
975975
@test lines[4] == "bar"
976976
end
977-
#heap-size-hint
978-
@test readchomp(`$(Base.julia_cmd()) --startup-file=no --heap-size-hint=500M -e "println(@ccall jl_gc_get_max_memory()::UInt64)"`) == "524288000"
977+
#heap-size-hint, we reserve 250 MB for non GC memory (llvm, etc.)
978+
@test readchomp(`$(Base.julia_cmd()) --startup-file=no --heap-size-hint=500M -e "println(@ccall jl_gc_get_max_memory()::UInt64)"`) == "$((500-250)*1024*1024)"
979979
end

test/testenv.jl

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,7 @@ if !@isdefined(testenv_defined)
3838
function addprocs_with_testenv(X; rr_allowed=true, kwargs...)
3939
exename = rr_allowed ? `$rr_exename $test_exename` : test_exename
4040
if X isa Integer
41-
if Sys.iswindows()
42-
heap_size=round(Int,(Sys.free_memory()/(1024^2)/(X+1)))
43-
heap_size -= 300 # I don't know anymore
44-
else
45-
heap_size=round(Int,(Sys.total_memory()/(1024^2)/(X+1)))
46-
end
41+
heap_size=round(Int,(Sys.free_memory()/(1024^2)/(X+1)))
4742
push!(test_exeflags.exec, "--heap-size-hint=$(heap_size)M")
4843
end
4944
addprocs(X; exename=exename, exeflags=test_exeflags, kwargs...)

0 commit comments

Comments
 (0)