@@ -24,11 +24,17 @@ int jl_n_sweepthreads;
// Number of threads currently running the GC mark-loop
_Atomic(int) gc_n_threads_marking;
// Number of threads currently sweeping GC pool pages
_Atomic(int) gc_n_threads_sweeping_pools;
// Number of threads currently sweeping thread stack pools
_Atomic(int) gc_n_threads_sweeping_stacks;
// Temporary for the `ptls->gc_tls.page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
// `tid` of mutator thread that triggered GC
_Atomic(int) gc_master_tid;
// counter for sharing work when sweeping stacks; counts down as sweepers
// claim per-thread stack pools, and goes negative once all have been claimed
_Atomic(int) gc_ptls_sweep_idx;
// counter for round robin of giving back stack pages to the OS: selects which
// thread's stack pool is released this collection (advances once per GC)
_Atomic(int) gc_stack_free_idx = 0;
// `tid` of first GC thread
int gc_first_tid;
3440// Mutex/cond used to synchronize wakeup of GC threads on parallel marking
@@ -996,13 +1002,50 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
// sweep over all memory that is being used and not in a pool
// (foreign objects, malloc'd memory, big objects, and engine state);
// stack pools are swept separately via sweep_stack_pools()
static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
{
    // NOTE(review): sweep_full is not consulted in this body — presumably
    // retained for interface symmetry with other sweep phases; confirm.
    gc_sweep_foreign_objs();
    sweep_malloced_memory();
    sweep_big(ptls);
    jl_engine_sweep(gc_all_tls_states);
}
10051010
1011+ // wake up all threads to sweep the stacks
1012+ void gc_sweep_wake_all_stacks (jl_ptls_t ptls ) JL_NOTSAFEPOINT
1013+ {
1014+ uv_mutex_lock (& gc_threads_lock );
1015+ int first = gc_first_parallel_collector_thread_id ();
1016+ int last = gc_last_parallel_collector_thread_id ();
1017+ for (int i = first ; i <= last ; i ++ ) {
1018+ jl_ptls_t ptls2 = gc_all_tls_states [i ];
1019+ gc_check_ptls_of_parallel_collector_thread (ptls2 );
1020+ jl_atomic_fetch_add (& ptls2 -> gc_tls .gc_stack_sweep_requested , 1 );
1021+ }
1022+ uv_cond_broadcast (& gc_threads_cond );
1023+ uv_mutex_unlock (& gc_threads_lock );
1024+ return ;
1025+ }
1026+
1027+ void gc_sweep_wait_for_all_stacks (void ) JL_NOTSAFEPOINT
1028+ {
1029+ while ((jl_atomic_load_acquire (& gc_ptls_sweep_idx ) >= 0 ) || jl_atomic_load_acquire (& gc_n_threads_sweeping_stacks ) != 0 ) {
1030+ jl_cpu_pause ();
1031+ }
1032+ }
1033+
// Sweep the per-thread stack pools in parallel. The calling (GC master)
// thread wakes all parallel GC threads, participates in the sweep itself,
// then blocks until every helper has finished.
void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
    // initialize ptls index for parallel sweeping of stack pools
    assert(gc_n_threads);
    // advance the round-robin counter that picks which thread's pool gives
    // pages back to the OS this cycle, wrapping at gc_n_threads
    int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx);
    if (stack_free_idx + 1 == gc_n_threads)
        jl_atomic_store_relaxed(&gc_stack_free_idx, 0);
    else
        jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1);
    // release-store publishes the work counter before waking sweepers, which
    // read it with acquire; sweepers claim indices by counting this down
    jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial
    gc_sweep_wake_all_stacks(ptls);
    // this thread joins the sweep instead of idling while helpers work
    sweep_stack_pool_loop();
    gc_sweep_wait_for_all_stacks();
}
1048+
10061049static void gc_pool_sync_nfree (jl_gc_pagemeta_t * pg , jl_taggedvalue_t * last ) JL_NOTSAFEPOINT
10071050{
10081051 assert (pg -> fl_begin_offset != UINT16_MAX );
@@ -1078,7 +1121,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc
10781121}
10791122
10801123// wake up all threads to sweep the pages
1081- void gc_sweep_wake_all (jl_ptls_t ptls , jl_gc_padded_page_stack_t * new_gc_allocd_scratch )
1124+ void gc_sweep_wake_all_pages (jl_ptls_t ptls , jl_gc_padded_page_stack_t * new_gc_allocd_scratch )
10821125{
10831126 int parallel_sweep_worthwhile = gc_sweep_prescan (ptls , new_gc_allocd_scratch );
10841127 if (parallel_sweep_worthwhile && !page_profile_enabled ) {
@@ -1114,18 +1157,18 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_
11141157}
11151158
11161159// wait for all threads to finish sweeping
1117- void gc_sweep_wait_for_all (void )
1160+ void gc_sweep_wait_for_all_pages (void )
11181161{
11191162 jl_atomic_store (& gc_allocd_scratch , NULL );
1120- while (jl_atomic_load_acquire (& gc_n_threads_sweeping ) != 0 ) {
1163+ while (jl_atomic_load_acquire (& gc_n_threads_sweeping_pools ) != 0 ) {
11211164 jl_cpu_pause ();
11221165 }
11231166}
11241167
11251168// sweep all pools
11261169void gc_sweep_pool_parallel (jl_ptls_t ptls )
11271170{
1128- jl_atomic_fetch_add (& gc_n_threads_sweeping , 1 );
1171+ jl_atomic_fetch_add (& gc_n_threads_sweeping_pools , 1 );
11291172 jl_gc_padded_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
11301173 if (allocd_scratch != NULL ) {
11311174 gc_page_profiler_serializer_t serializer = gc_page_serializer_create ();
@@ -1170,7 +1213,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls)
11701213 }
11711214 gc_page_serializer_destroy (& serializer );
11721215 }
1173- jl_atomic_fetch_add (& gc_n_threads_sweeping , -1 );
1216+ jl_atomic_fetch_add (& gc_n_threads_sweeping_pools , -1 );
11741217}
11751218
11761219// free all pages (i.e. through `madvise` on Linux) that were lazily freed
@@ -1260,9 +1303,9 @@ static void gc_sweep_pool(void)
12601303 // the actual sweeping
12611304 jl_gc_padded_page_stack_t * new_gc_allocd_scratch = (jl_gc_padded_page_stack_t * ) calloc_s (n_threads * sizeof (jl_gc_padded_page_stack_t ));
12621305 jl_ptls_t ptls = jl_current_task -> ptls ;
1263- gc_sweep_wake_all (ptls , new_gc_allocd_scratch );
1306+ gc_sweep_wake_all_pages (ptls , new_gc_allocd_scratch );
12641307 gc_sweep_pool_parallel (ptls );
1265- gc_sweep_wait_for_all ();
1308+ gc_sweep_wait_for_all_pages ();
12661309
12671310 // reset half-pages pointers
12681311 for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
@@ -3073,6 +3116,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
30733116#endif
30743117 current_sweep_full = sweep_full ;
30753118 sweep_weak_refs ();
3119+ uint64_t stack_pool_time = jl_hrtime ();
3120+ sweep_stack_pools (ptls );
3121+ stack_pool_time = jl_hrtime () - stack_pool_time ;
3122+ gc_num .total_stack_pool_sweep_time += stack_pool_time ;
3123+ gc_num .stack_pool_sweep_time = stack_pool_time ;
30763124 gc_sweep_other (ptls , sweep_full );
30773125 gc_scrub ();
30783126 gc_verify_tags ();
@@ -3491,6 +3539,10 @@ STATIC_INLINE int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT
34913539 return (jl_atomic_load (& ptls -> gc_tls .gc_sweeps_requested ) > 0 );
34923540}
34933541
3542+ STATIC_INLINE int may_sweep_stack (jl_ptls_t ptls ) JL_NOTSAFEPOINT
3543+ {
3544+ return (jl_atomic_load (& ptls -> gc_tls .gc_stack_sweep_requested ) > 0 );
3545+ }
34943546// parallel gc thread function
34953547void jl_parallel_gc_threadfun (void * arg )
34963548{
@@ -3513,12 +3565,17 @@ void jl_parallel_gc_threadfun(void *arg)
35133565
35143566 while (1 ) {
35153567 uv_mutex_lock (& gc_threads_lock );
3516- while (!may_mark () && !may_sweep (ptls )) {
3568+ while (!may_mark () && !may_sweep (ptls ) && ! may_sweep_stack ( ptls ) ) {
35173569 uv_cond_wait (& gc_threads_cond , & gc_threads_lock );
35183570 }
35193571 uv_mutex_unlock (& gc_threads_lock );
35203572 assert (jl_atomic_load_relaxed (& ptls -> gc_state ) == JL_GC_PARALLEL_COLLECTOR_THREAD );
35213573 gc_mark_loop_parallel (ptls , 0 );
3574+ if (may_sweep_stack (ptls )) {
3575+ assert (jl_atomic_load_relaxed (& ptls -> gc_state ) == JL_GC_PARALLEL_COLLECTOR_THREAD );
3576+ sweep_stack_pool_loop ();
3577+ jl_atomic_fetch_add (& ptls -> gc_tls .gc_stack_sweep_requested , -1 );
3578+ }
35223579 if (may_sweep (ptls )) {
35233580 assert (jl_atomic_load_relaxed (& ptls -> gc_state ) == JL_GC_PARALLEL_COLLECTOR_THREAD );
35243581 gc_sweep_pool_parallel (ptls );
0 commit comments