From 13f63415c92d0686feaa7dea59d3972df8149708 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 18 Oct 2019 15:28:55 +0200 Subject: [PATCH] Instrument GC with memory profiler implementation This adds C support for a memory profiler within the GC, tracking locations of allocations, deallocations, etc. It operates in a similar manner to the time profiler, with single large buffers set up beforehand through an initialization function, reducing the need for expensive allocations while the program being measured is running. The memory profiler instruments the GC in all locations where the GC statistics themselves are modified (e.g. `gc_num.allocd` and `gc_num.freed`) by introducing new helper functions `jl_gc_count_{allocd,freed,reallocd}()`. Those utility functions call the `jl_memprofile_track_{de,}alloc()` functions to register an address, a size and a tag with the memory profiler. We also track type information, as this can be critically helpful when debugging; to do so without breaking API guarantees, we insert calls that set the type of a chunk of memory after allocating it where necessary. The tagging system allows the memory profiler to disambiguate, at profile time, between e.g. pooled allocations and the "big" allocator. It also allows the memory profiler to support tracking multiple "memory domains", e.g. a GPU support package could manually call `jl_memprofile_track_alloc()` any time a chunk of memory is allocated on the GPU so as to use the same system. By default, all values are tracked; however, one can set a `memprof_tag_filter` value to track only the values you are most interested in. (E.g.
only CPU domain big allocations) --- base/error.jl | 24 ++++- src/array.c | 12 ++- src/gc.c | 173 ++++++++++++++++++++++++------- src/julia_internal.h | 37 ++++++- src/llvm-final-gc-lowering.cpp | 18 +++- src/llvm-late-gc-lowering.cpp | 3 + src/llvm-pass-helpers.cpp | 30 ++++++ src/llvm-pass-helpers.h | 6 ++ src/profile.c | 129 ++++++++++++++++++++++- src/support/timefuncs.c | 4 +- src/support/timefuncs.h | 4 +- test/llvmpasses/late-lower-gc.ll | 2 + 12 files changed, 391 insertions(+), 51 deletions(-) diff --git a/base/error.jl b/base/error.jl index 3aaddd860dd9ae..8604c100f86aba 100644 --- a/base/error.jl +++ b/base/error.jl @@ -65,8 +65,16 @@ struct InterpreterIP mod::Union{Module,Nothing} end -# formatted backtrace buffers can contain all types of objects (none for now though) -const BackTraceEntry = Union{Ptr{Nothing}, InterpreterIP} +struct AllocationInfo + T::Union{Nothing,Type} + address::Ptr{Cvoid} + time::Float64 + allocsz::Csize_t + tag::UInt16 +end + +# formatted backtrace buffers can contain all types of objects +const BackTraceEntry = Union{Ptr{Nothing}, InterpreterIP, AllocationInfo} # but only some correspond with actual instruction pointers const InstructionPointer = Union{Ptr{Nothing}, InterpreterIP} @@ -114,6 +122,18 @@ function _reformat_bt(bt, Wanted::Type=BackTraceEntry) end push!(ret, InterpreterIP(code, header, mod)) end + elseif tag == 2 # JL_BT_ALLOCINFO_FRAME_TAG + if AllocationInfo <: Wanted + #@assert header == 0 + #@assert njlvalues == 1 + type = unsafe_pointer_to_objref(convert(Ptr{Any}, bt[i+2])) + #@assert nuintvals == 4 + address = reinterpret(Ptr{Cvoid}, bt[i+3]) + time = reinterpret(Cdouble, bt[i+4]) + allocsz = reinterpret(UInt, bt[i+5]) + tag = reinterpret(UInt, bt[i+6]) + push!(ret, AllocationInfo(type, address, time, allocsz, tag)) + end else # Tags we don't know about are an error throw(ArgumentError("Unexpected extended backtrace entry tag $tag at bt[$i]")) diff --git a/src/array.c b/src/array.c index 
a0627ff24ab8e5..0eb77e7e5a38d9 100644 --- a/src/array.c +++ b/src/array.c @@ -98,6 +98,7 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, tsz += tot; tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align whole object a = (jl_array_t*)jl_gc_alloc(ptls, tsz, atype); + jl_memprofile_set_typeof(a, atype); // No allocation or safepoint allowed after this a->flags.how = 0; data = (char*)a + doffs; @@ -107,12 +108,14 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, else { tsz = JL_ARRAY_ALIGN(tsz, JL_CACHE_BYTE_ALIGNMENT); // align whole object data = jl_gc_managed_malloc(tot); + jl_memprofile_set_typeof(data, atype); // Allocate the Array **after** allocating the data // to make sure the array is still young a = (jl_array_t*)jl_gc_alloc(ptls, tsz, atype); // No allocation or safepoint allowed after this a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); + jl_memprofile_set_typeof(a, atype); if (!isunboxed || isunion) // need to zero out isbits union array selector bytes to ensure a valid type index memset(data, 0, tot); @@ -334,7 +337,9 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data, if (own_buffer) { a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); - jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); + jl_gc_count_allocd(a, nel*elsz + (elsz == 1 ? 1 : 0), + JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + jl_memprofile_set_typeof(a, atype); } else { a->flags.how = 0; @@ -401,7 +406,9 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data, if (own_buffer) { a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); - jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); + jl_gc_count_allocd(a, nel*elsz + (elsz == 1 ? 
1 : 0), + JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + jl_memprofile_set_typeof(a, atype); } else { a->flags.how = 0; @@ -669,6 +676,7 @@ static int NOINLINE array_resize_buffer(jl_array_t *a, size_t newlen) a->flags.how = 1; jl_gc_wb_buf(a, a->data, nbytes); } + jl_memprofile_set_typeof(a->data, jl_typeof(a)); } if (JL_ARRAY_IMPL_NUL && elsz == 1 && !isbitsunion) memset((char*)a->data + oldnbytes - 1, 0, nbytes - oldnbytes + 1); diff --git a/src/gc.c b/src/gc.c index 5c5e3bd8d8de8d..ae570b48bf425d 100644 --- a/src/gc.c +++ b/src/gc.c @@ -847,7 +847,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - ptls->gc_num.allocd += allocsz; + jl_gc_count_allocd(jl_valueof(&v->header), allocsz, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_BIGALLOC); ptls->gc_num.bigalloc++; #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -887,7 +888,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT *pv = nxt; if (nxt) nxt->prev = pv; - gc_num.freed += v->sz&~3; + jl_gc_count_freed(jl_valueof(&v->header), v->sz&~3, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_BIGALLOC); #ifdef MEMDEBUG memset(v, 0xbb, v->sz&~3); #endif @@ -938,10 +940,44 @@ void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT ptls->heap.mallocarrays = ma; } -void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT +void jl_gc_count_allocd(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_get_ptls_states(); ptls->gc_num.allocd += sz; + + if (__unlikely(jl_memprofile_is_running())) { + jl_memprofile_track_alloc(addr, tag, sz); + } +} + +void jl_gc_count_freed(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_get_ptls_states(); + ptls->gc_num.freed += sz; + + if (__unlikely(jl_memprofile_is_running())) { + jl_memprofile_track_dealloc(addr, 
tag); + } +} + +void jl_gc_count_reallocd(void * oldaddr, size_t oldsz, void * newaddr, size_t newsz, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_get_ptls_states(); + if (oldsz < newsz) { + ptls->gc_num.allocd += newsz - oldsz; + } else { + ptls->gc_num.freed += oldsz - newsz; + } + + // Our memprofile does not yet have a way to represent "realloc", so we just + // represent this as a free immediately followed by a malloc. This makes the + // absolute value of the memory deltas look larger than the Julia GC's statistics + // would have you believe, as the Julia GC shows only the difference between + // the two values when realloc'ing. + if (__unlikely(jl_memprofile_is_running())) { + jl_memprofile_track_dealloc(oldaddr, tag); + jl_memprofile_track_alloc(newaddr, tag, newsz); + } } static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT @@ -1002,7 +1038,7 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); - gc_num.freed += array_nbytes(a); + jl_gc_count_freed(d, array_nbytes(a), JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); } } @@ -1094,7 +1130,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, return jl_gc_big_alloc(ptls, osize); #endif maybe_collect(ptls); - ptls->gc_num.allocd += osize; ptls->gc_num.poolalloc++; // first try to use the freelist jl_taggedvalue_t *v = p->freelist; @@ -1109,6 +1144,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, pg->nfree = 0; pg->has_young = 1; } + jl_gc_count_allocd(jl_valueof(v), osize, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_POOLALLOC); return jl_valueof(v); } // if the freelist is empty we reuse empty but not freed pages @@ -1133,6 +1169,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, next = (jl_taggedvalue_t*)((char*)v + osize); } p->newpages = next; + jl_gc_count_allocd(jl_valueof(v), osize, JL_MEMPROF_TAG_DOMAIN_CPU | 
JL_MEMPROF_TAG_ALLOC_POOLALLOC); return jl_valueof(v); } @@ -1157,8 +1194,6 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t uint8_t *ages = pg->ages; jl_taggedvalue_t *v = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; - size_t old_nfree = pg->nfree; - size_t nfree; int freedall = 1; int pg_skpd = 1; @@ -1177,7 +1212,6 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t else { jl_gc_free_page(data); } - nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; goto done; } // For quick sweep, we might be able to skip the page if the page doesn't @@ -1192,7 +1226,6 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t pfl = (jl_taggedvalue_t**)page_pfl_end(pg); } freedall = 0; - nfree = pg->nfree; goto done; } } @@ -1258,11 +1291,9 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t pg->prev_nold = prev_nold; } } - nfree = pg->nfree; done: gc_time_count_page(freedall, pg_skpd); - gc_num.freed += (nfree - old_nfree) * osize; return pfl; } @@ -2567,6 +2598,8 @@ JL_DLLEXPORT int jl_gc_enable(int on) if (on && !prev) { // disable -> enable if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + // This to restore the value of `allocd` that was clobbered in `jl_gc_collect()` + // when `jl_gc_disable_counter` is nonzero. 
gc_num.allocd += gc_num.deferred_alloc; gc_num.deferred_alloc = 0; } @@ -2659,10 +2692,8 @@ static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp ptls2->heap.rem_bindings.len = n_bnd_refyoung; } -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_bt_element_t* bt_data, size_t bt_size) { - jl_bt_element_t *bt_data = ptls2->bt_data; - size_t bt_size = ptls2->bt_size; for (size_t i = 0; i < bt_size; i += jl_bt_entry_size(bt_data + i)) { jl_bt_element_t *bt_entry = bt_data + i; if (jl_bt_is_native(bt_entry)) @@ -2673,6 +2704,61 @@ static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp } } +static void gc_track_pool_frees(void) +{ + // TODO: this is too costly, inline in sweep_page again + + // Iterate over the three levels of our pagetable. We collapse indentation here + // to make it more readable, especially as we do essentially the same thing + // three times with just slightly changed variable names: + for (int pg2_i = 0; pg2_i < (REGION2_PG_COUNT + 31) / 32; pg2_i++) { + uint32_t line2 = memory_map.allocmap1[pg2_i]; + if (line2) { + for (int j = 0; j < 32; j++) { + if ((line2 >> j) & 1) { + pagetable1_t * pagetable1 = memory_map.meta1[pg2_i * 32 + j]; + + for (int pg1_i = 0; pg1_i < REGION1_PG_COUNT / 32; pg1_i++) { + uint32_t line1 = pagetable1->allocmap0[pg1_i]; + if (line1) { + for (int k = 0; k < 32; k++) { + if ((line1 >> k) & 1) { + pagetable0_t * pagetable0 = pagetable1->meta0[pg1_i * 32 + k]; + + for (int pg0_i = 0; pg0_i < REGION0_PG_COUNT / 32; pg0_i++) { + uint32_t line0 = pagetable0->allocmap[pg0_i]; + if (line0) { + for (int l = 0; l < 32; l++) { + if ((line0 >> l) & 1) { + jl_gc_pagemeta_t * pg = pagetable0->meta[pg0_i * 32 + l]; + + // Once we have an actual page, iterate over the cells: + jl_taggedvalue_t *v = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); + char 
*lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - pg->osize; + + while ((char *)v <= lim) { + // If this object is live but unmarked, then it's about to be freed, + // so track that via jl_gc_count_freed(). + if (v->bits.gc == GC_CLEAN) { + jl_value_t * ptr = jl_gc_internal_obj_base_ptr(v); + if (ptr != NULL) { + jl_gc_count_freed(ptr, pg->osize, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_POOLALLOC); + } + } + + // Move to next cell in the page + v = (jl_taggedvalue_t*)((char*)v + pg->osize); + } + + // Region 0 + }}}} + // Region 1 + }}}} + // Region 2 + }}}} +} + size_t jl_maxrss(void); // Only one thread should be running in this function @@ -2697,8 +2783,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) jl_gc_queue_remset(gc_cache, &sp, ptls2); // 2.2. mark every thread local root jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. mark any managed objects in the backtrace buffer - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + // 2.3. mark any managed objects in the backtrace buffers, + // so that things like Interpreter frame objects do not disappear. + jl_gc_queue_bt_buf(gc_cache, &sp, ptls2->bt_data, ptls2->bt_size); + jl_gc_queue_bt_buf(gc_cache, &sp, (jl_bt_element_t *)jl_profile_get_data(), jl_profile_len_data()); } // 3. 
walk roots
@@ -2822,6 +2910,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
     gc_sweep_other(ptls, sweep_full);
     gc_scrub();
     gc_verify_tags();
+    gc_track_pool_frees();
     gc_sweep_pool(sweep_full);
     if (sweep_full)
         gc_sweep_perm_alloc();
@@ -3027,23 +3116,27 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void)
 JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
-    if (ptls && ptls->world_age) {
+    if (ptls && ptls->world_age)
         maybe_collect(ptls);
-        ptls->gc_num.allocd += sz;
+    void *b = malloc(sz);
+    if (ptls && ptls->world_age) {
+        jl_gc_count_allocd(b, sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC);
         ptls->gc_num.malloc++;
     }
-    return malloc(sz);
+    return b;
 }
 
 JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
-    if (ptls && ptls->world_age) {
+    if (ptls && ptls->world_age)
         maybe_collect(ptls);
-        ptls->gc_num.allocd += nm*sz;
+    void *b = calloc(nm, sz);
+    if (ptls && ptls->world_age) {
+        jl_gc_count_allocd(b, nm*sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC);
         ptls->gc_num.malloc++;
     }
-    return calloc(nm, sz);
+    return b;
 }
 
 JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
@@ -3051,7 +3144,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
     jl_ptls_t ptls = jl_get_ptls_states();
     free(p);
     if (ptls && ptls->world_age) {
-        ptls->gc_num.freed += sz;
+        jl_gc_count_freed(p, sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC);
         ptls->gc_num.freecall++;
     }
 }
@@ -3059,15 +3152,16 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
 JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
-    if (ptls && ptls->world_age) {
+    if (ptls && ptls->world_age)
         maybe_collect(ptls);
-        if (sz < old)
-            ptls->gc_num.freed += (old - sz);
-        else
-            ptls->gc_num.allocd += (sz - old);
+    void *b = realloc(p, sz);
+    if (ptls && ptls->world_age) {
+        // Use the same allocator-kind tag as jl_gc_counted_{malloc,calloc,free_with_size}():
+        // the profiler keeps an event only if all of its tag bits pass the filter.
+        jl_gc_count_reallocd(p, old, b, sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC);
         ptls->gc_num.realloc++;
     }
-    return realloc(p, sz);
+    return b;
 }
 
 // allocation wrappers that save the size of allocations, to allow using
@@ -3129,11 +3223,11 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
-    ptls->gc_num.allocd += allocsz;
-    ptls->gc_num.malloc++;
     void *b = malloc_cache_align(allocsz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
+    jl_gc_count_allocd(b, allocsz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC);
+    ptls->gc_num.malloc++;
     return b;
 }
 
@@ -3147,15 +3241,8 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
 
-    if (jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) {
-        ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz;
-        live_bytes += allocsz - oldsz;
-    }
-    else if (allocsz < oldsz)
-        ptls->gc_num.freed += (oldsz - allocsz);
-    else
-        ptls->gc_num.allocd += (allocsz - oldsz);
-    ptls->gc_num.realloc++;
+    // realloc can free `d`, so access `owner` first.
+    int marked = jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED;
 
     void *b;
     if (isaligned)
@@ -3165,6 +3252,16 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
     if (b == NULL)
         jl_throw(jl_memory_exception);
 
+    // Old-marked owners are accounted as live bytes; everything else goes through the
+    // counters, tagged like jl_gc_managed_malloc() so that tag filters stay consistent.
+    if (marked) {
+        ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz;
+        live_bytes += allocsz - oldsz;
+    }
+    else
+        jl_gc_count_reallocd(d, oldsz, b, allocsz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC);
+    ptls->gc_num.realloc++;
+
     return b;
 }
 
diff --git a/src/julia_internal.h b/src/julia_internal.h
index ecd5aad0871358..14b41af9af8471 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -227,6 +227,37 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz)
     return klass + N;
 }
 
+// Flags that determine when a certain buffer has overrun itself
+#define JL_MEMPROF_BT_OVERFLOW            0x01
+#define JL_MEMPROF_ALLOC_OVERFLOW         0x02
+
+// Tags applied to memory allocations to specify which domain the memory is
+// stored on, and also which "kind" of memory allocator was used.
+// When filtering, a filter tag value of `0xffff` means "accept everything".
+// We support the "CPU", "GPU" and "External" (e.g. "other") domains.
+#define JL_MEMPROF_TAG_DOMAIN_CPU         0x0001
+#define JL_MEMPROF_TAG_DOMAIN_GPU         0x0002
+#define JL_MEMPROF_TAG_DOMAIN_EXTERNAL    0x0080
+// We differentiate between just normal "standard" allocation by malloc, versus
+// the "pool" allocator, and finally "bigalloc" for special big things as
+// that's often what we're most interested in, which are the pieces of memory
+// allocated by `jl_gc_big_alloc()`.
+#define JL_MEMPROF_TAG_ALLOC_STDALLOC 0x0100 +#define JL_MEMPROF_TAG_ALLOC_POOLALLOC 0x0200 +#define JL_MEMPROF_TAG_ALLOC_BIGALLOC 0x0400 +// We denote a free() by setting yet another tag +#define JL_MEMPROF_TAG_DEALLOC 0x8000 + +// Necessary memory profiler prototypes +JL_DLLEXPORT void jl_memprofile_track_alloc(void *v, uint16_t tag, size_t allocsz) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_memprofile_track_dealloc(void *v, uint16_t tag) JL_NOTSAFEPOINT; +JL_DLLEXPORT int jl_memprofile_is_running(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_memprofile_set_typeof(void * v, void * ty) JL_NOTSAFEPOINT; + +// Necessary time profiler prototypes +JL_DLLEXPORT uint8_t *jl_profile_get_data(void); +JL_DLLEXPORT size_t jl_profile_len_data(void); + #define JL_SMALL_BYTE_ALIGNMENT 16 #define JL_CACHE_BYTE_ALIGNMENT 64 // JL_HEAP_ALIGNMENT is the maximum alignment that the GC can provide @@ -249,6 +280,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) v = jl_gc_big_alloc(ptls, allocsz); } jl_set_typeof(v, ty); + jl_memprofile_set_typeof(v, ty); return v; } JL_DLLEXPORT jl_value_t *jl_gc_alloc(jl_ptls_t ptls, size_t sz, void *ty); @@ -341,7 +373,9 @@ JL_DLLEXPORT jl_value_t *jl_apply_2va(jl_value_t *f, jl_value_t **args, uint32_t void jl_gc_sync_total_bytes(void); void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT; -void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT; +void jl_gc_count_allocd(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT; +void jl_gc_count_freed(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT; +void jl_gc_count_reallocd(void * oldaddr, size_t oldsz, void * newaddr, size_t newsz, uint16_t tag) JL_NOTSAFEPOINT; void jl_gc_run_all_finalizers(jl_ptls_t ptls); void gc_queue_binding(jl_binding_t *bnd) JL_NOTSAFEPOINT; @@ -684,6 +718,7 @@ STATIC_INLINE jl_value_t *jl_bt_entry_jlvalue(jl_bt_element_t *bt_entry, size_t } #define JL_BT_INTERP_FRAME_TAG 1 // An interpreter frame +#define 
JL_BT_ALLOCINFO_FRAME_TAG 2 // An allocation information frame // Number of bt elements in frame. STATIC_INLINE size_t jl_bt_entry_size(jl_bt_element_t *bt_entry) JL_NOTSAFEPOINT diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index a68b2fc193ed83..7a2d418e322eba 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -39,6 +39,7 @@ struct FinalLowerGC: public FunctionPass, private JuliaPassContext { Function *queueRootFunc; Function *poolAllocFunc; Function *bigAllocFunc; + Function *memprofileSetTypeofFunc; CallInst *ptlsStates; bool doInitialization(Module &M) override; @@ -60,6 +61,9 @@ struct FinalLowerGC: public FunctionPass, private JuliaPassContext { // Lowers a `julia.gc_alloc_bytes` intrinsic. Value *lowerGCAllocBytes(CallInst *target, Function &F); + // Lowers a `julia.gc_set_typeof` intrinsic. + Value *lowerGCSetTypeof(CallInst *target, Function &F); + // Lowers a `julia.queue_gc_root` intrinsic. Value *lowerQueueGCRoot(CallInst *target, Function &F); @@ -212,6 +216,13 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) return newI; } +Value *FinalLowerGC::lowerGCSetTypeof(CallInst *target, Function &F) +{ + assert(target->getNumArgOperands() == 2); + target->setCalledFunction(memprofileSetTypeofFunc); + return target; +} + bool FinalLowerGC::doInitialization(Module &M) { // Initialize platform-agnostic references. 
initAll(M); @@ -220,8 +231,9 @@ bool FinalLowerGC::doInitialization(Module &M) { queueRootFunc = getOrDeclare(jl_well_known::GCQueueRoot); poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc); bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc); + memprofileSetTypeofFunc = getOrDeclare(jl_well_known::MemProfileSetTypeof); - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, memprofileSetTypeofFunc}; unsigned j = 0; for (unsigned i = 0; i < sizeof(functionList) / sizeof(void*); i++) { if (!functionList[i]) @@ -301,6 +313,7 @@ bool FinalLowerGC::runOnFunction(Function &F) auto popGCFrameFunc = getOrNull(jl_intrinsics::popGCFrame); auto getGCFrameSlotFunc = getOrNull(jl_intrinsics::getGCFrameSlot); auto GCAllocBytesFunc = getOrNull(jl_intrinsics::GCAllocBytes); + auto GCSetTypeofFunc = getOrNull(jl_intrinsics::GCSetTypeof); auto queueGCRootFunc = getOrNull(jl_intrinsics::queueGCRoot); // Lower all calls to supported intrinsics. 
@@ -331,6 +344,9 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == GCAllocBytesFunc) { replaceInstruction(CI, lowerGCAllocBytes(CI, F), it); } + else if (callee == GCSetTypeofFunc) { + replaceInstruction(CI, lowerGCSetTypeof(CI, F), it); + } else if (callee == queueGCRootFunc) { replaceInstruction(CI, lowerQueueGCRoot(CI, F), it); } diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 09862e12d52e19..2d534fe2a524a1 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -1750,6 +1750,9 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) { EmitTagPtr(builder, T_prjlvalue, newI)); store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + auto setTypeofIntrinsic = getOrDeclare(jl_intrinsics::GCSetTypeof); + builder.CreateCall(setTypeofIntrinsic, {newI, CI->getArgOperand(2)}); + // Replace uses of the call to `julia.gc_alloc_obj` with the call to // `julia.gc_alloc_bytes`. CI->replaceAllUsesWith(newI); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 7efb5394c968a4..710c7ca5b17ea6 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -129,6 +129,7 @@ llvm::Function *JuliaPassContext::getOrDeclare( namespace jl_intrinsics { static const char *GET_GC_FRAME_SLOT_NAME = "julia.get_gc_frame_slot"; static const char *GC_ALLOC_BYTES_NAME = "julia.gc_alloc_bytes"; + static const char *GC_SET_TYPEOF_NAME = "julia.gc_set_typeof"; static const char *NEW_GC_FRAME_NAME = "julia.new_gc_frame"; static const char *PUSH_GC_FRAME_NAME = "julia.push_gc_frame"; static const char *POP_GC_FRAME_NAME = "julia.pop_gc_frame"; @@ -171,6 +172,20 @@ namespace jl_intrinsics { return addGCAllocAttributes(intrinsic, context.getLLVMContext()); }); + const IntrinsicDescription GCSetTypeof( + GC_SET_TYPEOF_NAME, + [](const JuliaPassContext &context) { + auto intrinsic = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, 
context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_SET_TYPEOF_NAME); + intrinsic->addAttribute(AttributeList::FunctionIndex, Attribute::InaccessibleMemOnly); + return intrinsic; + }); + const IntrinsicDescription newGCFrame( NEW_GC_FRAME_NAME, [](const JuliaPassContext &context) { @@ -227,6 +242,7 @@ namespace jl_well_known { static const char *GC_BIG_ALLOC_NAME = "jl_gc_big_alloc"; static const char *GC_POOL_ALLOC_NAME = "jl_gc_pool_alloc"; static const char *GC_QUEUE_ROOT_NAME = "jl_gc_queue_root"; + static const char *MEMPROFILE_SET_TYPEOF_NAME = "jl_memprofile_set_typeof"; using jl_intrinsics::addGCAllocAttributes; @@ -271,4 +287,18 @@ namespace jl_well_known { func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return func; }); + + const WellKnownFunctionDescription MemProfileSetTypeof( + MEMPROFILE_SET_TYPEOF_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + MEMPROFILE_SET_TYPEOF_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 71cab27e76ceba..271fbc2b82b30b 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -121,6 +121,9 @@ namespace jl_intrinsics { // passed as an argument. extern const IntrinsicDescription GCAllocBytes; + // `julia.gc_set_typeof`: an intrinsic that tags an allocation with a type. + extern const IntrinsicDescription GCSetTypeof; + // `julia.new_gc_frame`: an intrinsic that creates a new GC frame. extern const IntrinsicDescription newGCFrame; @@ -152,6 +155,9 @@ namespace jl_well_known { // `jl_gc_queue_root`: queues a GC root. extern const WellKnownFunctionDescription GCQueueRoot; + + // `jl_memprofile_set_typeof`: informs the memory profiler about a type. 
+    extern const WellKnownFunctionDescription MemProfileSetTypeof;
 }
 
 #endif
diff --git a/src/profile.c b/src/profile.c
index 4bce8c619f4e74..854d25ccbabfc7 100644
--- a/src/profile.c
+++ b/src/profile.c
@@ -19,8 +19,11 @@ volatile size_t bt_size_cur = 0;
 volatile uint8_t bt_overflow = 0;
 /// only for sampling profiler
 static volatile uint64_t profile_delay_nsec = 0;
+/// only for memory profiler
+static volatile uint16_t memprof_tag_filter = 0xffff;
 
 volatile int profile_running = 0;
+volatile int memprof_running = 0;
 
 JL_DLLEXPORT void jl_profile_clear_data(void)
 {
@@ -28,10 +31,11 @@ JL_DLLEXPORT void jl_profile_clear_data(void)
     bt_overflow = 0;
 }
 
-JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec)
+JL_DLLEXPORT int jl_profile_init(size_t maxsize, uint64_t delay_nsec, uint16_t tag_filter)
 {
     bt_size_max = maxsize;
     profile_delay_nsec = delay_nsec;
+    memprof_tag_filter = tag_filter | JL_MEMPROF_TAG_DEALLOC; // always track deallocs
 
     // Free previous profile buffers, if we have any
     if (bt_data_prof != NULL)
@@ -82,3 +86,157 @@ JL_DLLEXPORT int jl_profile_is_running(void)
 }
 
 // jl_profile_start_timer and jl_profile_stop_timer defined in signal-handling.c
+
+
+//
+// Memory profiler
+//
+
+// Returns the tag filter currently applied to memory profiler events.
+JL_DLLEXPORT uint16_t jl_memprofile_tag_filter(void)
+{
+    return memprof_tag_filter;
+}
+
+// Returns the `memprof_running` flag (1 after start, 0 after stop).
+JL_DLLEXPORT int jl_memprofile_is_running(void)
+{
+    return memprof_running;
+}
+
+// Starts recording memory profiler events.
+JL_DLLEXPORT void jl_memprofile_start(void)
+{
+    memprof_running = 1;
+}
+
+// Stops recording memory profiler events.
+JL_DLLEXPORT void jl_memprofile_stop(void) JL_NOTSAFEPOINT
+{
+    memprof_running = 0;
+}
+
+// Helper function that makes it easy to take a chunk of plain-old-data that was
+// allocated for an Array and find the "holding" jl_array_t object.
+// Returns NULL when no thread's malloc'ed-array list has an array whose data is `adata`.
+JL_DLLEXPORT jl_array_t * jl_memprofile_find_malloc_array(void * adata)
+{
+    // We walk every thread, so we need to disable the GC while we do this.
+    int prev_value = jl_gc_enable(0);
+
+    // For each thread
+    for (int t_i = 0; t_i < jl_n_threads; t_i++) {
+        // Get its thread-local storage
+        jl_ptls_t ptls2 = jl_all_tls_states[t_i];
+
+        // Take a look at the malloc'ed arrays for this thread
+        mallocarray_t *ma = ptls2->heap.mallocarrays;
+
+        // Zoom through seeing if the given pointer matches this array's data pointer
+        while (ma != NULL) {
+            if (ma->a->data == adata) {
+                // If it matches, re-enable the GC and return that value
+                jl_gc_enable(prev_value);
+                return ma->a;
+            }
+            ma = ma->next;
+        }
+    }
+
+    // We were unable to find it. :(
+    jl_gc_enable(prev_value);
+    return NULL;
+}
+
+// Records one allocation event: a backtrace followed by an allocation info frame that
+// holds the address `v`, a timestamp, `allocsz` and `tag`, then a NULL separator.
+// Events whose tag bits are not all present in `memprof_tag_filter` are dropped. When
+// the buffer cannot hold the event, the overflow flag is set and the profiler stops.
+JL_DLLEXPORT void jl_memprofile_track_alloc(void *v, uint16_t tag, size_t allocsz) JL_NOTSAFEPOINT
+{
+    // Filter out this call with our tag filter
+    if ((tag & memprof_tag_filter) != tag)
+        return;
+
+    // Number of elements in the allocation info frame that follows the backtrace.
+    size_t alloc_size_step = 7;
+
+    // Refuse to record if the buffer was never allocated or cannot even hold the
+    // allocation info frame plus the NULL separator. This also keeps the unsigned
+    // `bt_size_max - bt_size_cur - 1` below from wrapping around.
+    if (bt_data_prof == NULL || bt_size_cur + alloc_size_step + 1 >= bt_size_max) {
+        bt_overflow |= JL_MEMPROF_BT_OVERFLOW;
+        jl_memprofile_stop();
+        return;
+    }
+
+    // Store the current backtrace location into our buffer, and increment the
+    // buffer index by the number of elements added.
+    size_t bt_size_step = 0;
+    int incomplete = rec_backtrace((jl_bt_element_t*) bt_data_prof + bt_size_cur,
+                                   &bt_size_step, bt_size_max - bt_size_cur - 1,
+                                   0, 1);
+
+    // If we overran this buffer or don't have the place to store the allocation info
+    // frame and the NULL separator (the `+ 1`), don't record the memory trace and quit.
+    if (incomplete || bt_size_cur + bt_size_step +
+                      alloc_size_step + 1 >= bt_size_max) {
+        bt_overflow |= JL_MEMPROF_BT_OVERFLOW;
+        jl_memprofile_stop();
+        return;
+    }
+
+    // Store an allocation information frame into the buffer
+    uintptr_t entry_tags = jl_bt_entry_descriptor(1, 4, JL_BT_ALLOCINFO_FRAME_TAG, 0);
+    jl_bt_element_t *bt_entry = (jl_bt_element_t*) bt_data_prof + bt_size_cur + bt_size_step;
+    bt_entry[0].uintptr = JL_BT_NON_PTR_ENTRY;
+    bt_entry[1].uintptr = entry_tags;
+    // The location of the type information for this chunk of memory (tracked value).
+    // Initially set to nothing, populated later by `jl_memprofile_set_typeof()`
+    bt_entry[2].jlvalue = (jl_value_t*)jl_nothing;
+    // The location of the data in memory, used to match allocations with deallocations.
+    bt_entry[3].uintptr = (uintptr_t) v;
+    // The time at which this happened, stored as the raw bit pattern of the double:
+    // `_reformat_bt` in base/error.jl decodes this slot with `reinterpret(Cdouble, ...)`,
+    // so a value conversion to an integer would be read back as garbage.
+    // FIXME: a 32-bit uintptr still cannot hold the full 64-bit pattern
+    union { double value; uint64_t bits; } now;
+    now.value = jl_clock_now();
+    bt_entry[4].uintptr = (uintptr_t) now.bits;
+    // The size of the allocation, or 0 if this was a free.
+    bt_entry[5].uintptr = allocsz;
+    // Used to "tag" this allocation within a particular domain (CPU, GPU, other)
+    // or within a particular allocator (Pool, std, malloc), or as a free instead.
+    bt_entry[6].uintptr = tag;
+
+    // Add a NULL-separator
+    bt_size_cur += bt_size_step + alloc_size_step;
+    bt_data_prof[bt_size_cur++].uintptr = 0;
+    assert(bt_size_cur < bt_size_max);
+}
+
+// Attaches type `ty` to the most recently recorded allocation info frame, provided that
+// frame describes address `v`. Does nothing when the profiler is not running.
+JL_DLLEXPORT void jl_memprofile_set_typeof(void * v, void * ty) JL_NOTSAFEPOINT
+{
+    if (__unlikely(jl_memprofile_is_running())) {
+        // The tail of the buffer must hold one full allocation info frame plus the
+        // end-of-block NULL (8 elements). Check this at runtime instead of asserting,
+        // so that release builds never index before the start of the buffer.
+        if (bt_size_cur >= 8 &&
+            bt_data_prof[bt_size_cur-8].uintptr == JL_BT_NON_PTR_ENTRY) {
+            // if the type is valid (the field defaults to jl_nothing)
+            // and the memory location matches, update the type field
+            // FIXME: are there other types we shouldn't save because they can't be scanned?
+            if (ty && ty != (void*) jl_buff_tag &&
+                bt_data_prof[bt_size_cur-5].uintptr == (uintptr_t) v)
+                bt_data_prof[bt_size_cur-6].jlvalue = (jl_value_t*) ty;
+        }
+    }
+}
+
+// Records the deallocation of `v` as an event of size 0 carrying the DEALLOC tag.
+JL_DLLEXPORT void jl_memprofile_track_dealloc(void *v, uint16_t tag) JL_NOTSAFEPOINT
+{
+    jl_memprofile_track_alloc(v, tag | JL_MEMPROF_TAG_DEALLOC, 0);
+}
diff --git a/src/support/timefuncs.c b/src/support/timefuncs.c
index 031967638ec9e9..cfdff0683a256f 100644
--- a/src/support/timefuncs.c
+++ b/src/support/timefuncs.c
@@ -28,7 +28,7 @@ extern "C" {
 #endif
 
 
-JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv)
+JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) JL_NOTSAFEPOINT
 {
 #if defined(_OS_WINDOWS_)
     struct __timeb64 tb;
@@ -44,7 +44,7 @@ JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv)
     return code;
 }
 
-JL_DLLEXPORT double jl_clock_now(void)
+JL_DLLEXPORT double jl_clock_now(void) JL_NOTSAFEPOINT
 {
     struct jl_timeval now;
     jl_gettimeofday(&now);
diff --git a/src/support/timefuncs.h b/src/support/timefuncs.h
index db3d0fccd97a9b..24a4add6b6bfe8 100644
--- a/src/support/timefuncs.h
+++ b/src/support/timefuncs.h
@@ -12,8 +12,8 @@ struct jl_timeval {
     int64_t usec; /* microseconds */
 };
 
-JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv);
-JL_DLLEXPORT double jl_clock_now(void);
+JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) JL_NOTSAFEPOINT;
+JL_DLLEXPORT double jl_clock_now(void) JL_NOTSAFEPOINT;
 void sleep_ms(int ms);
 
 #ifdef __cplusplus
diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll
index e6987bd635d38e..09a4273f075e62 100644
--- a/test/llvmpasses/late-lower-gc.ll
+++ b/test/llvmpasses/late-lower-gc.ll
@@ -44,6 +44,7 @@ top:
 ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(10)* [[V2]], i64 -1
 ; CHECK-NEXT: store %jl_value_t addrspace(10)* @tag, %jl_value_t addrspace(10)* addrspace(10)* [[V_HEADROOM]], !tbaa !0
     %v = call noalias %jl_value_t addrspace(10)*
@julia.gc_alloc_obj(i8* %ptls_i8, i64 8, %jl_value_t addrspace(10)* @tag) +; CHECK-NEXT: call void @julia.gc_set_typeof(%jl_value_t addrspace(10)* %v, %jl_value_t addrspace(10)* @tag) ; CHECK-NEXT: ret %jl_value_t addrspace(10)* %v ret %jl_value_t addrspace(10)* %v } @@ -63,6 +64,7 @@ top: ; CHECK-NEXT: [[V_HEADROOM:%.*]] = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(10)* [[V2]], i64 -1 ; CHECK-NEXT: store %jl_value_t addrspace(10)* @tag, %jl_value_t addrspace(10)* addrspace(10)* [[V_HEADROOM]], !tbaa !0 %v = call noalias %jl_value_t addrspace(10)* @julia.gc_alloc_obj(i8* %ptls_i8, i64 8, %jl_value_t addrspace(10)* @tag) +; CHECK-NEXT: call void @julia.gc_set_typeof(%jl_value_t addrspace(10)* %v, %jl_value_t addrspace(10)* @tag) ; CHECK-NEXT: %v64 = bitcast %jl_value_t addrspace(10)* %v to i64 addrspace(10)* %v64 = bitcast %jl_value_t addrspace(10)* %v to i64 addrspace(10)* ; CHECK-NEXT: %loadedval = load i64, i64 addrspace(10)* %v64, align 8, !range !4