diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index 0e5acc16789bf7..86ed4a9f7bc651 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -61,7 +61,7 @@ To see the profiling results, there is a [graphical browser](https://github.com/ available, but here we'll use the text-based display that comes with the standard library: ```julia-repl -julia> Profile.print() +julia> Profile.Time.print() 80 ./event.jl:73; (::Base.REPL.##1#2{Base.REPL.REPLBackend})() 80 ./REPL.jl:97; macro expansion 80 ./REPL.jl:66; eval_user_input(::Any, ::Base.REPL.REPLBackend) @@ -130,7 +130,7 @@ collecting more samples: ```julia-repl julia> @profile (for i = 1:100; myfunc(); end) -julia> Profile.print() +julia> Profile.Time.print() [....] 3821 ./REPL[1]:2; myfunc() 3511 ./random.jl:431; rand!(::MersenneTwister, ::Array{Float64,3}, ::Int64, ::Type... @@ -152,7 +152,7 @@ This illustrates the default "tree" dump; an alternative is the "flat" dump, whi counts independent of their nesting: ```julia-repl -julia> Profile.print(format=:flat) +julia> Profile.Time.print(format=:flat) Count File Line Function 6714 ./ -1 anonymous 6714 ./REPL.jl 66 eval_user_input(::Any, ::Base.REPL.REPLBackend) @@ -201,12 +201,12 @@ useful way to view the results. ## Accumulation and clearing Results from [`@profile`](@ref) accumulate in a buffer; if you run multiple pieces of code under -[`@profile`](@ref), then [`Profile.print()`](@ref) will show you the combined results. This can -be very useful, but sometimes you want to start fresh; you can do so with [`Profile.clear()`](@ref). +[`@profile`](@ref), then [`Profile.Time.print()`](@ref) will show you the combined results. This can +be very useful, but sometimes you want to start fresh; you can do so with [`Profile.Time.clear()`](@ref). ## Options for controlling the display of profile results -[`Profile.print`](@ref) has more options than we've described so far. 
Let's see the full declaration: +[`Profile.Time.print`](@ref) has more options than we've described so far. Let's see the full declaration: ```julia function print(io::IO = stdout, data = fetch(); kwargs...) @@ -216,15 +216,15 @@ Let's first discuss the two positional arguments, and later the keyword argument * `io` -- Allows you to save the results to a buffer, e.g. a file, but the default is to print to `stdout` (the console). - * `data` -- Contains the data you want to analyze; by default that is obtained from [`Profile.fetch()`](@ref), + * `data` -- Contains the data you want to analyze; by default that is obtained from [`Profile.Time.fetch()`](@ref), which pulls out the backtraces from a pre-allocated buffer. For example, if you want to profile the profiler, you could say: ```julia - data = copy(Profile.fetch()) - Profile.clear() - @profile Profile.print(stdout, data) # Prints the previous results - Profile.print() # Prints results from Profile.print() + data = copy(Profile.Time.fetch()) + Profile.Time.clear() + @profile Profile.Time.print(stdout, data) # Prints the previous results + Profile.Time.print() # Prints results from Profile.Time.print() ``` The keyword arguments can be any combination of: @@ -233,7 +233,7 @@ The keyword arguments can be any combination of: with (default, `:tree`) or without (`:flat`) indentation indicating tree structure. * `C` -- If `true`, backtraces from C and Fortran code are shown (normally they are excluded). Try running the introductory - example with `Profile.print(C = true)`. This can be extremely helpful in deciding whether it's + example with `Profile.Time.print(C = true)`. This can be extremely helpful in deciding whether it's Julia code or C code that is causing a bottleneck; setting `C = true` also improves the interpretability of the nesting, at the cost of longer profile dumps. 
* `combine` -- Some lines of code contain multiple operations; for example, `s += A[i]` contains both an array @@ -256,13 +256,13 @@ to save to a file using a wide `displaysize` in an [`IOContext`](@ref): ```julia open("/tmp/prof.txt", "w") do s - Profile.print(IOContext(s, :displaysize => (24, 500))) + Profile.Time.print(IOContext(s, :displaysize => (24, 500))) end ``` ## Configuration -[`@profile`](@ref) just accumulates backtraces, and the analysis happens when you call [`Profile.print()`](@ref). +[`@profile`](@ref) just accumulates backtraces, and the analysis happens when you call [`Profile.Time.print()`](@ref). For a long-running computation, it's entirely possible that the pre-allocated buffer for storing backtraces will be filled. If that happens, the backtraces stop but your computation continues. As a consequence, you may miss some important profiling data (you will get a warning when that @@ -271,8 +271,8 @@ happens). You can obtain and configure the relevant parameters this way: ```julia -Profile.init() # returns the current settings -Profile.init(n = 10^7, delay = 0.01) +Profile.Time.init() # returns the current settings +Profile.Time.init(n = 10^7, delay = 0.01) ``` `n` is the total number of instruction pointers you can store, with a default value of `10^6`. 
diff --git a/src/Makefile b/src/Makefile index 04ea8d50f86ca5..d62af6f4fbd227 100644 --- a/src/Makefile +++ b/src/Makefile @@ -45,7 +45,7 @@ RUNTIME_C_SRCS := \ simplevector runtime_intrinsics precompile \ threading partr stackwalk gc gc-debug gc-pages gc-stacks method \ jlapi signal-handling safepoint jloptions timing subtype \ - crc32c + crc32c memprofile RUNTIME_SRCS := APInt-C runtime_ccall processor rtutils $(RUNTIME_C_SRCS) SRCS := $(RUNTIME_SRCS) diff --git a/src/array.c b/src/array.c index a0627ff24ab8e5..0eb77e7e5a38d9 100644 --- a/src/array.c +++ b/src/array.c @@ -98,6 +98,7 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, tsz += tot; tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align whole object a = (jl_array_t*)jl_gc_alloc(ptls, tsz, atype); + jl_memprofile_set_typeof(a, atype); // No allocation or safepoint allowed after this a->flags.how = 0; data = (char*)a + doffs; @@ -107,12 +108,14 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, else { tsz = JL_ARRAY_ALIGN(tsz, JL_CACHE_BYTE_ALIGNMENT); // align whole object data = jl_gc_managed_malloc(tot); + jl_memprofile_set_typeof(data, atype); // Allocate the Array **after** allocating the data // to make sure the array is still young a = (jl_array_t*)jl_gc_alloc(ptls, tsz, atype); // No allocation or safepoint allowed after this a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); + jl_memprofile_set_typeof(a, atype); if (!isunboxed || isunion) // need to zero out isbits union array selector bytes to ensure a valid type index memset(data, 0, tot); @@ -334,7 +337,9 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data, if (own_buffer) { a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); - jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); + jl_gc_count_allocd(a, nel*elsz + (elsz == 1 ? 
1 : 0), + JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + jl_memprofile_set_typeof(a, atype); } else { a->flags.how = 0; @@ -401,7 +406,9 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data, if (own_buffer) { a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); - jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); + jl_gc_count_allocd(a, nel*elsz + (elsz == 1 ? 1 : 0), + JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + jl_memprofile_set_typeof(a, atype); } else { a->flags.how = 0; @@ -669,6 +676,7 @@ static int NOINLINE array_resize_buffer(jl_array_t *a, size_t newlen) a->flags.how = 1; jl_gc_wb_buf(a, a->data, nbytes); } + jl_memprofile_set_typeof(a->data, jl_typeof(a)); } if (JL_ARRAY_IMPL_NUL && elsz == 1 && !isbitsunion) memset((char*)a->data + oldnbytes - 1, 0, nbytes - oldnbytes + 1); diff --git a/src/gc.c b/src/gc.c index 831dc14545348b..741321e742fd83 100644 --- a/src/gc.c +++ b/src/gc.c @@ -847,7 +847,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - ptls->gc_num.allocd += allocsz; + jl_gc_count_allocd(jl_valueof(&v->header), allocsz, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_BIGALLOC); ptls->gc_num.bigalloc++; #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -887,7 +888,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT *pv = nxt; if (nxt) nxt->prev = pv; - gc_num.freed += v->sz&~3; + jl_gc_count_freed(jl_valueof(&v->header), v->sz&~3, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_BIGALLOC); #ifdef MEMDEBUG memset(v, 0xbb, v->sz&~3); #endif @@ -938,10 +940,44 @@ void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT ptls->heap.mallocarrays = ma; } -void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT +void jl_gc_count_allocd(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT { 
jl_ptls_t ptls = jl_get_ptls_states(); ptls->gc_num.allocd += sz; + + if (__unlikely(jl_memprofile_running())) { + jl_memprofile_track_alloc(addr, tag, sz); + } +} + +void jl_gc_count_freed(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_get_ptls_states(); + ptls->gc_num.freed += sz; + + if (__unlikely(jl_memprofile_running())) { + jl_memprofile_track_dealloc(addr, tag); + } +} + +void jl_gc_count_reallocd(void * oldaddr, size_t oldsz, void * newaddr, size_t newsz, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_get_ptls_states(); + if (oldsz < newsz) { + ptls->gc_num.allocd += newsz - oldsz; + } else { + ptls->gc_num.freed += oldsz - newsz; + } + + // Our memprofile does not yet have a way to represent "realloc", so we just + // represent this as a free immediately followed by a malloc. This makes the + // absolute value of the memory deltas look larger than the Julia GC's statistics + // would have you believe, as the Julia GC shows only the difference between + // the two values when realloc'ing. 
+ if (__unlikely(jl_memprofile_running())) { + jl_memprofile_track_dealloc(oldaddr, tag); + jl_memprofile_track_alloc(newaddr, tag, newsz); + } } static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT @@ -1002,7 +1038,7 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); - gc_num.freed += array_nbytes(a); + jl_gc_count_freed(d, array_nbytes(a), JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); } } @@ -1094,7 +1130,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, return jl_gc_big_alloc(ptls, osize); #endif maybe_collect(ptls); - ptls->gc_num.allocd += osize; ptls->gc_num.poolalloc++; // first try to use the freelist jl_taggedvalue_t *v = p->freelist; @@ -1109,6 +1144,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, pg->nfree = 0; pg->has_young = 1; } + jl_gc_count_allocd(jl_valueof(v), osize, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_POOLALLOC); return jl_valueof(v); } // if the freelist is empty we reuse empty but not freed pages @@ -1133,6 +1169,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, next = (jl_taggedvalue_t*)((char*)v + osize); } p->newpages = next; + jl_gc_count_allocd(jl_valueof(v), osize, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_POOLALLOC); return jl_valueof(v); } @@ -1157,7 +1194,6 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t uint8_t *ages = pg->ages; jl_taggedvalue_t *v = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; - size_t old_nfree = pg->nfree; size_t nfree; int freedall = 1; @@ -1262,7 +1298,6 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t done: gc_time_count_page(freedall, pg_skpd); - gc_num.freed += (nfree - old_nfree) * osize; return pfl; } @@ -2560,6 +2595,8 @@ JL_DLLEXPORT int jl_gc_enable(int on) if (on && !prev) { // disable -> 
enable if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + // This is to restore the value of `allocd` that was clobbered in `jl_gc_collect()` + // when `jl_gc_disable_counter` is nonzero. gc_num.allocd += gc_num.deferred_alloc; gc_num.deferred_alloc = 0; } @@ -2652,18 +2689,75 @@ static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp ptls2->heap.rem_bindings.len = n_bnd_refyoung; } -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, uintptr_t * bt_data, size_t bt_size) { size_t n = 0; - while (n+2 < ptls2->bt_size) { - if (ptls2->bt_data[n] == JL_BT_INTERP_FRAME) { - gc_mark_queue_obj(gc_cache, sp, (jl_value_t*)ptls2->bt_data[n+1]); + while (n+2 < bt_size) { + if (bt_data[n] == JL_BT_INTERP_FRAME) { + gc_mark_queue_obj(gc_cache, sp, (jl_value_t*)bt_data[n+1]); n += 2; } n++; } } +static void gc_track_pool_frees() +{ + // Don't do anything if the memory profiler isn't running. + if (!jl_memprofile_running()) + return; + + // Iterate over the three levels of our pagetable. 
We collapse indentation here + // to make it more readable, especially as we do essentially the same thing + // three times with just slightly changed variable names: + for (int pg2_i = 0; pg2_i < (REGION2_PG_COUNT + 31) / 32; pg2_i++) { + uint32_t line2 = memory_map.allocmap1[pg2_i]; + if (line2) { + for (int j = 0; j < 32; j++) { + if ((line2 >> j) & 1) { + pagetable1_t * pagetable1 = memory_map.meta1[pg2_i * 32 + j]; + + for (int pg1_i = 0; pg1_i < REGION1_PG_COUNT / 32; pg1_i++) { + uint32_t line1 = pagetable1->allocmap0[pg1_i]; + if (line1) { + for (int k = 0; k < 32; k++) { + if ((line1 >> k) & 1) { + pagetable0_t * pagetable0 = pagetable1->meta0[pg1_i * 32 + k]; + + for (int pg0_i = 0; pg0_i < REGION0_PG_COUNT / 32; pg0_i++) { + uint32_t line0 = pagetable0->allocmap[pg0_i]; + if (line0) { + for (int l = 0; l < 32; l++) { + if ((line0 >> l) & 1) { + jl_gc_pagemeta_t * pg = pagetable0->meta[pg0_i * 32 + l]; + + // Once we have an actual page, iterate over the cells: + jl_taggedvalue_t *v = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); + char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - pg->osize; + + while ((char *)v <= lim) { + // If this object is live but unmarked, then it's about to be freed, + // so track that via jl_gc_count_freed(). + if (v->bits.gc == GC_CLEAN) { + jl_value_t * ptr = jl_gc_internal_obj_base_ptr(v); + if (ptr != NULL) { + jl_gc_count_freed(ptr, pg->osize, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_POOLALLOC); + } + } + + // Move to next cell in the page + v = (jl_taggedvalue_t*)((char*)v + pg->osize); + } + + // Region 0 + }}}} + // Region 1 + }}}} + // Region 2 + }}}} +} + size_t jl_maxrss(void); // Only one thread should be running in this function @@ -2688,8 +2782,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) jl_gc_queue_remset(gc_cache, &sp, ptls2); // 2.2. mark every thread local root jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. 
mark any managed objects in the backtrace buffer - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + // 2.3. mark any managed objects in the backtrace buffers, + // so that things like Interpreter frame objects do not disappear. + jl_gc_queue_bt_buf(gc_cache, &sp, ptls2->bt_data, ptls2->bt_size); + jl_gc_queue_bt_buf(gc_cache, &sp, (uintptr_t *)jl_profile_get_data(), jl_profile_len_data()); + jl_gc_queue_bt_buf(gc_cache, &sp, (uintptr_t *)jl_memprofile_get_bt_data(), jl_memprofile_len_bt_data()); } // 3. walk roots @@ -2813,6 +2910,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); + gc_track_pool_frees(); gc_sweep_pool(sweep_full); if (sweep_full) gc_sweep_perm_alloc(); @@ -3018,23 +3116,27 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_ptls_t ptls = jl_get_ptls_states(); - if (ptls && ptls->world_age) { + if (ptls && ptls->world_age) maybe_collect(ptls); - ptls->gc_num.allocd += sz; + void *b = malloc(sz); + if (ptls && ptls->world_age) { + jl_gc_count_allocd(b, sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); ptls->gc_num.malloc++; } - return malloc(sz); + return b; } JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { jl_ptls_t ptls = jl_get_ptls_states(); - if (ptls && ptls->world_age) { + if (ptls && ptls->world_age) maybe_collect(ptls); - ptls->gc_num.allocd += nm*sz; + void *b = calloc(nm, sz); + if (ptls && ptls->world_age) { + jl_gc_count_allocd(b, nm*sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); ptls->gc_num.malloc++; } - return calloc(nm, sz); + return b; } JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) @@ -3042,7 +3144,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_ptls_t ptls = jl_get_ptls_states(); free(p); if (ptls && ptls->world_age) { - ptls->gc_num.freed += sz; + jl_gc_count_freed(p, sz, 
JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); ptls->gc_num.freecall++; } } @@ -3050,15 +3152,14 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { jl_ptls_t ptls = jl_get_ptls_states(); - if (ptls && ptls->world_age) { + if (ptls && ptls->world_age) maybe_collect(ptls); - if (sz < old) - ptls->gc_num.freed += (old - sz); - else - ptls->gc_num.allocd += (sz - old); + void *b = realloc(p, sz); + if (ptls && ptls->world_age) { + jl_gc_count_reallocd(p, old, b, sz, JL_MEMPROF_TAG_DOMAIN_CPU); ptls->gc_num.realloc++; } - return realloc(p, sz); + return b; } // allocation wrappers that save the size of allocations, to allow using @@ -3120,11 +3221,11 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); - ptls->gc_num.allocd += allocsz; - ptls->gc_num.malloc++; void *b = malloc_cache_align(allocsz); if (b == NULL) jl_throw(jl_memory_exception); + jl_gc_count_allocd(b, allocsz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + ptls->gc_num.malloc++; return b; } @@ -3138,16 +3239,6 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); - if (jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; - live_bytes += allocsz - oldsz; - } - else if (allocsz < oldsz) - ptls->gc_num.freed += (oldsz - allocsz); - else - ptls->gc_num.allocd += (allocsz - oldsz); - ptls->gc_num.realloc++; - void *b; if (isaligned) b = realloc_cache_align(d, allocsz, oldsz); @@ -3156,6 +3247,14 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds if (b == NULL) jl_throw(jl_memory_exception); + if 
(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) { + ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; + live_bytes += allocsz - oldsz; + } + else + jl_gc_count_reallocd(d, oldsz, b, allocsz, JL_MEMPROF_TAG_DOMAIN_CPU); + ptls->gc_num.realloc++; + return b; } diff --git a/src/julia_internal.h b/src/julia_internal.h index d408257150e00e..4543d95335252c 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -227,6 +227,75 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz) return klass + N; } +// Flags that determine when a certain buffer has overrun itself +#define JL_MEMPROF_BT_OVERFLOW 0x01 +#define JL_MEMPROF_ALLOC_OVERFLOW 0x02 + +// Tags applied to memory allocations to specify which domain the memory is +// stored on, and also which "kind" of memory allocator was used. +// When filtering, a filter tag value of `0xffff` means "accept everything". +// We support the "CPU", "GPU" and "External" (e.g. "other") domains. +#define JL_MEMPROF_TAG_DOMAIN_CPU 0x0001 +#define JL_MEMPROF_TAG_DOMAIN_GPU 0x0002 +#define JL_MEMPROF_TAG_DOMAIN_EXTERNAL 0x0080 +// We differentiate between just normal "standard" allocation by malloc, versus +// the "pool" allocator, and finally "bigalloc" for special big things as +// that's often what we're most interested in, which are the pieces of memory +// allocated by `jl_gc_big_alloc()`. +#define JL_MEMPROF_TAG_ALLOC_STDALLOC 0x0100 +#define JL_MEMPROF_TAG_ALLOC_POOLALLOC 0x0200 +#define JL_MEMPROF_TAG_ALLOC_BIGALLOC 0x0400 +// We denote a free() by setting yet another tag +#define JL_MEMPROF_TAG_DEALLOC 0x8000 + +typedef struct _memprof_allocation_info_t +{ + // The location of the chunk of data in memory, used to match + // allocations with deallocations. This value, once it has been + // deallocated, cannot be dereferenced, in general. + void *memory_location; + + // The location of the type information for this chunk of memory + // This should always be dereferencable. 
+ void *type; + + // The time at which this happened + double time; + + // The size of the allocation, or 0 if this was a free of a + // previously-allocated piece of data. + size_t allocsz; + + // Used to "tag" this allocation within a particular domain (CPU, GPU, other) + // or within a particular allocator (Pool, std, malloc), or as a free instead. + uint16_t tag; +} allocation_info_t; + +// Memory profiler data structures +static uintptr_t * memprof_bt_data = NULL; +static volatile size_t memprof_bt_data_size = 0; +static volatile size_t memprof_bt_data_size_max = 0; + +static allocation_info_t * memprof_alloc_data = NULL; +static volatile size_t memprof_alloc_data_size = 0; +static volatile size_t memprof_alloc_data_size_max = 0; + +static volatile uint8_t memprof_running = 0; +static volatile uint8_t memprof_overflow = 0; +static volatile uint16_t memprof_tag_filter = 0xffff; + +// Necessary memory profiler prototypes +JL_DLLEXPORT uint8_t* jl_memprofile_get_bt_data(void); +JL_DLLEXPORT size_t jl_memprofile_len_bt_data(void); +JL_DLLEXPORT void jl_memprofile_track_alloc(void *v, uint16_t tag, size_t allocsz) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_memprofile_track_dealloc(void *v, uint16_t tag) JL_NOTSAFEPOINT; +JL_DLLEXPORT int jl_memprofile_running(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_memprofile_set_typeof(void * v, void * ty) JL_NOTSAFEPOINT; + +// Necessary time profiler prototypes +JL_DLLEXPORT uint8_t *jl_profile_get_data(void); +JL_DLLEXPORT size_t jl_profile_len_data(void); + #define JL_SMALL_BYTE_ALIGNMENT 16 #define JL_CACHE_BYTE_ALIGNMENT 64 // JL_HEAP_ALIGNMENT is the maximum alignment that the GC can provide @@ -249,6 +318,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) v = jl_gc_big_alloc(ptls, allocsz); } jl_set_typeof(v, ty); + jl_memprofile_set_typeof(v, ty); return v; } JL_DLLEXPORT jl_value_t *jl_gc_alloc(jl_ptls_t ptls, size_t sz, void *ty); @@ -341,7 +411,9 @@ JL_DLLEXPORT jl_value_t 
*jl_apply_2va(jl_value_t *f, jl_value_t **args, uint32_t void jl_gc_sync_total_bytes(void); void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT; -void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT; +void jl_gc_count_allocd(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT; +void jl_gc_count_freed(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT; +void jl_gc_count_reallocd(void * oldaddr, size_t oldsz, void * newaddr, size_t newsz, uint16_t tag) JL_NOTSAFEPOINT; void jl_gc_run_all_finalizers(jl_ptls_t ptls); void gc_queue_binding(jl_binding_t *bnd) JL_NOTSAFEPOINT; diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index a68b2fc193ed83..7a2d418e322eba 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -39,6 +39,7 @@ struct FinalLowerGC: public FunctionPass, private JuliaPassContext { Function *queueRootFunc; Function *poolAllocFunc; Function *bigAllocFunc; + Function *memprofileSetTypeofFunc; CallInst *ptlsStates; bool doInitialization(Module &M) override; @@ -60,6 +61,9 @@ struct FinalLowerGC: public FunctionPass, private JuliaPassContext { // Lowers a `julia.gc_alloc_bytes` intrinsic. Value *lowerGCAllocBytes(CallInst *target, Function &F); + // Lowers a `julia.gc_set_typeof` intrinsic. + Value *lowerGCSetTypeof(CallInst *target, Function &F); + // Lowers a `julia.queue_gc_root` intrinsic. Value *lowerQueueGCRoot(CallInst *target, Function &F); @@ -212,6 +216,13 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) return newI; } +Value *FinalLowerGC::lowerGCSetTypeof(CallInst *target, Function &F) +{ + assert(target->getNumArgOperands() == 2); + target->setCalledFunction(memprofileSetTypeofFunc); + return target; +} + bool FinalLowerGC::doInitialization(Module &M) { // Initialize platform-agnostic references. 
initAll(M); @@ -220,8 +231,9 @@ bool FinalLowerGC::doInitialization(Module &M) { queueRootFunc = getOrDeclare(jl_well_known::GCQueueRoot); poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc); bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc); + memprofileSetTypeofFunc = getOrDeclare(jl_well_known::MemProfileSetTypeof); - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, memprofileSetTypeofFunc}; unsigned j = 0; for (unsigned i = 0; i < sizeof(functionList) / sizeof(void*); i++) { if (!functionList[i]) @@ -301,6 +313,7 @@ bool FinalLowerGC::runOnFunction(Function &F) auto popGCFrameFunc = getOrNull(jl_intrinsics::popGCFrame); auto getGCFrameSlotFunc = getOrNull(jl_intrinsics::getGCFrameSlot); auto GCAllocBytesFunc = getOrNull(jl_intrinsics::GCAllocBytes); + auto GCSetTypeofFunc = getOrNull(jl_intrinsics::GCSetTypeof); auto queueGCRootFunc = getOrNull(jl_intrinsics::queueGCRoot); // Lower all calls to supported intrinsics. 
@@ -331,6 +344,9 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == GCAllocBytesFunc) { replaceInstruction(CI, lowerGCAllocBytes(CI, F), it); } + else if (callee == GCSetTypeofFunc) { + replaceInstruction(CI, lowerGCSetTypeof(CI, F), it); + } else if (callee == queueGCRootFunc) { replaceInstruction(CI, lowerQueueGCRoot(CI, F), it); } diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 03ed17887acab9..149b41007229c0 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -1765,6 +1765,9 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) { EmitTagPtr(builder, T_prjlvalue, newI)); store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + auto setTypeofIntrinsic = getOrDeclare(jl_intrinsics::GCSetTypeof); + builder.CreateCall(setTypeofIntrinsic, {newI, CI->getArgOperand(2)}); + // Replace uses of the call to `julia.gc_alloc_obj` with the call to // `julia.gc_alloc_bytes`. CI->replaceAllUsesWith(newI); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 7efb5394c968a4..03169569421737 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -129,6 +129,7 @@ llvm::Function *JuliaPassContext::getOrDeclare( namespace jl_intrinsics { static const char *GET_GC_FRAME_SLOT_NAME = "julia.get_gc_frame_slot"; static const char *GC_ALLOC_BYTES_NAME = "julia.gc_alloc_bytes"; + static const char *GC_SET_TYPEOF_NAME = "julia.gc_set_typeof"; static const char *NEW_GC_FRAME_NAME = "julia.new_gc_frame"; static const char *PUSH_GC_FRAME_NAME = "julia.push_gc_frame"; static const char *POP_GC_FRAME_NAME = "julia.pop_gc_frame"; @@ -171,6 +172,18 @@ namespace jl_intrinsics { return addGCAllocAttributes(intrinsic, context.getLLVMContext()); }); + const IntrinsicDescription GCSetTypeof( + GC_SET_TYPEOF_NAME, + [](const JuliaPassContext &context) { + return Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, 
context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_SET_TYPEOF_NAME); + }); + const IntrinsicDescription newGCFrame( NEW_GC_FRAME_NAME, [](const JuliaPassContext &context) { @@ -227,6 +240,7 @@ namespace jl_well_known { static const char *GC_BIG_ALLOC_NAME = "jl_gc_big_alloc"; static const char *GC_POOL_ALLOC_NAME = "jl_gc_pool_alloc"; static const char *GC_QUEUE_ROOT_NAME = "jl_gc_queue_root"; + static const char *MEMPROFILE_SET_TYPEOF_NAME = "jl_memprofile_set_typeof"; using jl_intrinsics::addGCAllocAttributes; @@ -271,4 +285,18 @@ namespace jl_well_known { func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return func; }); + + const WellKnownFunctionDescription MemProfileSetTypeof( + MEMPROFILE_SET_TYPEOF_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + MEMPROFILE_SET_TYPEOF_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 71cab27e76ceba..271fbc2b82b30b 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -121,6 +121,9 @@ namespace jl_intrinsics { // passed as an argument. extern const IntrinsicDescription GCAllocBytes; + // `julia.gc_set_typeof`: an intrinsic that tags an allocation with a type. + extern const IntrinsicDescription GCSetTypeof; + // `julia.new_gc_frame`: an intrinsic that creates a new GC frame. extern const IntrinsicDescription newGCFrame; @@ -152,6 +155,9 @@ namespace jl_well_known { // `jl_gc_queue_root`: queues a GC root. extern const WellKnownFunctionDescription GCQueueRoot; + + // `jl_memprofile_set_typeof`: informs the memory profiler about a type. 
+ extern const WellKnownFunctionDescription MemProfileSetTypeof; } #endif diff --git a/src/memprofile.c b/src/memprofile.c new file mode 100644 index 00000000000000..8d64279423b9b8 --- /dev/null +++ b/src/memprofile.c @@ -0,0 +1,203 @@ +#include <stdlib.h> +#include <stddef.h> +#include <string.h> +#include "julia.h" +#include "julia_internal.h" +#include "threading.h" +#include "gc.h" + +JL_DLLEXPORT void jl_memprofile_clear_data(void) +{ + memprof_bt_data_size = 0; + memprof_alloc_data_size = 0; + memprof_overflow = 0; +} + +JL_DLLEXPORT int jl_memprofile_init(size_t bt_maxsize, size_t alloc_maxsize, uint16_t tag_filter) +{ + // Free previous profile buffers, if we have any + if (memprof_bt_data != NULL) { + free((void*)memprof_bt_data); + memprof_bt_data = NULL; + memprof_bt_data_size = 0; + memprof_bt_data_size_max = 0; + } + if (memprof_alloc_data != NULL) { + free((void*)memprof_alloc_data); + memprof_alloc_data = NULL; + memprof_alloc_data_size = 0; + memprof_alloc_data_size_max = 0; + } + + // Initialize new profile buffers of the requested maximum sizes. + memprof_bt_data = (uintptr_t*) calloc(bt_maxsize, sizeof(uintptr_t)); + if (memprof_bt_data == NULL && bt_maxsize > 0) { + return -1; + } + + memprof_alloc_data = (allocation_info_t*) calloc(alloc_maxsize, sizeof(allocation_info_t)); + if (memprof_alloc_data == NULL && alloc_maxsize > 0) { + // Cleanup the previous allocation in the event of failure, so that it + // cannot be used accidentally. + free((void*)memprof_bt_data); + memprof_bt_data = NULL; + return -1; + } + + memprof_bt_data_size_max = bt_maxsize; + memprof_alloc_data_size_max = alloc_maxsize; + + // We store the filter, but we always track deallocs. 
:P + memprof_tag_filter = tag_filter | JL_MEMPROF_TAG_DEALLOC; + jl_memprofile_clear_data(); + return 0; +} + +JL_DLLEXPORT uint8_t* jl_memprofile_get_bt_data(void) +{ + return (uint8_t*) memprof_bt_data; +} + +JL_DLLEXPORT uint8_t* jl_memprofile_get_alloc_data(void) +{ + return (uint8_t*) memprof_alloc_data; +} + +JL_DLLEXPORT size_t jl_memprofile_len_bt_data(void) +{ + return memprof_bt_data_size; +} + +JL_DLLEXPORT size_t jl_memprofile_len_alloc_data(void) +{ + return memprof_alloc_data_size; +} + +JL_DLLEXPORT size_t jl_memprofile_maxlen_bt_data(void) +{ + return memprof_bt_data_size_max; +} + +JL_DLLEXPORT size_t jl_memprofile_maxlen_alloc_data(void) +{ + return memprof_alloc_data_size_max; +} + +JL_DLLEXPORT int jl_memprofile_running(void) JL_NOTSAFEPOINT +{ + return memprof_running == 1; +} + +JL_DLLEXPORT int jl_memprofile_overflow(void) +{ + return memprof_overflow; +} + +JL_DLLEXPORT int jl_memprofile_tag_filter(void) +{ + return memprof_tag_filter; +} + +JL_DLLEXPORT void jl_memprofile_start(void) +{ + memprof_running = 1; +} + +JL_DLLEXPORT void jl_memprofile_stop(void) JL_NOTSAFEPOINT +{ + memprof_running = 0; +} + +// Helper function that makes it easy to take a chunk of plain-old-data that was +// allocated for an Array and find the "holding" jl_array_t object. +JL_DLLEXPORT jl_array_t * jl_memprofile_find_malloc_array(void * adata) +{ + // We walk every thread, so we need to disable the GC while we do this. 
+ int prev_value = jl_gc_enable(0); + + // For each thread + for (int t_i = 0; t_i < jl_n_threads; t_i++) { + // Get its thread-local storage + jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + + // Take a look at the malloc'ed arrays for this thread + mallocarray_t *ma = ptls2->heap.mallocarrays; + + // Zoom through seeing if the given pointer matches this array's data pointer + while (ma != NULL) { + if (ma->a->data == adata) { + // If it matches, re-enable the GC and return that value + jl_gc_enable(prev_value); + return ma->a; + } + ma = ma->next; + } + } + + // We were unable to find it. :( + jl_gc_enable(prev_value); + return NULL; +} + +JL_DLLEXPORT void jl_memprofile_track_alloc(void *v, uint16_t tag, size_t allocsz) JL_NOTSAFEPOINT +{ + // Filter out this call with our tag filter + if ((tag & memprof_tag_filter) != tag) + return; + + // Store the current backtrace location into our buffer, and increment the + // buffer index by the number of elements added. + size_t bt_step = 0; + bt_step = rec_backtrace(memprof_bt_data + memprof_bt_data_size, + memprof_bt_data_size_max - memprof_bt_data_size - 1, + 0); + + // If we overran this buffer, then don't record the memory trace and quit. + if (bt_step == memprof_bt_data_size_max - memprof_bt_data_size) { + memprof_overflow |= JL_MEMPROF_BT_OVERFLOW; + jl_memprofile_stop(); + return; + } else { + // Otherwise, include this block and add a NULL-separator + memprof_bt_data_size += bt_step; + memprof_bt_data[memprof_bt_data_size++] = (uintptr_t)NULL; + } + + // Next up; store allocation/type information + memprof_alloc_data[memprof_alloc_data_size].memory_location = v; + memprof_alloc_data[memprof_alloc_data_size].time = jl_clock_now(); + + // If we are deallocating, then we set allocsz to 0 and must pair this alloc_data entry + // with a previous allocation within alloc_data to make sense of it. 
+ memprof_alloc_data[memprof_alloc_data_size].allocsz = allocsz; + + // In general, we don't know the type at this point; we make use of `jl_memprofile_set_typeof()`. + memprof_alloc_data[memprof_alloc_data_size].type = NULL; + + // Tags are used to track the "domain" of this chunk of memory + memprof_alloc_data[memprof_alloc_data_size].tag = tag; + + memprof_alloc_data_size++; + + if (memprof_alloc_data_size >= memprof_alloc_data_size_max) { + memprof_overflow |= JL_MEMPROF_ALLOC_OVERFLOW; + jl_memprofile_stop(); + } +} + +JL_DLLEXPORT void jl_memprofile_set_typeof(void * v, void * ty) JL_NOTSAFEPOINT +{ + // We only search the very last allocation + if (memprof_alloc_data_size > 0) { + if (memprof_alloc_data[memprof_alloc_data_size-1].memory_location == v) { + // If it matches, then set the type pointer + memprof_alloc_data[memprof_alloc_data_size-1].type = ty; + } + } +} + +JL_DLLEXPORT void jl_memprofile_track_dealloc(void *v, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_memprofile_track_alloc(v, tag | JL_MEMPROF_TAG_DEALLOC, 0); +} diff --git a/src/support/timefuncs.c b/src/support/timefuncs.c index 031967638ec9e9..cfdff0683a256f 100644 --- a/src/support/timefuncs.c +++ b/src/support/timefuncs.c @@ -28,7 +28,7 @@ extern "C" { #endif -JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) +JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) JL_NOTSAFEPOINT { #if defined(_OS_WINDOWS_) struct __timeb64 tb; @@ -44,7 +44,7 @@ JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) return code; } -JL_DLLEXPORT double jl_clock_now(void) +JL_DLLEXPORT double jl_clock_now(void) JL_NOTSAFEPOINT { struct jl_timeval now; jl_gettimeofday(&now); diff --git a/src/support/timefuncs.h b/src/support/timefuncs.h index db3d0fccd97a9b..24a4add6b6bfe8 100644 --- a/src/support/timefuncs.h +++ b/src/support/timefuncs.h @@ -12,8 +12,8 @@ struct jl_timeval { int64_t usec; /* microseconds */ }; -JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv); -JL_DLLEXPORT double 
jl_clock_now(void); +JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) JL_NOTSAFEPOINT; +JL_DLLEXPORT double jl_clock_now(void) JL_NOTSAFEPOINT; void sleep_ms(int ms); #ifdef __cplusplus diff --git a/stdlib/Profile/docs/src/index.md b/stdlib/Profile/docs/src/index.md index ac60bb92cb5ed8..eba8299cff6214 100644 --- a/stdlib/Profile/docs/src/index.md +++ b/stdlib/Profile/docs/src/index.md @@ -7,11 +7,18 @@ Profile.@profile The methods in `Profile` are not exported and need to be called e.g. as `Profile.print()`. ```@docs -Profile.clear -Profile.print -Profile.init -Profile.fetch -Profile.retrieve +Profile.Time.clear +Profile.Time.print +Profile.Time.init +Profile.Time.fetch +Profile.Time.retrieve +``` + +```@docs +Profile.Memory.init +``` + +```@docs Profile.callers Profile.clear_malloc_data ``` diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 429bb80fe4f60b..f2371e035dfca8 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -5,850 +5,27 @@ Profiling support, main entry point is the [`@profile`](@ref) macro. """ module Profile -import Base.StackTraces: lookup, UNKNOWN, show_spec_linfo, StackFrame - -# deprecated functions: use `getdict` instead -lookup(ip::UInt) = lookup(convert(Ptr{Cvoid}, ip)) - -export @profile - -""" - @profile - -`@profile ` runs your expression while taking periodic backtraces. These are -appended to an internal buffer of backtraces. -""" -macro profile(ex) - return quote - try - status = start_timer() - if status < 0 - error(error_codes[status]) - end - $(esc(ex)) - finally - stop_timer() - end - end -end - -#### -#### User-level functions -#### - -""" - init(; n::Integer, delay::Real)) - -Configure the `delay` between backtraces (measured in seconds), and the number `n` of -instruction pointers that may be stored. Each instruction pointer corresponds to a single -line of code; backtraces generally consist of a long list of instruction pointers. 
Default -settings can be obtained by calling this function with no arguments, and each can be set -independently using keywords or in the order `(n, delay)`. -""" -function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing) - n_cur = ccall(:jl_profile_maxlen_data, Csize_t, ()) - delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9 - if n === nothing && delay === nothing - return Int(n_cur), delay_cur - end - nnew = (n === nothing) ? n_cur : n - delaynew = (delay === nothing) ? delay_cur : delay - init(nnew, delaynew) -end - -function init(n::Integer, delay::Real) - status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), n, round(UInt64,10^9*delay)) - if status == -1 - error("could not allocate space for ", n, " instruction pointers") - end -end - # init with default values -# Use a max size of 1M profile samples, and fire timer every 1ms -if Sys.iswindows() - __init__() = init(1_000_000, 0.01) -else - __init__() = init(1_000_000, 0.001) -end - -""" - clear() - -Clear any existing backtraces from the internal buffer. -""" -clear() = ccall(:jl_profile_clear_data, Cvoid, ()) - -const LineInfoDict = Dict{UInt64, Vector{StackFrame}} -const LineInfoFlatDict = Dict{UInt64, StackFrame} - -struct ProfileFormat - maxdepth::Int - mincount::Int - noisefloor::Float64 - sortedby::Symbol - combine::Bool - C::Bool - recur::Symbol - function ProfileFormat(; - C = false, - combine = true, - maxdepth::Int = typemax(Int), - mincount::Int = 0, - noisefloor = 0, - sortedby::Symbol = :filefuncline, - recur::Symbol = :off) - return new(maxdepth, mincount, noisefloor, sortedby, combine, C, recur) - end -end - -""" - print([io::IO = stdout,] [data::Vector]; kwargs...) - -Prints profiling results to `io` (by default, `stdout`). If you do not -supply a `data` vector, the internal buffer of accumulated backtraces -will be used. 
- -The keyword arguments can be any combination of: - - - `format` -- Determines whether backtraces are printed with (default, `:tree`) or without (`:flat`) - indentation indicating tree structure. - - - `C` -- If `true`, backtraces from C and Fortran code are shown (normally they are excluded). - - - `combine` -- If `true` (default), instruction pointers are merged that correspond to the same line of code. - - - `maxdepth` -- Limits the depth higher than `maxdepth` in the `:tree` format. - - - `sortedby` -- Controls the order in `:flat` format. `:filefuncline` (default) sorts by the source - line, `:count` sorts in order of number of collected samples, and `:overhead` sorts by the number of samples - incurred by each function by itself. - - - `noisefloor` -- Limits frames that exceed the heuristic noise floor of the sample (only applies to format `:tree`). - A suggested value to try for this is 2.0 (the default is 0). This parameter hides samples for which `n <= noisefloor * √N`, - where `n` is the number of samples on this line, and `N` is the number of samples for the callee. - - - `mincount` -- Limits the printout to only those lines with at least `mincount` occurrences. - - - `recur` -- Controls the recursion handling in `:tree` format. `:off` (default) prints the tree as normal. `:flat` instead - compresses any recursion (by ip), showing the approximate effect of converting any self-recursion into an iterator. - `:flatc` does the same but also includes collapsing of C frames (may do odd things around `jl_apply`). 
-""" -function print(io::IO, - data::Vector{<:Unsigned} = fetch(), - lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data) - ; - format = :tree, - C = false, - combine = true, - maxdepth::Int = typemax(Int), - mincount::Int = 0, - noisefloor = 0, - sortedby::Symbol = :filefuncline, - recur::Symbol = :off) - print(io, data, lidict, ProfileFormat( - C = C, - combine = combine, - maxdepth = maxdepth, - mincount = mincount, - noisefloor = noisefloor, - sortedby = sortedby, - recur = recur), - format) -end - -function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, format::Symbol) - cols::Int = Base.displaysize(io)[2] - data = convert(Vector{UInt64}, data) - fmt.recur ∈ (:off, :flat, :flatc) || throw(ArgumentError("recur value not recognized")) - if format === :tree - tree(io, data, lidict, cols, fmt) - elseif format === :flat - fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off")) - flat(io, data, lidict, cols, fmt) - else - throw(ArgumentError("output format $(repr(format)) not recognized")) - end -end - -""" - print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...) - -Prints profiling results to `io`. This variant is used to examine results exported by a -previous call to [`retrieve`](@ref). Supply the vector `data` of backtraces and -a dictionary `lidict` of line information. - -See `Profile.print([io], data)` for an explanation of the valid keyword arguments. -""" -print(data::Vector{<:Unsigned} = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = - print(stdout, data, lidict; kwargs...) - -""" - retrieve() -> data, lidict - -"Exports" profiling results in a portable format, returning the set of all backtraces -(`data`) and a dictionary that maps the (session-specific) instruction pointers in `data` to -`LineInfo` values that store the file name, function name, and line number. 
This function -allows you to save profiling results for future analysis. -""" -function retrieve() - data = fetch() - return (data, getdict(data)) -end - -function getdict(data::Vector{UInt}) - dict = LineInfoDict() - for ip in data - get!(() -> lookup(convert(Ptr{Cvoid}, ip)), dict, UInt64(ip)) - end - return dict -end - -""" - flatten(btdata::Vector, lidict::LineInfoDict) -> (newdata::Vector{UInt64}, newdict::LineInfoFlatDict) - -Produces "flattened" backtrace data. Individual instruction pointers -sometimes correspond to a multi-frame backtrace due to inlining; in -such cases, this function inserts fake instruction pointers for the -inlined calls, and returns a dictionary that is a 1-to-1 mapping -between instruction pointers and a single StackFrame. -""" -function flatten(data::Vector, lidict::LineInfoDict) - # Makes fake instruction pointers, counting down from typemax(UInt) - newip = typemax(UInt64) - 1 - taken = Set(keys(lidict)) # make sure we don't pick one that's already used - newdict = Dict{UInt64,StackFrame}() - newmap = Dict{UInt64,Vector{UInt64}}() - for (ip, trace) in lidict - if length(trace) == 1 - newdict[ip] = trace[1] - else - newm = UInt64[] - for sf in trace - while newip ∈ taken && newip > 0 - newip -= 1 - end - newip == 0 && error("all possible instruction pointers used") - push!(newm, newip) - newdict[newip] = sf - newip -= 1 - end - newmap[ip] = newm - end - end - newdata = UInt64[] - for ip::UInt64 in data - if haskey(newmap, ip) - append!(newdata, newmap[ip]) - else - push!(newdata, ip) - end +# Use a max size of 1M profile samples, and fire timer every 1ms, except on Windows +function __init__() + timer_ms = 0.001 + @static if Sys.iswindows() + timer_ms = 0.01 end - return (newdata, newdict) + Time.init(1_000_000, timer_ms) + Memory.init(20_000_000, 1_000_000, 0xffff) end -# Take a file-system path and try to form a concise representation of it -# based on the package ecosystem -function short_path(spath::Symbol, 
filenamecache::Dict{Symbol, String}) - return get!(filenamecache, spath) do - path = string(spath) - if isabspath(path) - if ispath(path) - # try to replace the file-system prefix with a short "@Module" one, - # assuming that profile came from the current machine - # (or at least has the same file-system layout) - root = path - while !isempty(root) - root, base = splitdir(root) - isempty(base) && break - @assert startswith(path, root) - for proj in Base.project_names - project_file = joinpath(root, proj) - if Base.isfile_casesensitive(project_file) - pkgid = Base.project_file_name_uuid(project_file, "") - isempty(pkgid.name) && return path # bad Project file - # return the joined the module name prefix and path suffix - path = path[nextind(path, sizeof(root)):end] - return string("@", pkgid.name, path) - end - end - end - end - return path - elseif isfile(joinpath(Sys.BINDIR::String, Base.DATAROOTDIR, "julia", "base", path)) - # do the same mechanic for Base (or Core/Compiler) files as above, - # but they start from a relative path - return joinpath("@Base", normpath(path)) - else - # for non-existent relative paths (such as "REPL[1]"), just consider simplifying them - return normpath(path) # drop leading "./" - end - end -end +include("stacktrace_tools.jl") +include("memory.jl") +include("time.jl") +include("compat.jl") -""" - callers(funcname, [data, lidict], [filename=], [linerange=]) -> Vector{Tuple{count, lineinfo}} - -Given a previous profiling run, determine who called a particular function. Supplying the -filename (and optionally, range of line numbers over which the function is defined) allows -you to disambiguate an overloaded method. The returned value is a vector containing a count -of the number of calls and line information about the caller. One can optionally supply -backtrace `data` obtained from [`retrieve`](@ref); otherwise, the current internal -profile buffer is used. 
-""" -function callers end - -function callers(funcname::String, bt::Vector, lidict::LineInfoFlatDict; filename = nothing, linerange = nothing) - if filename === nothing && linerange === nothing - return callersf(li -> String(li.func) == funcname, - bt, lidict) - end - filename === nothing && throw(ArgumentError("if supplying linerange, you must also supply the filename")) - filename = String(filename) - if linerange === nothing - return callersf(li -> String(li.func) == funcname && String(li.file) == filename, - bt, lidict) - else - return callersf(li -> String(li.func) == funcname && String(li.file) == filename && in(li.line, linerange), - bt, lidict) - end -end - -callers(funcname::String, bt::Vector, lidict::LineInfoDict; kwargs...) = - callers(funcname, flatten(bt, lidict)...; kwargs...) -callers(funcname::String; kwargs...) = callers(funcname, retrieve()...; kwargs...) -callers(func::Function, bt::Vector, lidict::LineInfoFlatDict; kwargs...) = - callers(string(func), bt, lidict; kwargs...) -callers(func::Function; kwargs...) = callers(string(func), retrieve()...; kwargs...) - -## -## For --track-allocation -## -# Reset the malloc log. Used to avoid counting memory allocated during -# compilation. - -""" - clear_malloc_data() - -Clears any stored memory allocation data when running julia with `--track-allocation`. -Execute the command(s) you want to test (to force JIT-compilation), then call -[`clear_malloc_data`](@ref). Then execute your command(s) again, quit -Julia, and examine the resulting `*.mem` files. 
-""" -clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) - -# C wrappers -start_timer() = ccall(:jl_profile_start_timer, Cint, ()) - -stop_timer() = ccall(:jl_profile_stop_timer, Cvoid, ()) - -is_running() = ccall(:jl_profile_is_running, Cint, ())!=0 - -get_data_pointer() = convert(Ptr{UInt}, ccall(:jl_profile_get_data, Ptr{UInt8}, ())) - -len_data() = convert(Int, ccall(:jl_profile_len_data, Csize_t, ())) - -maxlen_data() = convert(Int, ccall(:jl_profile_maxlen_data, Csize_t, ())) - -error_codes = Dict( - -1=>"cannot specify signal action for profiling", - -2=>"cannot create the timer for profiling", - -3=>"cannot start the timer for profiling", - -4=>"cannot unblock SIGUSR1") - - -""" - fetch() -> data - -Returns a copy of the buffer of profile backtraces. Note that the -values in `data` have meaning only on this machine in the current session, because it -depends on the exact memory addresses used in JIT-compiling. This function is primarily for -internal use; [`retrieve`](@ref) may be a better choice for most users. -""" -function fetch() - maxlen = maxlen_data() - len = len_data() - if (len == maxlen) - @warn """The profile data buffer is full; profiling probably terminated - before your program finished. 
To profile for longer runs, call - `Profile.init()` with a larger buffer and/or larger delay.""" - end - data = Vector{UInt}(undef, len) - GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len) - return data -end - - -## Print as a flat list -# Counts the number of times each line appears, at any nesting level and at the topmost level -# Merging multiple equivalent entries and recursive calls -function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool) where {T} - lilist = StackFrame[] - n = Int[] - m = Int[] - lilist_idx = Dict{T, Int}() - recursive = Set{T}() - first = true - totalshots = 0 - for ip in data - if ip == 0 - totalshots += 1 - empty!(recursive) - first = true - continue - end - frames = lidict[ip] - nframes = (frames isa Vector ? length(frames) : 1) - for i = 1:nframes - frame = (frames isa Vector ? frames[i] : frames) - !C && frame.from_c && continue - key = (T === UInt64 ? ip : frame) - idx = get!(lilist_idx, key, length(lilist) + 1) - if idx > length(lilist) - push!(recursive, key) - push!(lilist, frame) - push!(n, 1) - push!(m, 0) - elseif !(key in recursive) - push!(recursive, key) - n[idx] += 1 - end - if first - m[idx] += 1 - first = false - end - end - end - @assert length(lilist) == length(n) == length(m) == length(lilist_idx) - return (lilist, n, m, totalshots) -end - -function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat) - lilist, n, m, totalshots = parse_flat(fmt.combine ? 
StackFrame : UInt64, data, lidict, fmt.C) - if isempty(lilist) - warning_empty() - return - end - if false # optional: drop the "non-interpretable" ones - keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist) - lilist = lilist[keep] - n = n[keep] - m = m[keep] - end - filenamemap = Dict{Symbol,String}() - print_flat(io, lilist, n, m, cols, filenamemap, fmt) - Base.println(io, "Total snapshots: ", totalshots) - nothing -end - -function print_flat(io::IO, lilist::Vector{StackFrame}, - n::Vector{Int}, m::Vector{Int}, - cols::Int, filenamemap::Dict{Symbol,String}, - fmt::ProfileFormat) - if fmt.sortedby == :count - p = sortperm(n) - elseif fmt.sortedby == :overhead - p = sortperm(m) - else - p = liperm(lilist) - end - lilist = lilist[p] - n = n[p] - m = m[p] - filenames = String[short_path(li.file, filenamemap) for li in lilist] - funcnames = String[string(li.func) for li in lilist] - wcounts = max(6, ndigits(maximum(n))) - wself = max(9, ndigits(maximum(m))) - maxline = 1 - maxfile = 6 - maxfunc = 10 - for i in 1:length(lilist) - li = lilist[i] - maxline = max(maxline, li.line) - maxfunc = max(maxfunc, length(funcnames[i])) - maxfile = max(maxfile, length(filenames[i])) - end - wline = max(5, ndigits(maxline)) - ntext = max(20, cols - wcounts - wself - wline - 3) - maxfunc += 25 # for type signatures - if maxfile + maxfunc <= ntext - wfile = maxfile - wfunc = ntext - maxfunc # take the full width (for type sig) - else - wfile = 2*ntext÷5 - wfunc = 3*ntext÷5 - end - println(io, lpad("Count", wcounts, " "), " ", lpad("Overhead", wself, " "), " ", - rpad("File", wfile, " "), " ", lpad("Line", wline, " "), " Function") - println(io, lpad("=====", wcounts, " "), " ", lpad("========", wself, " "), " ", - rpad("====", wfile, " "), " ", lpad("====", wline, " "), " ========") - for i = 1:length(n) - n[i] < fmt.mincount && continue - li = lilist[i] - Base.print(io, lpad(string(n[i]), wcounts, " "), " ") - Base.print(io, lpad(string(m[i]), wself, " "), " ") - if li == 
UNKNOWN - if !fmt.combine && li.pointer != 0 - Base.print(io, "@0x", string(li.pointer, base=16)) - else - Base.print(io, "[any unknown stackframes]") - end - else - file = filenames[i] - isempty(file) && (file = "[unknown file]") - Base.print(io, rpad(rtruncto(file, wfile), wfile, " "), " ") - Base.print(io, lpad(li.line > 0 ? string(li.line) : "?", wline, " "), " ") - fname = funcnames[i] - if !li.from_c && li.linfo !== nothing - fname = sprint(show_spec_linfo, li) - end - isempty(fname) && (fname = "[unknown function]") - Base.print(io, ltruncto(fname, wfunc)) - end - println(io) - end - nothing -end - -## A tree representation - -# Representation of a prefix trie of backtrace counts -mutable struct StackFrameTree{T} # where T <: Union{UInt64, StackFrame} - # content fields: - frame::StackFrame - count::Int # number of frames this appeared in - overhead::Int # number frames where this was the code being executed - flat_count::Int # number of times this frame was in the flattened representation (unlike count, this'll sum to 100% of parent) - max_recur::Int # maximum number of times this frame was the *top* of the recursion in the stack - count_recur::Int # sum of the number of times this frame was the *top* of the recursion in a stack (divide by count to get an average) - down::Dict{T, StackFrameTree{T}} - # construction workers: - recur::Int - builder_key::Vector{UInt64} - builder_value::Vector{StackFrameTree{T}} - up::StackFrameTree{T} - StackFrameTree{T}() where {T} = new(UNKNOWN, 0, 0, 0, 0, 0, Dict{T, StackFrameTree{T}}(), 0, UInt64[], StackFrameTree{T}[]) -end - - -const indent_s = " ╎"^10 -const indent_z = collect(eachindex(indent_s)) -function indent(depth::Int) - depth < 1 && return "" - depth <= length(indent_z) && return indent_s[1:indent_z[depth]] - div, rem = divrem(depth, length(indent_z)) - return (indent_s^div) * SubString(indent_s, 1, indent_z[rem]) -end - -function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, maxes, 
filenamemap::Dict{Symbol,String}, showpointer::Bool) - nindent = min(cols>>1, level) - ndigoverhead = ndigits(maxes.overhead) - ndigcounts = ndigits(maxes.count) - ndigline = ndigits(maximum(frame.frame.line for frame in frames)) + 6 - ntext = max(30, cols - ndigoverhead - nindent - ndigcounts - ndigline - 6) - widthfile = 2*ntext÷5 # min 12 - widthfunc = 3*ntext÷5 # min 18 - strs = Vector{String}(undef, length(frames)) - showextra = false - if level > nindent - nextra = level - nindent - nindent -= ndigits(nextra) + 2 - showextra = true - end - for i = 1:length(frames) - frame = frames[i] - li = frame.frame - stroverhead = lpad(frame.overhead > 0 ? string(frame.overhead) : "", ndigoverhead, " ") - base = nindent == 0 ? "" : indent(nindent - 1) * " " - if showextra - base = string(base, "+", nextra, " ") - end - strcount = rpad(string(frame.count), ndigcounts, " ") - if li != UNKNOWN - if li.line == li.pointer - strs[i] = string(stroverhead, "╎", base, strcount, " ", - "[unknown function] (pointer: 0x", - string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), - ")") - else - if !li.from_c && li.linfo !== nothing - fname = sprint(show_spec_linfo, li) - else - fname = string(li.func) - end - filename = short_path(li.file, filenamemap) - if showpointer - fname = string( - "0x", - string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), - " ", - fname) - end - strs[i] = string(stroverhead, "╎", base, strcount, " ", - rtruncto(filename, widthfile), - ":", - li.line == -1 ? "?" 
: string(li.line), - "; ", - ltruncto(fname, widthfunc)) - end - else - strs[i] = string(stroverhead, "╎", base, strcount, " [unknown stackframe]") - end - end - return strs -end - -# turn a list of backtraces into a tree (implicitly separated by NULL markers) -function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol) where {T} - parent = root - tops = Vector{StackFrameTree{T}}() - build = Vector{StackFrameTree{T}}() - startframe = length(all) - for i in startframe:-1:1 - ip = all[i] - if ip == 0 - # sentinel value indicates the start of a new backtrace - empty!(build) - root.recur = 0 - if recur !== :off - # We mark all visited nodes to so we'll only count those branches - # once for each backtrace. Reset that now for the next backtrace. - push!(tops, parent) - for top in tops - while top.recur != 0 - top.max_recur < top.recur && (top.max_recur = top.recur) - top.recur = 0 - top = top.up - end - end - empty!(tops) - end - let this = parent - while this !== root - this.flat_count += 1 - this = this.up - end - end - parent.overhead += 1 - parent = root - root.count += 1 - startframe = i - else - pushfirst!(build, parent) - if recur === :flat || recur == :flatc - # Rewind the `parent` tree back, if this exact ip was already present *higher* in the current tree - found = false - for j in 1:(startframe - i) - if ip == all[i + j] - if recur === :flat # if not flattening C frames, check that now - frames = lidict[ip] - frame = (frames isa Vector ? 
frames[1] : frames) - frame.from_c && break # not flattening this frame - end - push!(tops, parent) - parent = build[j] - parent.recur += 1 - parent.count_recur += 1 - found = true - break - end - end - found && continue - end - builder_key = parent.builder_key - builder_value = parent.builder_value - fastkey = searchsortedfirst(builder_key, ip) - if fastkey < length(builder_key) && builder_key[fastkey] === ip - # jump forward to the end of the inlining chain - # avoiding an extra (slow) lookup of `ip` in `lidict` - # and an extra chain of them in `down` - # note that we may even have this === parent (if we're ignoring this frame ip) - this = builder_value[fastkey] - let this = this - while this !== parent && (recur === :off || this.recur == 0) - this.count += 1 - this.recur = 1 - this = this.up - end - end - parent = this - continue - end - frames = lidict[ip] - nframes = (frames isa Vector ? length(frames) : 1) - this = parent - # add all the inlining frames - for i = nframes:-1:1 - frame = (frames isa Vector ? frames[i] : frames) - !C && frame.from_c && continue - key = (T === UInt64 ? 
ip : frame) - this = get!(StackFrameTree{T}, parent.down, key) - if recur === :off || this.recur == 0 - this.frame = frame - this.up = parent - this.count += 1 - this.recur = 1 - end - parent = this - end - # record where the end of this chain is for this ip - insert!(builder_key, fastkey, ip) - insert!(builder_value, fastkey, this) - end - end - function cleanup!(node::StackFrameTree) - stack = [node] - while !isempty(stack) - node = pop!(stack) - node.recur = 0 - empty!(node.builder_key) - empty!(node.builder_value) - append!(stack, values(node.down)) - end - nothing - end - cleanup!(root) - return root -end - -function maxstats(root::StackFrameTree) - maxcount = Ref(0) - maxflatcount = Ref(0) - maxoverhead = Ref(0) - maxmaxrecur = Ref(0) - stack = [root] - while !isempty(stack) - node = pop!(stack) - maxcount[] = max(maxcount[], node.count) - maxoverhead[] = max(maxoverhead[], node.overhead) - maxflatcount[] = max(maxflatcount[], node.flat_count) - maxmaxrecur[] = max(maxmaxrecur[], node.max_recur) - append!(stack, values(node.down)) - end - return (count=maxcount[], count_flat=maxflatcount[], overhead=maxoverhead[], max_recur=maxmaxrecur[]) -end - -# Print the stack frame tree starting at a particular root. Uses a worklist to -# avoid stack overflows. 
-function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat) where T - maxes = maxstats(bt) - filenamemap = Dict{Symbol,String}() - worklist = [(bt, 0, 0, "")] - println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") - println(io, "=========================================================") - while !isempty(worklist) - (bt, level, noisefloor, str) = popfirst!(worklist) - isempty(str) || println(io, str) - level > fmt.maxdepth && continue - isempty(bt.down) && continue - # Order the line information - nexts = collect(values(bt.down)) - # Generate the string for each line - strs = tree_format(nexts, level, cols, maxes, filenamemap, T === UInt64) - # Recurse to the next level - if fmt.sortedby == :count - counts = collect(frame.count for frame in nexts) - p = sortperm(counts) - elseif fmt.sortedby == :overhead - m = collect(frame.overhead for frame in nexts) - p = sortperm(m) - elseif fmt.sortedby == :flat_count - m = collect(frame.flat_count for frame in nexts) - p = sortperm(m) - else - lilist = collect(frame.frame for frame in nexts) - p = liperm(lilist) - end - for i in reverse(p) - down = nexts[i] - count = down.count - count < fmt.mincount && continue - count < noisefloor && continue - str = strs[i] - noisefloor_down = fmt.noisefloor > 0 ? 
floor(Int, fmt.noisefloor * sqrt(count)) : 0 - pushfirst!(worklist, (down, level + 1, noisefloor_down, str)) - end - end -end - -function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat) - if fmt.combine - root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur) - else - root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur) - end - if isempty(root.down) - warning_empty() - return - end - print_tree(io, root, cols, fmt) - Base.println(io, "Total snapshots: ", root.count) - nothing -end - -function callersf(matchfunc::Function, bt::Vector, lidict::LineInfoFlatDict) - counts = Dict{StackFrame, Int}() - lastmatched = false - for id in bt - if id == 0 - lastmatched = false - continue - end - li = lidict[id] - if lastmatched - if haskey(counts, li) - counts[li] += 1 - else - counts[li] = 1 - end - end - lastmatched = matchfunc(li) - end - k = collect(keys(counts)) - v = collect(values(counts)) - p = sortperm(v, rev=true) - return [(v[i], k[i]) for i in p] -end - -# Utilities -function rtruncto(str::String, w::Int) - if length(str) <= w - return str - else - return string("...", str[prevind(str, end, w-4):end]) - end -end -function ltruncto(str::String, w::Int) - if length(str) <= w - return str - else - return string(str[1:nextind(str, 1, w-4)], "...") - end -end - - -truncto(str::Symbol, w::Int) = truncto(string(str), w) - -# Order alphabetically (file, function) and then by line number -function liperm(lilist::Vector{StackFrame}) - function lt(a::StackFrame, b::StackFrame) - a == UNKNOWN && return false - b == UNKNOWN && return true - fcmp = cmp(a.file, b.file) - fcmp < 0 && return true - fcmp > 0 && return false - fcmp = cmp(a.func, b.func) - fcmp < 0 && return true - fcmp > 0 && return false - fcmp = cmp(a.line, b.line) - fcmp < 0 && return true - return false - end - return sortperm(lilist, lt = lt) -end +using .Memory, .Time -warning_empty() = @warn """ - There were 
no samples collected. Run your program longer (perhaps by - running it multiple times), or adjust the delay between samples with - `Profile.init()`.""" +# Export our two measuring tools +import .Time: @profile +import .Memory: @memprofile +export @profile, @memprofile end # module diff --git a/stdlib/Profile/src/compat.jl b/stdlib/Profile/src/compat.jl new file mode 100644 index 00000000000000..29f74b4450a84b --- /dev/null +++ b/stdlib/Profile/src/compat.jl @@ -0,0 +1,22 @@ +# Compatibility shims for old users who aren't used to the `Time` and `Memory` sub-modules + +function init(args...; kwargs...) + @warn("Profile.init() is deprecated, use Profile.Time.init() or Profile.Memory.init() directly") + Time.init(args...; kwargs...) +end + +function print(args...; kwargs...) + @warn("Profile.print() is deprecated, use Profile.Time.print() or Profile.Memory.print() directly") + Time.print(args...; kwargs...) +end + + +""" + clear_malloc_data() + +Clears any stored memory allocation data when running julia with `--track-allocation`. +Execute the command(s) you want to test (to force JIT-compilation), then call +[`clear_malloc_data`](@ref). Then execute your command(s) again, quit +Julia, and examine the resulting `*.mem` files. +""" +clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) diff --git a/stdlib/Profile/src/memory.jl b/stdlib/Profile/src/memory.jl new file mode 100644 index 00000000000000..8e331b11698a17 --- /dev/null +++ b/stdlib/Profile/src/memory.jl @@ -0,0 +1,349 @@ +module Memory + +using ..Profile: getdict + +using Base.StackTraces + + +## +## For --track-allocation (unrelated to @memprofile) +## +# Reset the malloc log. Used to avoid counting memory allocated during +# compilation. + +""" + clear_malloc_data() + +Clears any stored memory allocation data when running julia with `--track-allocation`. +Execute the command(s) you want to test (to force JIT-compilation), then call +[`clear_malloc_data`](@ref). 
Then execute your command(s) again, quit
+Julia, and examine the resulting `*.mem` files.
+"""
+clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ())
+
+
+tag_lookup(d::Dict, s::Symbol) = d[s]
+tag_lookup(d::Dict, v::Vector{Symbol}) = mapreduce(s->d[s], |, v)
+tag_lookup(d::Dict, n::Number) = [t for (t, v) in d if (v & n == v) && (t != :all)]
+
+const memory_domain_map = Dict(
+    :cpu => 0x0001,
+    :gpu => 0x0002,
+    :external => 0x0080,
+
+    :all => 0x00ff,
+)
+const allocator_map = Dict(
+    :std => 0x0100,
+    :pool => 0x0200,
+    :big => 0x0400,
+
+    :all => 0xff00,
+)
+const closing_tag = 0x8000
+
+"""
+    build_tag_filter(memory_domain = :all, allocator = :all)
+
+Build a memory profile tag filter that will capture only events matching the given
+filter parameters. This method is meant to mirror the `#define`'d `JL_MEMPROF_TAG_*`
+values within `julia_internal.h`. Valid memory domains are `:cpu`, `:gpu`, `:external`
+and `:all`. Valid allocators are `:std`, `:pool`, `:big`, and `:all`.
+
+You can build a union of multiple allocators and memory domains by passing in a
+vector of symbols, e.g. `build_tag_filter(:all, [:std, :pool])`.
+"""
+function build_tag_filter(memory_domain = :all, allocator = :all)
+    return tag_lookup(memory_domain_map, memory_domain) | tag_lookup(allocator_map, allocator)
+end
+
+"""
+    init(; bt_size::Integer, alloc_size::Integer, tag_filter::UInt16)
+
+Configure the number `bt_size` of instruction pointers that may be stored for backtraces
+of memory allocation and deallocation locations, as well as the number of allocation event
+structures that are prepared for when profiling memory usage. Each instruction pointer
+corresponds to a single line of code; backtraces generally consist of a long list of
+instruction pointers. Default settings can be obtained by calling this function with no
+arguments, and each can be set independently using keywords or in the order `(bt_size, alloc_size)`.
+ +`tag_filter` can be used to filter what kind of events are captured by the memory profiler, +see `build_tag_filter()` for more. +""" +function init(; bt_size::Union{Nothing,Integer} = nothing, + alloc_size::Union{Nothing,Integer} = nothing, + tag_filter::UInt16 = build_tag_filter()) + bt_size = something(bt_size, get_memprofile_bt_data_maxlen()) + alloc_size = something(alloc_size, get_memprofile_alloc_data_maxlen()) + + # Sub off to our friend + return init(bt_size, alloc_size, tag_filter) +end + +function init(bt_size::Integer, alloc_size::Integer, tag_filter::UInt16) + status = ccall(:jl_memprofile_init, Cint, (Csize_t, Csize_t, UInt16), bt_size, alloc_size, tag_filter) + if status == -1 + error("could not allocate space for ", bt_size, " instruction pointers and ", alloc_size, " allocation event structures") + end +end + +# This struct must stay in sync with `src/memprofiler.c` +struct allocation_info_t + address::Ptr{Cvoid} + T::Ptr{Cvoid} + time::Float64 + allocsz::Csize_t + tag::UInt16 +end + +# This struct is a combined version of the above `allocation_info_t` with backtrace data +# and automatic GC marking for the objects that are still open so that they don't get +# collected while we're modifying them. +struct AllocationInfo + address::UInt + T::Any + alloc_time::Float64 + free_time::Float64 + alloc_stacktrace::Vector{StackFrame} + free_stacktrace::Vector{StackFrame} + allocsz::UInt64 + tag::UInt16 + + # For an "open" allocation, this is the object that is still alive. For a "closed" + # allocation, this is just `nothing`. + object::Any +end + +# If a chunk is "open", we know it's live so actually grab the object and its type +# The only subtlety is when it's a malloc'd list. 
+function update_live_object(a::AllocationInfo) + array_addr = ccall(:jl_memprofile_find_malloc_array, Ptr{Cvoid}, (Ptr{Cvoid},), Ptr{Cvoid}(a.address)) + if array_addr != C_NULL + obj = unsafe_pointer_to_objref(array_addr) + else + # Not doing this right now, because it appears that they can ~~disappear~~ + obj = nothing # unsafe_pointer_to_objref(Ptr{Cvoid}(a.address)) + end + + return AllocationInfo( + a.address, + a.T, + a.alloc_time, + a.free_time, + a.alloc_stacktrace, + a.free_stacktrace, + a.allocsz, + a.tag, + obj + ) +end + +function Base.show(io::IO, a::AllocationInfo) + md = join(tag_lookup(memory_domain_map, a.tag), ", ") + at = join(tag_lookup(allocator_map, a.tag), ", ") + T = a.T === nothing ? "(?)" : string(a.T) + print(io, "AllocationInfo (@$(repr(a.address)): $(a.allocsz) B, domain: ($(md)), allocator: ($(at)), type: $(T))") +end + +""" + open_AI(a::allocation_info_t, alloc_stacktrace) + +Create an `AllocationInfo` that represents an allocated (and not yet deallocated) chunk +of memory. This `AllocationInfo` will later be "closed" to represent the full lifecycle +of an allocated and then freed piece of memory. This function has also been known to +create highly competitive dota agents. +""" +function open_AI(a::allocation_info_t, alloc_stacktrace) + # If a_type is still C_NULL, just set the type as `nothing`: + if a.T == C_NULL + a_type = nothing + else + a_type = unsafe_pointer_to_objref(a.T) + end + + return AllocationInfo( + UInt(a.address), + a_type, + a.time, + 0.0, + alloc_stacktrace, + StackFrame[], + UInt64(a.allocsz), + a.tag, + + # We initialize this as `nothing` because we don't know if this + # object is actually still live or not. + nothing, + ) +end + +""" + close_AI(a::AllocationInfo, d::allocation_info_t, dealloc_stacktrace) + +Mark a previously opened `AllocationInfo` as closed, denoting that we know both the +allocation and deallocation points for this chunk of memory. 
+""" +function close_AI(a::AllocationInfo, d::allocation_info_t, dealloc_stacktrace) + # Ensure that we're dealing with the same memory location + @assert a.address == UInt(d.address) "Attempting to close different memory locations! ($(a.address) != $(UInt(d.address)))" + @assert (a.tag | closing_tag) == d.tag "Attempting to close memory from different allocators/domains! ($(a.tag | closing_tag) != $(d.tag))" + + # If a_type is still C_NULL, just set the type as `nothing`: + if d.T == C_NULL + d_type = nothing + else + d_type = unsafe_pointer_to_objref(d.T) + end + + nstr(x) = x === nothing ? "nothing" : string(x) + @assert ((d_type == nothing || a.T == nothing) || d_type == a.T) "Mismatched types! ($(nstr(d_type)) != $(nstr(a.T)))" + + return AllocationInfo( + a.address, + something(a.T, d_type, Some(nothing)), + a.alloc_time, + d.time, + a.alloc_stacktrace, + dealloc_stacktrace, + a.allocsz, + a.tag, + nothing, + ) +end + +macro memprofile(ex) + return quote + try + # Run a GC before we start profiling, to eliminate "ghost" chunks + Base.GC.gc() + start_memprofile() + + # Explicit `begin/end` to help GC scope resolution + begin + $(esc(ex)) + end + finally + # Run a GC afterwards to eliminate things that aren't actually leaked, + # they just weren't cleaned up before we stopped profiling. 
+ Base.GC.gc() + stop_memprofile() + end + end +end + +start_memprofile() = ccall(:jl_memprofile_start, Cvoid, ()) +stop_memprofile() = ccall(:jl_memprofile_stop, Cvoid, ()) + +get_memprofile_bt_data() = ccall(:jl_memprofile_get_bt_data, Ptr{UInt}, ()) +get_memprofile_bt_data_len() = convert(Int, ccall(:jl_memprofile_len_bt_data, Csize_t, ())) +get_memprofile_bt_data_maxlen() = convert(Int, ccall(:jl_memprofile_maxlen_bt_data, Csize_t, ())) + +get_memprofile_alloc_data() = ccall(:jl_memprofile_get_alloc_data, Ptr{allocation_info_t}, ()) +get_memprofile_alloc_data_len() = convert(Int, ccall(:jl_memprofile_len_alloc_data, Csize_t, ())) +get_memprofile_alloc_data_maxlen() = convert(Int, ccall(:jl_memprofile_maxlen_alloc_data, Csize_t, ())) + +get_memprofile_overflow() = ccall(:jl_memprofile_overflow, Cint, ()) +get_memprofile_tag_filter() = UInt16(ccall(:jl_memprofile_tag_filter, Cint, ())) + +""" + read_and_coalesce_memprofile_data() + +Load in the backtrace and allocation data buffers from the C side of Julia-land, returns two +vectors, `open_chunks` and `closed_chunks`, which represent all allocations that are yet to +be freed, and all allocations that were properly freed. +""" +function read_and_coalesce_memprofile_data() + overflow = get_memprofile_overflow() + if overflow & 0x01 != 0 + @warn """The memory profile backtrace buffer overflowed; profiling terminated + before your program finished. To profile for longer runs, call + `Profile.init()` with a larger buffer and/or larger delay.""" + end + if overflow & 0x02 != 0 + @warn """The memory profile allocation info buffer overflowed; profiling terminated + before your program finished. 
To profile for longer runs, call
+                 `Profile.init()` with a larger buffer and/or larger delay."""
+    end
+
+    bt_data = unsafe_wrap(Array, get_memprofile_bt_data(), (get_memprofile_bt_data_len(),))
+    alloc_data = unsafe_wrap(Array, get_memprofile_alloc_data(), (get_memprofile_alloc_data_len(),))
+
+    # Build backtrace lookup table
+    bt_data = Base._reformat_bt(bt_data)
+    bt_lookup = getdict(bt_data)
+
+    # These are chunks of memory that have been allocated, but not yet freed. They are keyed by
+    # (memory_address, tag) for easy lookup
+    open_chunks_map = Dict{Tuple{UInt,UInt16},AllocationInfo}()
+
+    # These are chunks of memory that have been allocated and then freed. They will start in
+    # open_chunks_map, then get migrated over here once they are freed.
+    closed_chunks = AllocationInfo[]
+
+    # These are chunks of memory that were free'd but we didn't see them get created.
+    ghost_chunks = Tuple[]
+
+    # Loop over every event in `alloc_data`, looking for identical memory pointers; attempt to pair
+    # them together to create "closed" chunks:
+    bt_idx = 1
+    for a in alloc_data
+        # Helper function to extract the next backtrace from the buffer
+        function next_bt_trace(bt_data, bt_idx)
+            bt_end_idx = bt_idx
+            while bt_data[bt_end_idx] != 0x0
+                bt_end_idx += 1
+            end
+            bt = [bt_lookup[x][1] for x in bt_data[bt_idx:bt_end_idx-1]]
+            return (bt, bt_end_idx + 1)
+        end
+
+        # Extract the backtrace associated with this allocation event
+        bt, bt_idx = next_bt_trace(bt_data, bt_idx)
+
+        # Construct the "chunk identifier" that we use to look up within open_chunks_map, etc...
+        chunk_id = (UInt(a.address), a.tag | closing_tag)
+
+        # Are we an allocation?
+        if a.allocsz != 0
+            @info("Opening $(chunk_id) $(a.T) $(a.allocsz)")
+
+            # Assert that we're not inserting an identical chunk
+            @assert !(chunk_id in keys(open_chunks_map)) "Doubly-opened memory chunk!"
+            open_chunks_map[chunk_id] = open_AI(a, bt)
+        else
+            @info("Closing $(chunk_id) $(a.T)")
+
+            # If this `a` represents a free(), let's see if we're closing a previously opened chunk.
+            if !(chunk_id in keys(open_chunks_map))
+                push!(ghost_chunks, (a, bt))
+            else
+                # "Close" the chunk by adding deallocation information to it, then remove
+                # that chunk from the open_chunks_map!
+                push!(closed_chunks, close_AI(open_chunks_map[chunk_id], a, bt))
+                delete!(open_chunks_map, chunk_id)
+            end
+        end
+    end
+
+    if !isempty(ghost_chunks)
+        @warn "Attempted to close $(length(ghost_chunks)) ghost memory chunks!"
+    end
+
+    # Any leftover "open" chunks are just plain old leaked memory.
+    open_chunks = sort(collect(values(open_chunks_map)); by = x -> x.alloc_time)
+    closed_chunks = sort(closed_chunks, by = x -> x.alloc_time)
+
+    # Leftover "open" chunks still have their memory, so let's get their types!
+    open_chunks = update_live_object.(open_chunks)
+    return open_chunks, closed_chunks, ghost_chunks
+end
+
+
+"""
+    clear_memprofile_data()
+
+Clear any existing memory profile data from the internal buffers.
+""" +clear_memprofile_data() = ccall(:jl_memprofile_clear_data, Cvoid, ()) + +end # module Memory diff --git a/stdlib/Profile/src/stacktrace_tools.jl b/stdlib/Profile/src/stacktrace_tools.jl new file mode 100644 index 00000000000000..8d0ef7dad182ad --- /dev/null +++ b/stdlib/Profile/src/stacktrace_tools.jl @@ -0,0 +1,628 @@ +import Base.StackTraces: lookup, UNKNOWN, show_spec_linfo, StackFrame + +# deprecated functions: use `getdict` instead +lookup(ip::UInt) = lookup(convert(Ptr{Cvoid}, ip)) + + +const IP = Union{UInt,Base.InterpreterIP} +const LineInfoDict = Dict{IP, Vector{StackFrame}} +const LineInfoFlatDict = Dict{IP, StackFrame} + +struct ProfileFormat + maxdepth::Int + mincount::Int + noisefloor::Float64 + sortedby::Symbol + combine::Bool + C::Bool + recur::Symbol + function ProfileFormat(; + C = false, + combine = true, + maxdepth::Int = typemax(Int), + mincount::Int = 0, + noisefloor = 0, + sortedby::Symbol = :filefuncline, + recur::Symbol = :off) + return new(maxdepth, mincount, noisefloor, sortedby, combine, C, recur) + end +end + +""" + getdict(data::Vector{UInt}) +Given a list of instruction pointers, build a dictionary mapping from those pointers to +LineInfo objects (as returned by `StackTraces.lookup()`). This allows us to quickly take +a list of instruction pointers and convert it to `LineInfo` objects. +""" +function getdict(data) + return LineInfoDict(x=>lookup(x) for x in unique(data)) +end + +""" + flatten(btdata::Vector, lidict::LineInfoDict) -> (newdata::Vector{UInt64}, newdict::LineInfoFlatDict) + +Produces "flattened" backtrace data. Individual instruction pointers +sometimes correspond to a multi-frame backtrace due to inlining; in +such cases, this function inserts fake instruction pointers for the +inlined calls, and returns a dictionary that is a 1-to-1 mapping +between instruction pointers and a single StackFrame. 
+""" +function flatten(data::Vector, lidict::LineInfoDict) + # Makes fake instruction pointers, counting down from typemax(UInt) + newip = typemax(UInt64) - 1 + taken = Set(keys(lidict)) # make sure we don't pick one that's already used + newdict = Dict{IP,StackFrame}() + newmap = Dict{IP,Vector{UInt64}}() + for (ip, trace) in lidict + if length(trace) == 1 + newdict[ip] = trace[1] + else + newm = UInt64[] + for sf in trace + while newip ∈ taken && newip > 0 + newip -= 1 + end + newip == 0 && error("all possible instruction pointers used") + push!(newm, newip) + newdict[newip] = sf + newip -= 1 + end + newmap[ip] = newm + end + end + newdata = UInt64[] + for ip::UInt64 in data + if haskey(newmap, ip) + append!(newdata, newmap[ip]) + else + push!(newdata, ip) + end + end + return (newdata, newdict) +end + +# Take a file-system path and try to form a concise representation of it +# based on the package ecosystem +function short_path(spath::Symbol, filenamecache::Dict{Symbol, String}) + return get!(filenamecache, spath) do + path = string(spath) + if isabspath(path) + if ispath(path) + # try to replace the file-system prefix with a short "@Module" one, + # assuming that profile came from the current machine + # (or at least has the same file-system layout) + root = path + while !isempty(root) + root, base = splitdir(root) + isempty(base) && break + @assert startswith(path, root) + for proj in Base.project_names + project_file = joinpath(root, proj) + if Base.isfile_casesensitive(project_file) + pkgid = Base.project_file_name_uuid(project_file, "") + isempty(pkgid.name) && return path # bad Project file + # return the joined the module name prefix and path suffix + path = path[nextind(path, sizeof(root)):end] + return string("@", pkgid.name, path) + end + end + end + end + return path + elseif isfile(joinpath(Sys.BINDIR::String, Base.DATAROOTDIR, "julia", "base", path)) + # do the same mechanic for Base (or Core/Compiler) files as above, + # but they start from a 
relative path + return joinpath("@Base", normpath(path)) + else + # for non-existent relative paths (such as "REPL[1]"), just consider simplifying them + return normpath(path) # drop leading "./" + end + end +end + +""" + callers(funcname, [data, lidict], [filename=], [linerange=]) -> Vector{Tuple{count, lineinfo}} + +Given a previous profiling run, determine who called a particular function. Supplying the +filename (and optionally, range of line numbers over which the function is defined) allows +you to disambiguate an overloaded method. The returned value is a vector containing a count +of the number of calls and line information about the caller. One can optionally supply +backtrace `data` obtained from [`Time.retrieve`](@ref); otherwise, the current internal +profile buffer is used. +""" +function callers end + +function callers(funcname::String, bt::Vector, lidict::LineInfoFlatDict; filename = nothing, linerange = nothing) + if filename === nothing && linerange === nothing + return callersf(li -> String(li.func) == funcname, + bt, lidict) + end + filename === nothing && throw(ArgumentError("if supplying linerange, you must also supply the filename")) + filename = String(filename) + if linerange === nothing + return callersf(li -> String(li.func) == funcname && String(li.file) == filename, + bt, lidict) + else + return callersf(li -> String(li.func) == funcname && String(li.file) == filename && in(li.line, linerange), + bt, lidict) + end +end + +callers(funcname::String, bt::Vector, lidict::LineInfoDict; kwargs...) = + callers(funcname, flatten(bt, lidict)...; kwargs...) +callers(func::Function, bt::Vector, lidict::LineInfoFlatDict; kwargs...) = + callers(string(func), bt, lidict; kwargs...) 
+ + +## Print as a flat list +# Counts the number of times each line appears, at any nesting level and at the topmost level +# Merging multiple equivalent entries and recursive calls +function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool) where {T} + lilist = StackFrame[] + n = Int[] + m = Int[] + lilist_idx = Dict{T, Int}() + recursive = Set{T}() + first = true + totalshots = 0 + for ip in data + if ip == 0 + totalshots += 1 + empty!(recursive) + first = true + continue + end + frames = lidict[ip] + nframes = (frames isa Vector ? length(frames) : 1) + for i = 1:nframes + frame = (frames isa Vector ? frames[i] : frames) + !C && frame.from_c && continue + key = (T === UInt64 ? ip : frame) + idx = get!(lilist_idx, key, length(lilist) + 1) + if idx > length(lilist) + push!(recursive, key) + push!(lilist, frame) + push!(n, 1) + push!(m, 0) + elseif !(key in recursive) + push!(recursive, key) + n[idx] += 1 + end + if first + m[idx] += 1 + first = false + end + end + end + @assert length(lilist) == length(n) == length(m) == length(lilist_idx) + return (lilist, n, m, totalshots) +end + +function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat) + lilist, n, m, totalshots = parse_flat(fmt.combine ? 
StackFrame : UInt64, data, lidict, fmt.C) + if isempty(lilist) + warning_empty() + return + end + if false # optional: drop the "non-interpretable" ones + keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist) + lilist = lilist[keep] + n = n[keep] + m = m[keep] + end + filenamemap = Dict{Symbol,String}() + print_flat(io, lilist, n, m, cols, filenamemap, fmt) + Base.println(io, "Total snapshots: ", totalshots) + nothing +end + +function print_flat(io::IO, lilist::Vector{StackFrame}, + n::Vector{Int}, m::Vector{Int}, + cols::Int, filenamemap::Dict{Symbol,String}, + fmt::ProfileFormat) + if fmt.sortedby == :count + p = sortperm(n) + elseif fmt.sortedby == :overhead + p = sortperm(m) + else + p = liperm(lilist) + end + lilist = lilist[p] + n = n[p] + m = m[p] + filenames = String[short_path(li.file, filenamemap) for li in lilist] + funcnames = String[string(li.func) for li in lilist] + wcounts = max(6, ndigits(maximum(n))) + wself = max(9, ndigits(maximum(m))) + maxline = 1 + maxfile = 6 + maxfunc = 10 + for i in 1:length(lilist) + li = lilist[i] + maxline = max(maxline, li.line) + maxfunc = max(maxfunc, length(funcnames[i])) + maxfile = max(maxfile, length(filenames[i])) + end + wline = max(5, ndigits(maxline)) + ntext = max(20, cols - wcounts - wself - wline - 3) + maxfunc += 25 # for type signatures + if maxfile + maxfunc <= ntext + wfile = maxfile + wfunc = ntext - maxfunc # take the full width (for type sig) + else + wfile = 2*ntext÷5 + wfunc = 3*ntext÷5 + end + println(io, lpad("Count", wcounts, " "), " ", lpad("Overhead", wself, " "), " ", + rpad("File", wfile, " "), " ", lpad("Line", wline, " "), " Function") + println(io, lpad("=====", wcounts, " "), " ", lpad("========", wself, " "), " ", + rpad("====", wfile, " "), " ", lpad("====", wline, " "), " ========") + for i = 1:length(n) + n[i] < fmt.mincount && continue + li = lilist[i] + Base.print(io, lpad(string(n[i]), wcounts, " "), " ") + Base.print(io, lpad(string(m[i]), wself, " "), " ") + if li == 
UNKNOWN + if !fmt.combine && li.pointer != 0 + Base.print(io, "@0x", string(li.pointer, base=16)) + else + Base.print(io, "[any unknown stackframes]") + end + else + file = filenames[i] + isempty(file) && (file = "[unknown file]") + Base.print(io, rpad(rtruncto(file, wfile), wfile, " "), " ") + Base.print(io, lpad(li.line > 0 ? string(li.line) : "?", wline, " "), " ") + fname = funcnames[i] + if !li.from_c && li.linfo !== nothing + fname = sprint(show_spec_linfo, li) + end + isempty(fname) && (fname = "[unknown function]") + Base.print(io, ltruncto(fname, wfunc)) + end + println(io) + end + nothing +end + +## A tree representation + +# Representation of a prefix trie of backtrace counts +mutable struct StackFrameTree{T} # where T <: Union{UInt64, StackFrame} + # content fields: + frame::StackFrame + count::Int # number of frames this appeared in + overhead::Int # number frames where this was the code being executed + flat_count::Int # number of times this frame was in the flattened representation (unlike count, this'll sum to 100% of parent) + max_recur::Int # maximum number of times this frame was the *top* of the recursion in the stack + count_recur::Int # sum of the number of times this frame was the *top* of the recursion in a stack (divide by count to get an average) + down::Dict{T, StackFrameTree{T}} + # construction workers: + recur::Int + builder_key::Vector{UInt64} + builder_value::Vector{StackFrameTree{T}} + up::StackFrameTree{T} + StackFrameTree{T}() where {T} = new(UNKNOWN, 0, 0, 0, 0, 0, Dict{T, StackFrameTree{T}}(), 0, UInt64[], StackFrameTree{T}[]) +end + + +const indent_s = " ╎"^10 +const indent_z = collect(eachindex(indent_s)) +function indent(depth::Int) + depth < 1 && return "" + depth <= length(indent_z) && return indent_s[1:indent_z[depth]] + div, rem = divrem(depth, length(indent_z)) + return (indent_s^div) * SubString(indent_s, 1, indent_z[rem]) +end + +function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, maxes, 
filenamemap::Dict{Symbol,String}, showpointer::Bool) + nindent = min(cols>>1, level) + ndigoverhead = ndigits(maxes.overhead) + ndigcounts = ndigits(maxes.count) + ndigline = ndigits(maximum(frame.frame.line for frame in frames)) + 6 + ntext = max(30, cols - ndigoverhead - nindent - ndigcounts - ndigline - 6) + widthfile = 2*ntext÷5 # min 12 + widthfunc = 3*ntext÷5 # min 18 + strs = Vector{String}(undef, length(frames)) + showextra = false + if level > nindent + nextra = level - nindent + nindent -= ndigits(nextra) + 2 + showextra = true + end + for i = 1:length(frames) + frame = frames[i] + li = frame.frame + stroverhead = lpad(frame.overhead > 0 ? string(frame.overhead) : "", ndigoverhead, " ") + base = nindent == 0 ? "" : indent(nindent - 1) * " " + if showextra + base = string(base, "+", nextra, " ") + end + strcount = rpad(string(frame.count), ndigcounts, " ") + if li != UNKNOWN + if li.line == li.pointer + strs[i] = string(stroverhead, "╎", base, strcount, " ", + "[unknown function] (pointer: 0x", + string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), + ")") + else + if !li.from_c && li.linfo !== nothing + fname = sprint(show_spec_linfo, li) + else + fname = string(li.func) + end + filename = short_path(li.file, filenamemap) + if showpointer + fname = string( + "0x", + string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), + " ", + fname) + end + strs[i] = string(stroverhead, "╎", base, strcount, " ", + rtruncto(filename, widthfile), + ":", + li.line == -1 ? "?" 
: string(li.line), + "; ", + ltruncto(fname, widthfunc)) + end + else + strs[i] = string(stroverhead, "╎", base, strcount, " [unknown stackframe]") + end + end + return strs +end + +# turn a list of backtraces into a tree (implicitly separated by NULL markers) +function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol) where {T} + parent = root + tops = Vector{StackFrameTree{T}}() + build = Vector{StackFrameTree{T}}() + startframe = length(all) + for i in startframe:-1:1 + ip = all[i] + if ip == 0 + # sentinel value indicates the start of a new backtrace + empty!(build) + root.recur = 0 + if recur !== :off + # We mark all visited nodes to so we'll only count those branches + # once for each backtrace. Reset that now for the next backtrace. + push!(tops, parent) + for top in tops + while top.recur != 0 + top.max_recur < top.recur && (top.max_recur = top.recur) + top.recur = 0 + top = top.up + end + end + empty!(tops) + end + let this = parent + while this !== root + this.flat_count += 1 + this = this.up + end + end + parent.overhead += 1 + parent = root + root.count += 1 + startframe = i + else + pushfirst!(build, parent) + if recur === :flat || recur == :flatc + # Rewind the `parent` tree back, if this exact ip was already present *higher* in the current tree + found = false + for j in 1:(startframe - i) + if ip == all[i + j] + if recur === :flat # if not flattening C frames, check that now + frames = lidict[ip] + frame = (frames isa Vector ? 
frames[1] : frames) + frame.from_c && break # not flattening this frame + end + push!(tops, parent) + parent = build[j] + parent.recur += 1 + parent.count_recur += 1 + found = true + break + end + end + found && continue + end + builder_key = parent.builder_key + builder_value = parent.builder_value + fastkey = searchsortedfirst(builder_key, ip) + if fastkey < length(builder_key) && builder_key[fastkey] === ip + # jump forward to the end of the inlining chain + # avoiding an extra (slow) lookup of `ip` in `lidict` + # and an extra chain of them in `down` + # note that we may even have this === parent (if we're ignoring this frame ip) + this = builder_value[fastkey] + let this = this + while this !== parent && (recur === :off || this.recur == 0) + this.count += 1 + this.recur = 1 + this = this.up + end + end + parent = this + continue + end + frames = lidict[ip] + nframes = (frames isa Vector ? length(frames) : 1) + this = parent + # add all the inlining frames + for i = nframes:-1:1 + frame = (frames isa Vector ? frames[i] : frames) + !C && frame.from_c && continue + key = (T === UInt64 ? 
ip : frame) + this = get!(StackFrameTree{T}, parent.down, key) + if recur === :off || this.recur == 0 + this.frame = frame + this.up = parent + this.count += 1 + this.recur = 1 + end + parent = this + end + # record where the end of this chain is for this ip + insert!(builder_key, fastkey, ip) + insert!(builder_value, fastkey, this) + end + end + function cleanup!(node::StackFrameTree) + stack = [node] + while !isempty(stack) + node = pop!(stack) + node.recur = 0 + empty!(node.builder_key) + empty!(node.builder_value) + append!(stack, values(node.down)) + end + nothing + end + cleanup!(root) + return root +end + +function maxstats(root::StackFrameTree) + maxcount = Ref(0) + maxflatcount = Ref(0) + maxoverhead = Ref(0) + maxmaxrecur = Ref(0) + stack = [root] + while !isempty(stack) + node = pop!(stack) + maxcount[] = max(maxcount[], node.count) + maxoverhead[] = max(maxoverhead[], node.overhead) + maxflatcount[] = max(maxflatcount[], node.flat_count) + maxmaxrecur[] = max(maxmaxrecur[], node.max_recur) + append!(stack, values(node.down)) + end + return (count=maxcount[], count_flat=maxflatcount[], overhead=maxoverhead[], max_recur=maxmaxrecur[]) +end + +# Print the stack frame tree starting at a particular root. Uses a worklist to +# avoid stack overflows. 
+function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat) where T + maxes = maxstats(bt) + filenamemap = Dict{Symbol,String}() + worklist = [(bt, 0, 0, "")] + println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") + println(io, "=========================================================") + while !isempty(worklist) + (bt, level, noisefloor, str) = popfirst!(worklist) + isempty(str) || println(io, str) + level > fmt.maxdepth && continue + isempty(bt.down) && continue + # Order the line information + nexts = collect(values(bt.down)) + # Generate the string for each line + strs = tree_format(nexts, level, cols, maxes, filenamemap, T === UInt64) + # Recurse to the next level + if fmt.sortedby == :count + counts = collect(frame.count for frame in nexts) + p = sortperm(counts) + elseif fmt.sortedby == :overhead + m = collect(frame.overhead for frame in nexts) + p = sortperm(m) + elseif fmt.sortedby == :flat_count + m = collect(frame.flat_count for frame in nexts) + p = sortperm(m) + else + lilist = collect(frame.frame for frame in nexts) + p = liperm(lilist) + end + for i in reverse(p) + down = nexts[i] + count = down.count + count < fmt.mincount && continue + count < noisefloor && continue + str = strs[i] + noisefloor_down = fmt.noisefloor > 0 ? 
floor(Int, fmt.noisefloor * sqrt(count)) : 0 + pushfirst!(worklist, (down, level + 1, noisefloor_down, str)) + end + end +end + +function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat) + if fmt.combine + root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur) + else + root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur) + end + if isempty(root.down) + warning_empty() + return + end + print_tree(io, root, cols, fmt) + Base.println(io, "Total snapshots: ", root.count) + nothing +end + +function callersf(matchfunc::Function, bt::Vector, lidict::LineInfoFlatDict) + counts = Dict{StackFrame, Int}() + lastmatched = false + for id in bt + if id == 0 + lastmatched = false + continue + end + li = lidict[id] + if lastmatched + if haskey(counts, li) + counts[li] += 1 + else + counts[li] = 1 + end + end + lastmatched = matchfunc(li) + end + k = collect(keys(counts)) + v = collect(values(counts)) + p = sortperm(v, rev=true) + return [(v[i], k[i]) for i in p] +end + +# Utilities +function rtruncto(str::String, w::Int) + if length(str) <= w + return str + else + return string("...", str[prevind(str, end, w-4):end]) + end +end +function ltruncto(str::String, w::Int) + if length(str) <= w + return str + else + return string(str[1:nextind(str, 1, w-4)], "...") + end +end + + +truncto(str::Symbol, w::Int) = truncto(string(str), w) + +# Order alphabetically (file, function) and then by line number +function liperm(lilist::Vector{StackFrame}) + function lt(a::StackFrame, b::StackFrame) + a == UNKNOWN && return false + b == UNKNOWN && return true + fcmp = cmp(a.file, b.file) + fcmp < 0 && return true + fcmp > 0 && return false + fcmp = cmp(a.func, b.func) + fcmp < 0 && return true + fcmp > 0 && return false + fcmp = cmp(a.line, b.line) + fcmp < 0 && return true + return false + end + return sortperm(lilist, lt = lt) +end + +warning_empty() = @warn """ + There were no samples collected. 
Run your program longer (perhaps by + running it multiple times), or adjust the delay between samples with + `Profile.init()`.""" diff --git a/stdlib/Profile/src/time.jl b/stdlib/Profile/src/time.jl new file mode 100644 index 00000000000000..76479df2e23575 --- /dev/null +++ b/stdlib/Profile/src/time.jl @@ -0,0 +1,207 @@ +module Time + +using ..Profile: LineInfoDict, LineInfoFlatDict, ProfileFormat +using ..Profile: getdict, flatten, tree, flat +import ..Profile: callers + +export @profile + +""" + @profile + +`@profile ` runs your expression while taking periodic backtraces. These are +appended to an internal buffer of backtraces. +""" +macro profile(ex) + return quote + try + status = start_timer() + if status < 0 + error(error_codes[status]) + end + $(esc(ex)) + finally + stop_timer() + end + end +end + +#### +#### User-level functions +#### + +""" + init(; n::Integer, delay::Real) + +Configure the `delay` between backtraces (measured in seconds), and the number `n` of +instruction pointers that may be stored. Each instruction pointer corresponds to a single +line of code; backtraces generally consist of a long list of instruction pointers. Default +settings can be obtained by calling this function with no arguments, and each can be set +independently using keywords or in the order `(n, delay)`. +""" +function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing) + n_cur = ccall(:jl_profile_maxlen_data, Csize_t, ()) + delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9 + if n === nothing && delay === nothing + return Int(n_cur), delay_cur + end + nnew = (n === nothing) ? n_cur : n + delaynew = (delay === nothing) ?
delay_cur : delay + init(nnew, delaynew) +end + +function init(n::Integer, delay::Real) + status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), n, round(UInt64,10^9*delay)) + if status == -1 + error("could not allocate space for ", n, " instruction pointers") + end +end + +""" + clear() + +Clear any existing backtraces from the internal buffer. +""" +clear() = ccall(:jl_profile_clear_data, Cvoid, ()) + +""" + print([io::IO = stdout,] [data::Vector]; kwargs...) + +Prints profiling results to `io` (by default, `stdout`). If you do not +supply a `data` vector, the internal buffer of accumulated backtraces +will be used. + +The keyword arguments can be any combination of: + + - `format` -- Determines whether backtraces are printed with (default, `:tree`) or without (`:flat`) + indentation indicating tree structure. + + - `C` -- If `true`, backtraces from C and Fortran code are shown (normally they are excluded). + + - `combine` -- If `true` (default), instruction pointers are merged that correspond to the same line of code. + + - `maxdepth` -- Limits the depth higher than `maxdepth` in the `:tree` format. + + - `sortedby` -- Controls the order in `:flat` format. `:filefuncline` (default) sorts by the source + line, `:count` sorts in order of number of collected samples, and `:overhead` sorts by the number of samples + incurred by each function by itself. + + - `noisefloor` -- Limits frames that exceed the heuristic noise floor of the sample (only applies to format `:tree`). + A suggested value to try for this is 2.0 (the default is 0). This parameter hides samples for which `n <= noisefloor * √N`, + where `n` is the number of samples on this line, and `N` is the number of samples for the callee. + + - `mincount` -- Limits the printout to only those lines with at least `mincount` occurrences. + + - `recur` -- Controls the recursion handling in `:tree` format. `:off` (default) prints the tree as normal. 
`:flat` instead + compresses any recursion (by ip), showing the approximate effect of converting any self-recursion into an iterator. + `:flatc` does the same but also includes collapsing of C frames (may do odd things around `jl_apply`). +""" +function print(io::IO, + data::Vector = fetch(), + lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data) + ; + format = :tree, + C = false, + combine = true, + maxdepth::Int = typemax(Int), + mincount::Int = 0, + noisefloor = 0, + sortedby::Symbol = :filefuncline, + recur::Symbol = :off) + print(io, data, lidict, ProfileFormat( + C = C, + combine = combine, + maxdepth = maxdepth, + mincount = mincount, + noisefloor = noisefloor, + sortedby = sortedby, + recur = recur), + format) +end + +function print(io::IO, data::Vector, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, format::Symbol) + cols::Int = Base.displaysize(io)[2] + data = convert(Vector{UInt64}, data) + fmt.recur ∈ (:off, :flat, :flatc) || throw(ArgumentError("recur value not recognized")) + if format === :tree + tree(io, data, lidict, cols, fmt) + elseif format === :flat + fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off")) + flat(io, data, lidict, cols, fmt) + else + throw(ArgumentError("output format $(repr(format)) not recognized")) + end +end + +""" + print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...) + +Prints profiling results to `io`. This variant is used to examine results exported by a +previous call to [`retrieve`](@ref). Supply the vector `data` of backtraces and +a dictionary `lidict` of line information. + +See `Profile.Time.print([io], data)` for an explanation of the valid keyword arguments. +""" +print(data::Vector = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = + print(stdout, data, lidict; kwargs...)
+ +""" + retrieve() -> data, lidict + +"Exports" profiling results in a portable format, returning the set of all backtraces +(`data`) and a dictionary that maps the (session-specific) instruction pointers in `data` to +`LineInfo` values that store the file name, function name, and line number. This function +allows you to save profiling results for future analysis. +""" +function retrieve() + data = fetch() + return (data, getdict(data)) +end + + +# C wrappers +start_timer() = ccall(:jl_profile_start_timer, Cint, ()) + +stop_timer() = ccall(:jl_profile_stop_timer, Cvoid, ()) + +is_running() = ccall(:jl_profile_is_running, Cint, ())!=0 + + did_overflow() = ccall(:jl_profile_overflow, Cint, ()) != 0 + +get_data_pointer() = convert(Ptr{UInt}, ccall(:jl_profile_get_data, Ptr{UInt8}, ())) + +len_data() = convert(Int, ccall(:jl_profile_len_data, Csize_t, ())) + +maxlen_data() = convert(Int, ccall(:jl_profile_maxlen_data, Csize_t, ())) + +error_codes = Dict( + -1=>"cannot specify signal action for profiling", + -2=>"cannot create the timer for profiling", + -3=>"cannot start the timer for profiling", + -4=>"cannot unblock SIGUSR1") + + +""" + fetch() -> data + +Returns a copy of the buffer of profile backtraces. Note that the +values in `data` have meaning only on this machine in the current session, because it +depends on the exact memory addresses used in JIT-compiling. This function is primarily for +internal use; [`retrieve`](@ref) may be a better choice for most users. +""" +function fetch() + len = len_data() + if did_overflow() + @warn """The time profile data buffer overflowed; profiling terminated + before your program finished. To profile for longer runs, call + `Profile.init()` with a larger buffer and/or larger delay.""" + end + data = Vector{UInt}(undef, len) + GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len) + return data +end + +callers(funcname::String; kwargs...) = callers(funcname, retrieve()...; kwargs...) 
+callers(func::Function; kwargs...) = callers(string(func), retrieve()...; kwargs...) + +end # module diff --git a/stdlib/Profile/test/memory.jl b/stdlib/Profile/test/memory.jl new file mode 100644 index 00000000000000..f8be2a127c5092 --- /dev/null +++ b/stdlib/Profile/test/memory.jl @@ -0,0 +1,49 @@ +using Profile + +import Profile.Memory: clear_memprofile_data, read_and_coalesce_memprofile_data, get_memprofile_bt_data, get_memprofile_bt_data_len, get_memprofile_alloc_data, get_memprofile_alloc_data_len, AllocationInfo, get_memprofile_overflow, getdict, close_AI, open_AI, start_memprofile, stop_memprofile, closing_tag + +forever_chunks = [] +function foo() + global forever_chunks + + # Create a chunk that will live forever + push!(forever_chunks, Array{UInt8,2}(undef,1000,1000)) + + # Create a chunk that......will not. + Array{UInt8,2}(undef,1000,100) + + # Create lots of little objects + tups = Any[(1,), (2,3)] + for idx in 1:20 + addition = (tups[end-1]..., tups[end]...) + addition = addition[2:end] + push!(tups, addition) + end + # Keep a few of them around + push!(forever_chunks, tups[1]) + push!(forever_chunks, tups[end]) + return nothing +end + +function test(should_profile::Bool = false) + # We use the trick of setting the tag filter to 0x0000 to disable + # memory profiling so that we don't ever have to recompile. + Profile.Memory.init(50_000_000, 1_000_000, 0xffff * should_profile) + global forever_chunks = [] + @memprofile foo() +end + +@info("Precompiling test()") +test(false) + +@info("Running test()") +test(true) + +@info("Reading memprofile data...") +open_chunks, closed_chunks, ghost_chunks = read_and_coalesce_memprofile_data() +println("open_chunks:") +display(open_chunks) + +# This often crashes us, if we've held on to a bad object address. +# Explicitly run it to make sure everything is on the up-and-up. 
+Base.GC.gc() diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index b26f1f70e5d465..c25af5b0eda2bb 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -2,94 +2,11 @@ using Test, Profile, Serialization -Profile.clear() -Profile.init() - -let iobuf = IOBuffer() - for fmt in (:tree, :flat) - Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.print(iobuf, format=fmt, C=true) - Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.print(iobuf, [0x0000000000000001], Dict(0x0000000000000001 => [Base.StackTraces.UNKNOWN]), format=fmt, C=false) - end -end - -@noinline function busywait(t, n_tries) - iter = 0 - init_data = Profile.len_data() - while iter < n_tries && Profile.len_data() == init_data - iter += 1 - tend = time() + t - while time() < tend end - end +@testset "time" begin + include("time.jl") end -busywait(0, 0) # compile -@profile busywait(1, 20) - -let r = Profile.retrieve() - mktemp() do path, io - serialize(io, r) - close(io) - open(path) do io - @test isa(deserialize(io), Tuple{Vector{UInt},Dict{UInt64,Vector{Base.StackTraces.StackFrame}}}) - end - end +@testset "memory" begin + include("memory.jl") end -let iobuf = IOBuffer() - Profile.print(iobuf, format=:tree, C=true) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) - Profile.print(iobuf, format=:tree, maxdepth=2) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) - Profile.print(iobuf, format=:flat, C=true) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) - Profile.print(iobuf) - @test !isempty(String(take!(iobuf))) - truncate(iobuf, 0) - Profile.print(iobuf, format=:flat, sortedby=:count) - @test !isempty(String(take!(iobuf))) - Profile.print(iobuf, format=:tree, recur=:flat) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) -end - -Profile.clear() -@test isempty(Profile.fetch()) - -let - @test Profile.callers("\\") 
!== nothing - @test Profile.callers(\) !== nothing - # linerange with no filename provided should fail - @test_throws ArgumentError Profile.callers(\; linerange=10:50) -end - -# issue #13229 -module I13229 -using Test, Profile -global z = 0 -@timed @profile for i = 1:5 - function f(x) - return x + i - end - global z = f(i) -end -@test z == 10 -end - -@testset "setting sample count and delay in init" begin - n_, delay_ = Profile.init() - @test n_ == 1_000_000 - def_delay = Sys.iswindows() ? 0.01 : 0.001 - @test delay_ == def_delay - Profile.init(n=1_000_001, delay=0.0005) - n_, delay_ = Profile.init() - @test n_ == 1_000_001 - @test delay_ == 0.0005 - Profile.init(n=1_000_000, delay=def_delay) -end diff --git a/stdlib/Profile/test/time.jl b/stdlib/Profile/test/time.jl new file mode 100644 index 00000000000000..2d774bb2de8bbf --- /dev/null +++ b/stdlib/Profile/test/time.jl @@ -0,0 +1,90 @@ +Profile.Time.init() + +let iobuf = IOBuffer() + for fmt in (:tree, :flat) + Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.Time.print(iobuf, format=fmt, C=true) + Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.Time.print(iobuf, [0x0000000000000001], Profile.LineInfoDict(0x0000000000000001 => [Base.StackTraces.UNKNOWN]), format=fmt, C=false) + end +end + +@noinline function busywait(t, n_tries) + iter = 0 + init_data = Profile.Time.len_data() + while iter < n_tries && Profile.Time.len_data() == init_data + iter += 1 + tend = time() + t + while time() < tend end + end +end + +busywait(0, 0) # compile +@profile busywait(1, 20) + +let r = Profile.Time.retrieve() + mktemp() do path, io + serialize(io, r) + close(io) + open(path) do io + @test isa(deserialize(io), Tuple{Vector{UInt},Profile.LineInfoDict}) + end + end +end + +let iobuf = IOBuffer() + Profile.Time.print(iobuf, format=:tree, C=true) + str = String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) + Profile.Time.print(iobuf, format=:tree, maxdepth=2) + str = 
String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) + Profile.Time.print(iobuf, format=:flat, C=true) + str = String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) + Profile.Time.print(iobuf) + @test !isempty(String(take!(iobuf))) + truncate(iobuf, 0) + Profile.Time.print(iobuf, format=:flat, sortedby=:count) + @test !isempty(String(take!(iobuf))) + Profile.Time.print(iobuf, format=:tree, recur=:flat) + str = String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) +end + +Profile.Time.clear() +@test isempty(Profile.Time.fetch()) + +let + @test Profile.Time.callers("\\") !== nothing + @test Profile.Time.callers(\) !== nothing + # linerange with no filename provided should fail + @test_throws ArgumentError Profile.Time.callers(\; linerange=10:50) +end + +# issue #13229 +module I13229 +using Test, Profile +global z = 0 +@timed @profile for i = 1:5 + function f(x) + return x + i + end + global z = f(i) +end +@test z == 10 +end + +@testset "setting sample count and delay in init" begin + n_, delay_ = Profile.Time.init() + @test n_ == 1_000_000 + def_delay = Sys.iswindows() ? 0.01 : 0.001 + @test delay_ == def_delay + Profile.Time.init(n=1_000_001, delay=0.0005) + n_, delay_ = Profile.Time.init() + @test n_ == 1_000_001 + @test delay_ == 0.0005 + Profile.Time.init(n=1_000_000, delay=def_delay) +end