From ec85f1d30018d0d9f174c58c4252c118ac52b814 Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Fri, 4 Oct 2019 09:16:13 +0200 Subject: [PATCH] Instrument GC with memory profiler implementation This adds C support for a memory profiler within the GC, tracking locations of allocations, deallocations, etc... It operates in a similar manner to the time profiler, with single large buffers set up beforehand through an initialization function, reducing the need for expensive allocations while the program being measured is running. The memory profiler instruments the GC in all locations where the GC statistics themselves are modified (e.g. `gc_num.allocd` and `gc_num.freed`) by introducing new helper functions `jl_gc_count_{allocd,freed,reallocd}()`. Those utility functions call the `jl_memprofile_track_{de,}alloc()` method to register an address, a size and a tag with the memory profiler. We also track type information, as this can be critically helpful when debugging, and to do so without breaking API guarantees we insert methods to set the type of a chunk of memory after allocating it where necessary. The tagging system allows the memory profiler to disambiguate, at profile time, between e.g. pooled allocations and the "big" allocator. It also allows the memory allocator to support tracking multiple "memory domains"; e.g. a GPU support package could manually call `jl_memprofile_track_alloc()` any time a chunk of memory is allocated on the GPU, so as to use the same system. By default, all values are tracked; however, one can set a `memprof_tag_filter` value to track only the values you are most interested in (e.g. only CPU-domain big allocations). To disambiguate the memory and time profilers, we split them out into separate modules.
--- doc/src/manual/profile.md | 32 +- src/Makefile | 2 +- src/array.c | 12 +- src/gc.c | 175 +++-- src/julia_internal.h | 74 ++- src/llvm-final-gc-lowering.cpp | 18 +- src/llvm-late-gc-lowering.cpp | 3 + src/llvm-pass-helpers.cpp | 28 + src/llvm-pass-helpers.h | 6 + src/memprofile.c | 203 ++++++ src/support/timefuncs.c | 4 +- src/support/timefuncs.h | 4 +- stdlib/Profile/docs/src/index.md | 17 +- stdlib/Profile/src/Profile.jl | 855 +------------------------ stdlib/Profile/src/compat.jl | 22 + stdlib/Profile/src/memory.jl | 349 ++++++++++ stdlib/Profile/src/stacktrace_tools.jl | 628 ++++++++++++++++++ stdlib/Profile/src/time.jl | 207 ++++++ stdlib/Profile/test/memory.jl | 49 ++ stdlib/Profile/test/runtests.jl | 91 +-- stdlib/Profile/test/time.jl | 90 +++ 21 files changed, 1875 insertions(+), 994 deletions(-) create mode 100644 src/memprofile.c create mode 100644 stdlib/Profile/src/compat.jl create mode 100644 stdlib/Profile/src/memory.jl create mode 100644 stdlib/Profile/src/stacktrace_tools.jl create mode 100644 stdlib/Profile/src/time.jl create mode 100644 stdlib/Profile/test/memory.jl create mode 100644 stdlib/Profile/test/time.jl diff --git a/doc/src/manual/profile.md b/doc/src/manual/profile.md index 0e5acc16789bf7..86ed4a9f7bc651 100644 --- a/doc/src/manual/profile.md +++ b/doc/src/manual/profile.md @@ -61,7 +61,7 @@ To see the profiling results, there is a [graphical browser](https://github.com/ available, but here we'll use the text-based display that comes with the standard library: ```julia-repl -julia> Profile.print() +julia> Profile.Time.print() 80 ./event.jl:73; (::Base.REPL.##1#2{Base.REPL.REPLBackend})() 80 ./REPL.jl:97; macro expansion 80 ./REPL.jl:66; eval_user_input(::Any, ::Base.REPL.REPLBackend) @@ -130,7 +130,7 @@ collecting more samples: ```julia-repl julia> @profile (for i = 1:100; myfunc(); end) -julia> Profile.print() +julia> Profile.Time.print() [....] 
3821 ./REPL[1]:2; myfunc() 3511 ./random.jl:431; rand!(::MersenneTwister, ::Array{Float64,3}, ::Int64, ::Type... @@ -152,7 +152,7 @@ This illustrates the default "tree" dump; an alternative is the "flat" dump, whi counts independent of their nesting: ```julia-repl -julia> Profile.print(format=:flat) +julia> Profile.Time.print(format=:flat) Count File Line Function 6714 ./ -1 anonymous 6714 ./REPL.jl 66 eval_user_input(::Any, ::Base.REPL.REPLBackend) @@ -201,12 +201,12 @@ useful way to view the results. ## Accumulation and clearing Results from [`@profile`](@ref) accumulate in a buffer; if you run multiple pieces of code under -[`@profile`](@ref), then [`Profile.print()`](@ref) will show you the combined results. This can -be very useful, but sometimes you want to start fresh; you can do so with [`Profile.clear()`](@ref). +[`@profile`](@ref), then [`Profile.Time.print()`](@ref) will show you the combined results. This can +be very useful, but sometimes you want to start fresh; you can do so with [`Profile.Time.clear()`](@ref). ## Options for controlling the display of profile results -[`Profile.print`](@ref) has more options than we've described so far. Let's see the full declaration: +[`Profile.Time.print`](@ref) has more options than we've described so far. Let's see the full declaration: ```julia function print(io::IO = stdout, data = fetch(); kwargs...) @@ -216,15 +216,15 @@ Let's first discuss the two positional arguments, and later the keyword argument * `io` -- Allows you to save the results to a buffer, e.g. a file, but the default is to print to `stdout` (the console). - * `data` -- Contains the data you want to analyze; by default that is obtained from [`Profile.fetch()`](@ref), + * `data` -- Contains the data you want to analyze; by default that is obtained from [`Profile.Time.fetch()`](@ref), which pulls out the backtraces from a pre-allocated buffer. 
For example, if you want to profile the profiler, you could say: ```julia - data = copy(Profile.fetch()) - Profile.clear() - @profile Profile.print(stdout, data) # Prints the previous results - Profile.print() # Prints results from Profile.print() + data = copy(Profile.Time.fetch()) + Profile.Time.clear() + @profile Profile.Time.print(stdout, data) # Prints the previous results + Profile.Time.print() # Prints results from Profile.Time.print() ``` The keyword arguments can be any combination of: @@ -233,7 +233,7 @@ The keyword arguments can be any combination of: with (default, `:tree`) or without (`:flat`) indentation indicating tree structure. * `C` -- If `true`, backtraces from C and Fortran code are shown (normally they are excluded). Try running the introductory - example with `Profile.print(C = true)`. This can be extremely helpful in deciding whether it's + example with `Profile.Time.print(C = true)`. This can be extremely helpful in deciding whether it's Julia code or C code that is causing a bottleneck; setting `C = true` also improves the interpretability of the nesting, at the cost of longer profile dumps. * `combine` -- Some lines of code contain multiple operations; for example, `s += A[i]` contains both an array @@ -256,13 +256,13 @@ to save to a file using a wide `displaysize` in an [`IOContext`](@ref): ```julia open("/tmp/prof.txt", "w") do s - Profile.print(IOContext(s, :displaysize => (24, 500))) + Profile.Time.print(IOContext(s, :displaysize => (24, 500))) end ``` ## Configuration -[`@profile`](@ref) just accumulates backtraces, and the analysis happens when you call [`Profile.print()`](@ref). +[`@profile`](@ref) just accumulates backtraces, and the analysis happens when you call [`Profile.Time.print()`](@ref). For a long-running computation, it's entirely possible that the pre-allocated buffer for storing backtraces will be filled. If that happens, the backtraces stop but your computation continues. 
As a consequence, you may miss some important profiling data (you will get a warning when that @@ -271,8 +271,8 @@ happens). You can obtain and configure the relevant parameters this way: ```julia -Profile.init() # returns the current settings -Profile.init(n = 10^7, delay = 0.01) +Profile.Time.init() # returns the current settings +Profile.Time.init(n = 10^7, delay = 0.01) ``` `n` is the total number of instruction pointers you can store, with a default value of `10^6`. diff --git a/src/Makefile b/src/Makefile index 04ea8d50f86ca5..d62af6f4fbd227 100644 --- a/src/Makefile +++ b/src/Makefile @@ -45,7 +45,7 @@ RUNTIME_C_SRCS := \ simplevector runtime_intrinsics precompile \ threading partr stackwalk gc gc-debug gc-pages gc-stacks method \ jlapi signal-handling safepoint jloptions timing subtype \ - crc32c + crc32c memprofile RUNTIME_SRCS := APInt-C runtime_ccall processor rtutils $(RUNTIME_C_SRCS) SRCS := $(RUNTIME_SRCS) diff --git a/src/array.c b/src/array.c index a0627ff24ab8e5..0eb77e7e5a38d9 100644 --- a/src/array.c +++ b/src/array.c @@ -98,6 +98,7 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, tsz += tot; tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align whole object a = (jl_array_t*)jl_gc_alloc(ptls, tsz, atype); + jl_memprofile_set_typeof(a, atype); // No allocation or safepoint allowed after this a->flags.how = 0; data = (char*)a + doffs; @@ -107,12 +108,14 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, else { tsz = JL_ARRAY_ALIGN(tsz, JL_CACHE_BYTE_ALIGNMENT); // align whole object data = jl_gc_managed_malloc(tot); + jl_memprofile_set_typeof(data, atype); // Allocate the Array **after** allocating the data // to make sure the array is still young a = (jl_array_t*)jl_gc_alloc(ptls, tsz, atype); // No allocation or safepoint allowed after this a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); + jl_memprofile_set_typeof(a, atype); if (!isunboxed || isunion) // need to zero 
out isbits union array selector bytes to ensure a valid type index memset(data, 0, tot); @@ -334,7 +337,9 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data, if (own_buffer) { a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); - jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); + jl_gc_count_allocd(a, nel*elsz + (elsz == 1 ? 1 : 0), + JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + jl_memprofile_set_typeof(a, atype); } else { a->flags.how = 0; @@ -401,7 +406,9 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data, if (own_buffer) { a->flags.how = 2; jl_gc_track_malloced_array(ptls, a); - jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); + jl_gc_count_allocd(a, nel*elsz + (elsz == 1 ? 1 : 0), + JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + jl_memprofile_set_typeof(a, atype); } else { a->flags.how = 0; @@ -669,6 +676,7 @@ static int NOINLINE array_resize_buffer(jl_array_t *a, size_t newlen) a->flags.how = 1; jl_gc_wb_buf(a, a->data, nbytes); } + jl_memprofile_set_typeof(a->data, jl_typeof(a)); } if (JL_ARRAY_IMPL_NUL && elsz == 1 && !isbitsunion) memset((char*)a->data + oldnbytes - 1, 0, nbytes - oldnbytes + 1); diff --git a/src/gc.c b/src/gc.c index 831dc14545348b..741321e742fd83 100644 --- a/src/gc.c +++ b/src/gc.c @@ -847,7 +847,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz) jl_throw(jl_memory_exception); gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t, gc_cblist_notify_external_alloc, (v, allocsz)); - ptls->gc_num.allocd += allocsz; + jl_gc_count_allocd(jl_valueof(&v->header), allocsz, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_BIGALLOC); ptls->gc_num.bigalloc++; #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -887,7 +888,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT *pv = nxt; if (nxt) nxt->prev = pv; - gc_num.freed += v->sz&~3; + jl_gc_count_freed(jl_valueof(&v->header), v->sz&~3, 
JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_BIGALLOC); #ifdef MEMDEBUG memset(v, 0xbb, v->sz&~3); #endif @@ -938,10 +940,44 @@ void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT ptls->heap.mallocarrays = ma; } -void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT +void jl_gc_count_allocd(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT { jl_ptls_t ptls = jl_get_ptls_states(); ptls->gc_num.allocd += sz; + + if (__unlikely(jl_memprofile_running())) { + jl_memprofile_track_alloc(addr, tag, sz); + } +} + +void jl_gc_count_freed(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_get_ptls_states(); + ptls->gc_num.freed += sz; + + if (__unlikely(jl_memprofile_running())) { + jl_memprofile_track_dealloc(addr, tag); + } +} + +void jl_gc_count_reallocd(void * oldaddr, size_t oldsz, void * newaddr, size_t newsz, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_ptls_t ptls = jl_get_ptls_states(); + if (oldsz < newsz) { + ptls->gc_num.allocd += newsz - oldsz; + } else { + ptls->gc_num.freed += oldsz - newsz; + } + + // Our memprofile does not yet have a way to represent "realloc", so we just + // represent this as a free immediately followed by a malloc. This makes the + // absolute value of the memory deltas look larger than the Julia GC's statistics + // would have you believe, as the Julia GC shows only the difference between + // the two values when realloc'ing. 
+ if (__unlikely(jl_memprofile_running())) { + jl_memprofile_track_dealloc(oldaddr, tag); + jl_memprofile_track_alloc(newaddr, tag, newsz); + } } static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT @@ -1002,7 +1038,7 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); - gc_num.freed += array_nbytes(a); + jl_gc_count_freed(d, array_nbytes(a), JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); } } @@ -1094,7 +1130,6 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, return jl_gc_big_alloc(ptls, osize); #endif maybe_collect(ptls); - ptls->gc_num.allocd += osize; ptls->gc_num.poolalloc++; // first try to use the freelist jl_taggedvalue_t *v = p->freelist; @@ -1109,6 +1144,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, pg->nfree = 0; pg->has_young = 1; } + jl_gc_count_allocd(jl_valueof(v), osize, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_POOLALLOC); return jl_valueof(v); } // if the freelist is empty we reuse empty but not freed pages @@ -1133,6 +1169,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, next = (jl_taggedvalue_t*)((char*)v + osize); } p->newpages = next; + jl_gc_count_allocd(jl_valueof(v), osize, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_POOLALLOC); return jl_valueof(v); } @@ -1157,7 +1194,6 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t uint8_t *ages = pg->ages; jl_taggedvalue_t *v = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET); char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - osize; - size_t old_nfree = pg->nfree; size_t nfree; int freedall = 1; @@ -1262,7 +1298,6 @@ static jl_taggedvalue_t **sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t *pg, jl_t done: gc_time_count_page(freedall, pg_skpd); - gc_num.freed += (nfree - old_nfree) * osize; return pfl; } @@ -2560,6 +2595,8 @@ JL_DLLEXPORT int jl_gc_enable(int on) if (on && !prev) { // disable -> 
enable if (jl_atomic_fetch_add(&jl_gc_disable_counter, -1) == 1) { + // This to restore the value of `allocd` that was clobbered in `jl_gc_collect()` + // when `jl_gc_disable_counter` is nonzero. gc_num.allocd += gc_num.deferred_alloc; gc_num.deferred_alloc = 0; } @@ -2652,18 +2689,75 @@ static void jl_gc_queue_remset(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp ptls2->heap.rem_bindings.len = n_bnd_refyoung; } -static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, jl_ptls_t ptls2) +static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp, uintptr_t * bt_data, size_t bt_size) { size_t n = 0; - while (n+2 < ptls2->bt_size) { - if (ptls2->bt_data[n] == JL_BT_INTERP_FRAME) { - gc_mark_queue_obj(gc_cache, sp, (jl_value_t*)ptls2->bt_data[n+1]); + while (n+2 < bt_size) { + if (bt_data[n] == JL_BT_INTERP_FRAME) { + gc_mark_queue_obj(gc_cache, sp, (jl_value_t*)bt_data[n+1]); n += 2; } n++; } } +static void gc_track_pool_frees() +{ + // Don't do anything if the memory profiler isn't running. + if (!jl_memprofile_running()) + return; + + // Iterate over the three levels of our pagetable. 
We collapse indentation here + // to make it more readable, especially as we do essentially the same thing + // three times with just slightly changed variable names: + for (int pg2_i = 0; pg2_i < (REGION2_PG_COUNT + 31) / 32; pg2_i++) { + uint32_t line2 = memory_map.allocmap1[pg2_i]; + if (line2) { + for (int j = 0; j < 32; j++) { + if ((line2 >> j) & 1) { + pagetable1_t * pagetable1 = memory_map.meta1[pg2_i * 32 + j]; + + for (int pg1_i = 0; pg1_i < REGION1_PG_COUNT / 32; pg1_i++) { + uint32_t line1 = pagetable1->allocmap0[pg1_i]; + if (line1) { + for (int k = 0; k < 32; k++) { + if ((line1 >> k) & 1) { + pagetable0_t * pagetable0 = pagetable1->meta0[pg1_i * 32 + k]; + + for (int pg0_i = 0; pg0_i < REGION0_PG_COUNT / 32; pg0_i++) { + uint32_t line0 = pagetable0->allocmap[pg0_i]; + if (line0) { + for (int l = 0; l < 32; l++) { + if ((line0 >> l) & 1) { + jl_gc_pagemeta_t * pg = pagetable0->meta[pg0_i * 32 + l]; + + // Once we have an actual page, iterate over the cells: + jl_taggedvalue_t *v = (jl_taggedvalue_t*)(pg->data + GC_PAGE_OFFSET); + char *lim = (char*)v + GC_PAGE_SZ - GC_PAGE_OFFSET - pg->osize; + + while ((char *)v <= lim) { + // If this object is live but unmarked, then it's about to be freed, + // so track that via jl_gc_count_freed(). + if (v->bits.gc == GC_CLEAN) { + jl_value_t * ptr = jl_gc_internal_obj_base_ptr(v); + if (ptr != NULL) { + jl_gc_count_freed(ptr, pg->osize, JL_MEMPROF_TAG_DOMAIN_CPU | + JL_MEMPROF_TAG_ALLOC_POOLALLOC); + } + } + + // Move to next cell in the page + v = (jl_taggedvalue_t*)((char*)v + pg->osize); + } + + // Region 0 + }}}} + // Region 1 + }}}} + // Region 2 + }}}} +} + size_t jl_maxrss(void); // Only one thread should be running in this function @@ -2688,8 +2782,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) jl_gc_queue_remset(gc_cache, &sp, ptls2); // 2.2. mark every thread local root jl_gc_queue_thread_local(gc_cache, &sp, ptls2); - // 2.3. 
mark any managed objects in the backtrace buffer - jl_gc_queue_bt_buf(gc_cache, &sp, ptls2); + // 2.3. mark any managed objects in the backtrace buffers, + // so that things like Interpreter frame objects do not disappear. + jl_gc_queue_bt_buf(gc_cache, &sp, ptls2->bt_data, ptls2->bt_size); + jl_gc_queue_bt_buf(gc_cache, &sp, (uintptr_t *)jl_profile_get_data(), jl_profile_len_data()); + jl_gc_queue_bt_buf(gc_cache, &sp, (uintptr_t *)jl_memprofile_get_bt_data(), jl_memprofile_len_bt_data()); } // 3. walk roots @@ -2813,6 +2910,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); + gc_track_pool_frees(); gc_sweep_pool(sweep_full); if (sweep_full) gc_sweep_perm_alloc(); @@ -3018,23 +3116,27 @@ JL_DLLEXPORT void jl_throw_out_of_memory_error(void) JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { jl_ptls_t ptls = jl_get_ptls_states(); - if (ptls && ptls->world_age) { + if (ptls && ptls->world_age) maybe_collect(ptls); - ptls->gc_num.allocd += sz; + void *b = malloc(sz); + if (ptls && ptls->world_age) { + jl_gc_count_allocd(b, sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); ptls->gc_num.malloc++; } - return malloc(sz); + return b; } JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) { jl_ptls_t ptls = jl_get_ptls_states(); - if (ptls && ptls->world_age) { + if (ptls && ptls->world_age) maybe_collect(ptls); - ptls->gc_num.allocd += nm*sz; + void *b = calloc(nm, sz); + if (ptls && ptls->world_age) { + jl_gc_count_allocd(b, nm*sz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); ptls->gc_num.malloc++; } - return calloc(nm, sz); + return b; } JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) @@ -3042,7 +3144,7 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_ptls_t ptls = jl_get_ptls_states(); free(p); if (ptls && ptls->world_age) { - ptls->gc_num.freed += sz; + jl_gc_count_freed(p, sz, 
JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); ptls->gc_num.freecall++; } } @@ -3050,15 +3152,14 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { jl_ptls_t ptls = jl_get_ptls_states(); - if (ptls && ptls->world_age) { + if (ptls && ptls->world_age) maybe_collect(ptls); - if (sz < old) - ptls->gc_num.freed += (old - sz); - else - ptls->gc_num.allocd += (sz - old); + void *b = realloc(p, sz); + if (ptls && ptls->world_age) { + jl_gc_count_reallocd(p, old, b, sz, JL_MEMPROF_TAG_DOMAIN_CPU); ptls->gc_num.realloc++; } - return realloc(p, sz); + return b; } // allocation wrappers that save the size of allocations, to allow using @@ -3120,11 +3221,11 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); - ptls->gc_num.allocd += allocsz; - ptls->gc_num.malloc++; void *b = malloc_cache_align(allocsz); if (b == NULL) jl_throw(jl_memory_exception); + jl_gc_count_allocd(b, allocsz, JL_MEMPROF_TAG_DOMAIN_CPU | JL_MEMPROF_TAG_ALLOC_STDALLOC); + ptls->gc_num.malloc++; return b; } @@ -3138,16 +3239,6 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds if (allocsz < sz) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); - if (jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) { - ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; - live_bytes += allocsz - oldsz; - } - else if (allocsz < oldsz) - ptls->gc_num.freed += (oldsz - allocsz); - else - ptls->gc_num.allocd += (allocsz - oldsz); - ptls->gc_num.realloc++; - void *b; if (isaligned) b = realloc_cache_align(d, allocsz, oldsz); @@ -3156,6 +3247,14 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds if (b == NULL) jl_throw(jl_memory_exception); + if 
(jl_astaggedvalue(owner)->bits.gc == GC_OLD_MARKED) { + ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; + live_bytes += allocsz - oldsz; + } + else + jl_gc_count_reallocd(d, oldsz, b, allocsz, JL_MEMPROF_TAG_DOMAIN_CPU); + ptls->gc_num.realloc++; + return b; } diff --git a/src/julia_internal.h b/src/julia_internal.h index d408257150e00e..4543d95335252c 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -227,6 +227,75 @@ STATIC_INLINE uint8_t JL_CONST_FUNC jl_gc_szclass(unsigned sz) return klass + N; } +// Flags that determine when a certain buffer has overrun itself +#define JL_MEMPROF_BT_OVERFLOW 0x01 +#define JL_MEMPROF_ALLOC_OVERFLOW 0x02 + +// Tags applied to memory allocations to specify which domain the memory is +// stored on, and also which "kind" of memory allocator was used. +// When filtering, a filter tag value of `0xffff` means "accept everything". +// We support the "CPU", "GPU" and "External" (e.g. "other") domains. +#define JL_MEMPROF_TAG_DOMAIN_CPU 0x0001 +#define JL_MEMPROF_TAG_DOMAIN_GPU 0x0002 +#define JL_MEMPROF_TAG_DOMAIN_EXTERNAL 0x0080 +// We differentiate between just normal "standard" allocation by malloc, versus +// the "pool" allocator, and finally "bigalloc" for special big things as +// that's often what we're most interested in, which are the pieces of memory +// allocated by `jl_gc_big_alloc()`. +#define JL_MEMPROF_TAG_ALLOC_STDALLOC 0x0100 +#define JL_MEMPROF_TAG_ALLOC_POOLALLOC 0x0200 +#define JL_MEMPROF_TAG_ALLOC_BIGALLOC 0x0400 +// We denote a free() by setting yet another tag +#define JL_MEMPROF_TAG_DEALLOC 0x8000 + +typedef struct _memprof_allocation_info_t +{ + // The location of the chunk of data in memory, used to match + // allocations with deallocations. This value, once it has been + // deallocated, cannot be dereferenced, in general. + void *memory_location; + + // The location of the type information for this chunk of memory + // This should always be dereferencable. 
+ void *type; + + // The time at which this happened + double time; + + // The size of the allocation, or 0 if this was a free of a + // previously-allocated piece of data. + size_t allocsz; + + // Used to "tag" this allocation within a particular domain (CPU, GPU, other) + // or within a particular allocator (Pool, std, malloc), or as a free instead. + uint16_t tag; +} allocation_info_t; + +// Memory profiler data structures +static uintptr_t * memprof_bt_data = NULL; +static volatile size_t memprof_bt_data_size = 0; +static volatile size_t memprof_bt_data_size_max = 0; + +static allocation_info_t * memprof_alloc_data = NULL; +static volatile size_t memprof_alloc_data_size = 0; +static volatile size_t memprof_alloc_data_size_max = 0; + +static volatile uint8_t memprof_running = 0; +static volatile uint8_t memprof_overflow = 0; +static volatile uint16_t memprof_tag_filter = 0xffff; + +// Necessary memory profiler prototypes +JL_DLLEXPORT uint8_t* jl_memprofile_get_bt_data(void); +JL_DLLEXPORT size_t jl_memprofile_len_bt_data(void); +JL_DLLEXPORT void jl_memprofile_track_alloc(void *v, uint16_t tag, size_t allocsz) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_memprofile_track_dealloc(void *v, uint16_t tag) JL_NOTSAFEPOINT; +JL_DLLEXPORT int jl_memprofile_running(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT void jl_memprofile_set_typeof(void * v, void * ty) JL_NOTSAFEPOINT; + +// Necessary time profiler prototypes +JL_DLLEXPORT uint8_t *jl_profile_get_data(void); +JL_DLLEXPORT size_t jl_profile_len_data(void); + #define JL_SMALL_BYTE_ALIGNMENT 16 #define JL_CACHE_BYTE_ALIGNMENT 64 // JL_HEAP_ALIGNMENT is the maximum alignment that the GC can provide @@ -249,6 +318,7 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) v = jl_gc_big_alloc(ptls, allocsz); } jl_set_typeof(v, ty); + jl_memprofile_set_typeof(v, ty); return v; } JL_DLLEXPORT jl_value_t *jl_gc_alloc(jl_ptls_t ptls, size_t sz, void *ty); @@ -341,7 +411,9 @@ JL_DLLEXPORT jl_value_t 
*jl_apply_2va(jl_value_t *f, jl_value_t **args, uint32_t void jl_gc_sync_total_bytes(void); void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT; -void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT; +void jl_gc_count_allocd(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT; +void jl_gc_count_freed(void * addr, size_t sz, uint16_t tag) JL_NOTSAFEPOINT; +void jl_gc_count_reallocd(void * oldaddr, size_t oldsz, void * newaddr, size_t newsz, uint16_t tag) JL_NOTSAFEPOINT; void jl_gc_run_all_finalizers(jl_ptls_t ptls); void gc_queue_binding(jl_binding_t *bnd) JL_NOTSAFEPOINT; diff --git a/src/llvm-final-gc-lowering.cpp b/src/llvm-final-gc-lowering.cpp index a68b2fc193ed83..7a2d418e322eba 100644 --- a/src/llvm-final-gc-lowering.cpp +++ b/src/llvm-final-gc-lowering.cpp @@ -39,6 +39,7 @@ struct FinalLowerGC: public FunctionPass, private JuliaPassContext { Function *queueRootFunc; Function *poolAllocFunc; Function *bigAllocFunc; + Function *memprofileSetTypeofFunc; CallInst *ptlsStates; bool doInitialization(Module &M) override; @@ -60,6 +61,9 @@ struct FinalLowerGC: public FunctionPass, private JuliaPassContext { // Lowers a `julia.gc_alloc_bytes` intrinsic. Value *lowerGCAllocBytes(CallInst *target, Function &F); + // Lowers a `julia.gc_set_typeof` intrinsic. + Value *lowerGCSetTypeof(CallInst *target, Function &F); + // Lowers a `julia.queue_gc_root` intrinsic. Value *lowerQueueGCRoot(CallInst *target, Function &F); @@ -212,6 +216,13 @@ Value *FinalLowerGC::lowerGCAllocBytes(CallInst *target, Function &F) return newI; } +Value *FinalLowerGC::lowerGCSetTypeof(CallInst *target, Function &F) +{ + assert(target->getNumArgOperands() == 2); + target->setCalledFunction(memprofileSetTypeofFunc); + return target; +} + bool FinalLowerGC::doInitialization(Module &M) { // Initialize platform-agnostic references. 
initAll(M); @@ -220,8 +231,9 @@ bool FinalLowerGC::doInitialization(Module &M) { queueRootFunc = getOrDeclare(jl_well_known::GCQueueRoot); poolAllocFunc = getOrDeclare(jl_well_known::GCPoolAlloc); bigAllocFunc = getOrDeclare(jl_well_known::GCBigAlloc); + memprofileSetTypeofFunc = getOrDeclare(jl_well_known::MemProfileSetTypeof); - GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc}; + GlobalValue *functionList[] = {queueRootFunc, poolAllocFunc, bigAllocFunc, memprofileSetTypeofFunc}; unsigned j = 0; for (unsigned i = 0; i < sizeof(functionList) / sizeof(void*); i++) { if (!functionList[i]) @@ -301,6 +313,7 @@ bool FinalLowerGC::runOnFunction(Function &F) auto popGCFrameFunc = getOrNull(jl_intrinsics::popGCFrame); auto getGCFrameSlotFunc = getOrNull(jl_intrinsics::getGCFrameSlot); auto GCAllocBytesFunc = getOrNull(jl_intrinsics::GCAllocBytes); + auto GCSetTypeofFunc = getOrNull(jl_intrinsics::GCSetTypeof); auto queueGCRootFunc = getOrNull(jl_intrinsics::queueGCRoot); // Lower all calls to supported intrinsics. 
@@ -331,6 +344,9 @@ bool FinalLowerGC::runOnFunction(Function &F) else if (callee == GCAllocBytesFunc) { replaceInstruction(CI, lowerGCAllocBytes(CI, F), it); } + else if (callee == GCSetTypeofFunc) { + replaceInstruction(CI, lowerGCSetTypeof(CI, F), it); + } else if (callee == queueGCRootFunc) { replaceInstruction(CI, lowerQueueGCRoot(CI, F), it); } diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 03ed17887acab9..149b41007229c0 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -1765,6 +1765,9 @@ bool LateLowerGCFrame::CleanupIR(Function &F, State *S) { EmitTagPtr(builder, T_prjlvalue, newI)); store->setMetadata(LLVMContext::MD_tbaa, tbaa_tag); + auto setTypeofIntrinsic = getOrDeclare(jl_intrinsics::GCSetTypeof); + builder.CreateCall(setTypeofIntrinsic, {newI, CI->getArgOperand(2)}); + // Replace uses of the call to `julia.gc_alloc_obj` with the call to // `julia.gc_alloc_bytes`. CI->replaceAllUsesWith(newI); diff --git a/src/llvm-pass-helpers.cpp b/src/llvm-pass-helpers.cpp index 7efb5394c968a4..03169569421737 100644 --- a/src/llvm-pass-helpers.cpp +++ b/src/llvm-pass-helpers.cpp @@ -129,6 +129,7 @@ llvm::Function *JuliaPassContext::getOrDeclare( namespace jl_intrinsics { static const char *GET_GC_FRAME_SLOT_NAME = "julia.get_gc_frame_slot"; static const char *GC_ALLOC_BYTES_NAME = "julia.gc_alloc_bytes"; + static const char *GC_SET_TYPEOF_NAME = "julia.gc_set_typeof"; static const char *NEW_GC_FRAME_NAME = "julia.new_gc_frame"; static const char *PUSH_GC_FRAME_NAME = "julia.push_gc_frame"; static const char *POP_GC_FRAME_NAME = "julia.pop_gc_frame"; @@ -171,6 +172,18 @@ namespace jl_intrinsics { return addGCAllocAttributes(intrinsic, context.getLLVMContext()); }); + const IntrinsicDescription GCSetTypeof( + GC_SET_TYPEOF_NAME, + [](const JuliaPassContext &context) { + return Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, 
context.T_prjlvalue }, + false), + Function::ExternalLinkage, + GC_SET_TYPEOF_NAME); + }); + const IntrinsicDescription newGCFrame( NEW_GC_FRAME_NAME, [](const JuliaPassContext &context) { @@ -227,6 +240,7 @@ namespace jl_well_known { static const char *GC_BIG_ALLOC_NAME = "jl_gc_big_alloc"; static const char *GC_POOL_ALLOC_NAME = "jl_gc_pool_alloc"; static const char *GC_QUEUE_ROOT_NAME = "jl_gc_queue_root"; + static const char *MEMPROFILE_SET_TYPEOF_NAME = "jl_memprofile_set_typeof"; using jl_intrinsics::addGCAllocAttributes; @@ -271,4 +285,18 @@ namespace jl_well_known { func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); return func; }); + + const WellKnownFunctionDescription MemProfileSetTypeof( + MEMPROFILE_SET_TYPEOF_NAME, + [](const JuliaPassContext &context) { + auto func = Function::Create( + FunctionType::get( + Type::getVoidTy(context.getLLVMContext()), + { context.T_prjlvalue, context.T_prjlvalue }, + false), + Function::ExternalLinkage, + MEMPROFILE_SET_TYPEOF_NAME); + func->addFnAttr(Attribute::InaccessibleMemOrArgMemOnly); + return func; + }); } diff --git a/src/llvm-pass-helpers.h b/src/llvm-pass-helpers.h index 71cab27e76ceba..271fbc2b82b30b 100644 --- a/src/llvm-pass-helpers.h +++ b/src/llvm-pass-helpers.h @@ -121,6 +121,9 @@ namespace jl_intrinsics { // passed as an argument. extern const IntrinsicDescription GCAllocBytes; + // `julia.gc_set_typeof`: an intrinsic that tags an allocation with a type. + extern const IntrinsicDescription GCSetTypeof; + // `julia.new_gc_frame`: an intrinsic that creates a new GC frame. extern const IntrinsicDescription newGCFrame; @@ -152,6 +155,9 @@ namespace jl_well_known { // `jl_gc_queue_root`: queues a GC root. extern const WellKnownFunctionDescription GCQueueRoot; + + // `jl_memprofile_set_typeof`: informs the memory profiler about a type. 
+ extern const WellKnownFunctionDescription MemProfileSetTypeof; } #endif diff --git a/src/memprofile.c b/src/memprofile.c new file mode 100644 index 00000000000000..8d64279423b9b8 --- /dev/null +++ b/src/memprofile.c @@ -0,0 +1,203 @@ +#include +#include +#include +#include +#include "julia.h" +#include "julia_internal.h" +#include "threading.h" +#include "gc.h" + +JL_DLLEXPORT void jl_memprofile_clear_data(void) +{ + memprof_bt_data_size = 0; + memprof_alloc_data_size = 0; + memprof_overflow = 0; +} + +JL_DLLEXPORT int jl_memprofile_init(size_t bt_maxsize, size_t alloc_maxsize, uint16_t tag_filter) +{ + // Free previous profile buffers, if we have any + if (memprof_bt_data != NULL) { + free((void*)memprof_bt_data); + memprof_bt_data = NULL; + memprof_bt_data_size = 0; + memprof_bt_data_size_max = 0; + } + if (memprof_alloc_data != NULL) { + free((void*)memprof_alloc_data); + memprof_alloc_data = NULL; + memprof_alloc_data_size = 0; + memprof_alloc_data_size_max = 0; + } + + // Initialize new profile buffers. We assume at least 10x + memprof_bt_data = (uintptr_t*) calloc(bt_maxsize, sizeof(uintptr_t)); + if (memprof_bt_data == NULL && bt_maxsize > 0) { + return -1; + } + + memprof_alloc_data = (allocation_info_t*) calloc(alloc_maxsize, sizeof(allocation_info_t)); + if (memprof_alloc_data == NULL && alloc_maxsize > 0) { + // Cleanup the previous allocation in the event of failure, so that it + // cannot be used accidentally. + free((void*)memprof_bt_data); + memprof_bt_data = NULL; + return -1; + } + + memprof_bt_data_size_max = bt_maxsize; + memprof_alloc_data_size_max = alloc_maxsize; + + // We store the filter, but we always track deallocs. 
:P + memprof_tag_filter = tag_filter | JL_MEMPROF_TAG_DEALLOC; + jl_memprofile_clear_data(); + return 0; +} + +JL_DLLEXPORT uint8_t* jl_memprofile_get_bt_data(void) +{ + return (uint8_t*) memprof_bt_data; +} + +JL_DLLEXPORT uint8_t* jl_memprofile_get_alloc_data(void) +{ + return (uint8_t*) memprof_alloc_data; +} + +JL_DLLEXPORT size_t jl_memprofile_len_bt_data(void) +{ + return memprof_bt_data_size; +} + +JL_DLLEXPORT size_t jl_memprofile_len_alloc_data(void) +{ + return memprof_alloc_data_size; +} + +JL_DLLEXPORT size_t jl_memprofile_maxlen_bt_data(void) +{ + return memprof_bt_data_size_max; +} + +JL_DLLEXPORT size_t jl_memprofile_maxlen_alloc_data(void) +{ + return memprof_alloc_data_size_max; +} + +JL_DLLEXPORT int jl_memprofile_running(void) JL_NOTSAFEPOINT +{ + return memprof_running == 1; +} + +JL_DLLEXPORT int jl_memprofile_overflow(void) +{ + return memprof_overflow; +} + +JL_DLLEXPORT int jl_memprofile_tag_filter(void) +{ + return memprof_tag_filter; +} + +JL_DLLEXPORT void jl_memprofile_start(void) +{ + memprof_running = 1; +} + +JL_DLLEXPORT void jl_memprofile_stop(void) JL_NOTSAFEPOINT +{ + memprof_running = 0; +} + +// Helper function that makes it easy to take a chunk of plain-old-data that was +// allocated for an Array and find the "holding" jl_array_t object. +JL_DLLEXPORT jl_array_t * jl_memprofile_find_malloc_array(void * adata) +{ + // We walk every thread, so we need to disable the GC while we do this. 
+ int prev_value = jl_gc_enable(0); + + // For each thread + for (int t_i = 0; t_i < jl_n_threads; t_i++) { + // Get its thread-local storage + jl_ptls_t ptls2 = jl_all_tls_states[t_i]; + + // Take a look at the malloc'ed arrays for this thread + mallocarray_t *ma = ptls2->heap.mallocarrays; + + // Zoom through seeing if the given pointer matches this array's data pointer + while (ma != NULL) { + if (ma->a->data == adata) { + // If it matches, re-enable the GC and return that value + jl_gc_enable(prev_value); + return ma->a; + } + ma = ma->next; + } + } + + // We were unable to find it. :( + jl_gc_enable(prev_value); + return NULL; +} + +JL_DLLEXPORT void jl_memprofile_track_alloc(void *v, uint16_t tag, size_t allocsz) JL_NOTSAFEPOINT +{ + // Filter out this call with our tag filter + if ((tag & memprof_tag_filter) != tag) + return; + + // Store the current backtrace location into our buffer, and increment the + // buffer index by the number of elements added. + size_t bt_step = 0; + bt_step = rec_backtrace(memprof_bt_data + memprof_bt_data_size, + memprof_bt_data_size_max - memprof_bt_data_size - 1, + 0); + + // If we overran this buffer, then don't record the memory trace and quit. + if (bt_step == memprof_bt_data_size_max - memprof_bt_data_size) { + memprof_overflow |= JL_MEMPROF_BT_OVERFLOW; + jl_memprofile_stop(); + return; + } else { + // Otherwise, include this block and add a NULL-separator + memprof_bt_data_size += bt_step; + memprof_bt_data[memprof_bt_data_size++] = (uintptr_t)NULL; + } + + // Next up; store allocation/type information + memprof_alloc_data[memprof_alloc_data_size].memory_location = v; + memprof_alloc_data[memprof_alloc_data_size].time = jl_clock_now(); + + // If we are deallocating, then we set allocsz to 0 and must pair this alloc_data entry + // with a previous allocation within alloc_data to make sense of it. 
+ memprof_alloc_data[memprof_alloc_data_size].allocsz = allocsz; + + // In general, we don't know the type at this point; we make use of `jl_memprofile_set_typeof()`. + memprof_alloc_data[memprof_alloc_data_size].type = NULL; + + // Tags are used to track the "domain" of this chunk of memory + memprof_alloc_data[memprof_alloc_data_size].tag = tag; + + memprof_alloc_data_size++; + + if (memprof_alloc_data_size >= memprof_alloc_data_size_max) { + memprof_overflow |= JL_MEMPROF_ALLOC_OVERFLOW; + jl_memprofile_stop(); + } +} + +JL_DLLEXPORT void jl_memprofile_set_typeof(void * v, void * ty) JL_NOTSAFEPOINT +{ + // We only search the very last allocation + if (memprof_alloc_data_size > 0) { + if (memprof_alloc_data[memprof_alloc_data_size-1].memory_location == v) { + // If it matches, then set the type pointer + memprof_alloc_data[memprof_alloc_data_size-1].type = ty; + } + } +} + +JL_DLLEXPORT void jl_memprofile_track_dealloc(void *v, uint16_t tag) JL_NOTSAFEPOINT +{ + jl_memprofile_track_alloc(v, tag | JL_MEMPROF_TAG_DEALLOC, 0); +} diff --git a/src/support/timefuncs.c b/src/support/timefuncs.c index 031967638ec9e9..cfdff0683a256f 100644 --- a/src/support/timefuncs.c +++ b/src/support/timefuncs.c @@ -28,7 +28,7 @@ extern "C" { #endif -JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) +JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) JL_NOTSAFEPOINT { #if defined(_OS_WINDOWS_) struct __timeb64 tb; @@ -44,7 +44,7 @@ JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) return code; } -JL_DLLEXPORT double jl_clock_now(void) +JL_DLLEXPORT double jl_clock_now(void) JL_NOTSAFEPOINT { struct jl_timeval now; jl_gettimeofday(&now); diff --git a/src/support/timefuncs.h b/src/support/timefuncs.h index db3d0fccd97a9b..24a4add6b6bfe8 100644 --- a/src/support/timefuncs.h +++ b/src/support/timefuncs.h @@ -12,8 +12,8 @@ struct jl_timeval { int64_t usec; /* microseconds */ }; -JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv); -JL_DLLEXPORT double 
jl_clock_now(void); +JL_DLLEXPORT int jl_gettimeofday(struct jl_timeval *jtv) JL_NOTSAFEPOINT; +JL_DLLEXPORT double jl_clock_now(void) JL_NOTSAFEPOINT; void sleep_ms(int ms); #ifdef __cplusplus diff --git a/stdlib/Profile/docs/src/index.md b/stdlib/Profile/docs/src/index.md index ac60bb92cb5ed8..eba8299cff6214 100644 --- a/stdlib/Profile/docs/src/index.md +++ b/stdlib/Profile/docs/src/index.md @@ -7,11 +7,18 @@ Profile.@profile The methods in `Profile` are not exported and need to be called e.g. as `Profile.print()`. ```@docs -Profile.clear -Profile.print -Profile.init -Profile.fetch -Profile.retrieve +Profile.Time.clear +Profile.Time.print +Profile.Time.init +Profile.Time.fetch +Profile.Time.retrieve +``` + +```@docs +Profile.Memory.init +``` + +```@docs Profile.callers Profile.clear_malloc_data ``` diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl index 429bb80fe4f60b..f2371e035dfca8 100644 --- a/stdlib/Profile/src/Profile.jl +++ b/stdlib/Profile/src/Profile.jl @@ -5,850 +5,27 @@ Profiling support, main entry point is the [`@profile`](@ref) macro. """ module Profile -import Base.StackTraces: lookup, UNKNOWN, show_spec_linfo, StackFrame - -# deprecated functions: use `getdict` instead -lookup(ip::UInt) = lookup(convert(Ptr{Cvoid}, ip)) - -export @profile - -""" - @profile - -`@profile ` runs your expression while taking periodic backtraces. These are -appended to an internal buffer of backtraces. -""" -macro profile(ex) - return quote - try - status = start_timer() - if status < 0 - error(error_codes[status]) - end - $(esc(ex)) - finally - stop_timer() - end - end -end - -#### -#### User-level functions -#### - -""" - init(; n::Integer, delay::Real)) - -Configure the `delay` between backtraces (measured in seconds), and the number `n` of -instruction pointers that may be stored. Each instruction pointer corresponds to a single -line of code; backtraces generally consist of a long list of instruction pointers. 
Default -settings can be obtained by calling this function with no arguments, and each can be set -independently using keywords or in the order `(n, delay)`. -""" -function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing) - n_cur = ccall(:jl_profile_maxlen_data, Csize_t, ()) - delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9 - if n === nothing && delay === nothing - return Int(n_cur), delay_cur - end - nnew = (n === nothing) ? n_cur : n - delaynew = (delay === nothing) ? delay_cur : delay - init(nnew, delaynew) -end - -function init(n::Integer, delay::Real) - status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), n, round(UInt64,10^9*delay)) - if status == -1 - error("could not allocate space for ", n, " instruction pointers") - end -end - # init with default values -# Use a max size of 1M profile samples, and fire timer every 1ms -if Sys.iswindows() - __init__() = init(1_000_000, 0.01) -else - __init__() = init(1_000_000, 0.001) -end - -""" - clear() - -Clear any existing backtraces from the internal buffer. -""" -clear() = ccall(:jl_profile_clear_data, Cvoid, ()) - -const LineInfoDict = Dict{UInt64, Vector{StackFrame}} -const LineInfoFlatDict = Dict{UInt64, StackFrame} - -struct ProfileFormat - maxdepth::Int - mincount::Int - noisefloor::Float64 - sortedby::Symbol - combine::Bool - C::Bool - recur::Symbol - function ProfileFormat(; - C = false, - combine = true, - maxdepth::Int = typemax(Int), - mincount::Int = 0, - noisefloor = 0, - sortedby::Symbol = :filefuncline, - recur::Symbol = :off) - return new(maxdepth, mincount, noisefloor, sortedby, combine, C, recur) - end -end - -""" - print([io::IO = stdout,] [data::Vector]; kwargs...) - -Prints profiling results to `io` (by default, `stdout`). If you do not -supply a `data` vector, the internal buffer of accumulated backtraces -will be used. 
- -The keyword arguments can be any combination of: - - - `format` -- Determines whether backtraces are printed with (default, `:tree`) or without (`:flat`) - indentation indicating tree structure. - - - `C` -- If `true`, backtraces from C and Fortran code are shown (normally they are excluded). - - - `combine` -- If `true` (default), instruction pointers are merged that correspond to the same line of code. - - - `maxdepth` -- Limits the depth higher than `maxdepth` in the `:tree` format. - - - `sortedby` -- Controls the order in `:flat` format. `:filefuncline` (default) sorts by the source - line, `:count` sorts in order of number of collected samples, and `:overhead` sorts by the number of samples - incurred by each function by itself. - - - `noisefloor` -- Limits frames that exceed the heuristic noise floor of the sample (only applies to format `:tree`). - A suggested value to try for this is 2.0 (the default is 0). This parameter hides samples for which `n <= noisefloor * √N`, - where `n` is the number of samples on this line, and `N` is the number of samples for the callee. - - - `mincount` -- Limits the printout to only those lines with at least `mincount` occurrences. - - - `recur` -- Controls the recursion handling in `:tree` format. `:off` (default) prints the tree as normal. `:flat` instead - compresses any recursion (by ip), showing the approximate effect of converting any self-recursion into an iterator. - `:flatc` does the same but also includes collapsing of C frames (may do odd things around `jl_apply`). 
-""" -function print(io::IO, - data::Vector{<:Unsigned} = fetch(), - lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data) - ; - format = :tree, - C = false, - combine = true, - maxdepth::Int = typemax(Int), - mincount::Int = 0, - noisefloor = 0, - sortedby::Symbol = :filefuncline, - recur::Symbol = :off) - print(io, data, lidict, ProfileFormat( - C = C, - combine = combine, - maxdepth = maxdepth, - mincount = mincount, - noisefloor = noisefloor, - sortedby = sortedby, - recur = recur), - format) -end - -function print(io::IO, data::Vector{<:Unsigned}, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, format::Symbol) - cols::Int = Base.displaysize(io)[2] - data = convert(Vector{UInt64}, data) - fmt.recur ∈ (:off, :flat, :flatc) || throw(ArgumentError("recur value not recognized")) - if format === :tree - tree(io, data, lidict, cols, fmt) - elseif format === :flat - fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off")) - flat(io, data, lidict, cols, fmt) - else - throw(ArgumentError("output format $(repr(format)) not recognized")) - end -end - -""" - print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...) - -Prints profiling results to `io`. This variant is used to examine results exported by a -previous call to [`retrieve`](@ref). Supply the vector `data` of backtraces and -a dictionary `lidict` of line information. - -See `Profile.print([io], data)` for an explanation of the valid keyword arguments. -""" -print(data::Vector{<:Unsigned} = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = - print(stdout, data, lidict; kwargs...) - -""" - retrieve() -> data, lidict - -"Exports" profiling results in a portable format, returning the set of all backtraces -(`data`) and a dictionary that maps the (session-specific) instruction pointers in `data` to -`LineInfo` values that store the file name, function name, and line number. 
This function -allows you to save profiling results for future analysis. -""" -function retrieve() - data = fetch() - return (data, getdict(data)) -end - -function getdict(data::Vector{UInt}) - dict = LineInfoDict() - for ip in data - get!(() -> lookup(convert(Ptr{Cvoid}, ip)), dict, UInt64(ip)) - end - return dict -end - -""" - flatten(btdata::Vector, lidict::LineInfoDict) -> (newdata::Vector{UInt64}, newdict::LineInfoFlatDict) - -Produces "flattened" backtrace data. Individual instruction pointers -sometimes correspond to a multi-frame backtrace due to inlining; in -such cases, this function inserts fake instruction pointers for the -inlined calls, and returns a dictionary that is a 1-to-1 mapping -between instruction pointers and a single StackFrame. -""" -function flatten(data::Vector, lidict::LineInfoDict) - # Makes fake instruction pointers, counting down from typemax(UInt) - newip = typemax(UInt64) - 1 - taken = Set(keys(lidict)) # make sure we don't pick one that's already used - newdict = Dict{UInt64,StackFrame}() - newmap = Dict{UInt64,Vector{UInt64}}() - for (ip, trace) in lidict - if length(trace) == 1 - newdict[ip] = trace[1] - else - newm = UInt64[] - for sf in trace - while newip ∈ taken && newip > 0 - newip -= 1 - end - newip == 0 && error("all possible instruction pointers used") - push!(newm, newip) - newdict[newip] = sf - newip -= 1 - end - newmap[ip] = newm - end - end - newdata = UInt64[] - for ip::UInt64 in data - if haskey(newmap, ip) - append!(newdata, newmap[ip]) - else - push!(newdata, ip) - end +# Use a max size of 1M profile samples, and fire timer every 1ms, except on Windows +function __init__() + timer_ms = 0.001 + @static if Sys.iswindows() + timer_ms = 0.01 end - return (newdata, newdict) + Time.init(1_000_000, timer_ms) + Memory.init(20_000_000, 1_000_000, 0xffff) end -# Take a file-system path and try to form a concise representation of it -# based on the package ecosystem -function short_path(spath::Symbol, 
filenamecache::Dict{Symbol, String}) - return get!(filenamecache, spath) do - path = string(spath) - if isabspath(path) - if ispath(path) - # try to replace the file-system prefix with a short "@Module" one, - # assuming that profile came from the current machine - # (or at least has the same file-system layout) - root = path - while !isempty(root) - root, base = splitdir(root) - isempty(base) && break - @assert startswith(path, root) - for proj in Base.project_names - project_file = joinpath(root, proj) - if Base.isfile_casesensitive(project_file) - pkgid = Base.project_file_name_uuid(project_file, "") - isempty(pkgid.name) && return path # bad Project file - # return the joined the module name prefix and path suffix - path = path[nextind(path, sizeof(root)):end] - return string("@", pkgid.name, path) - end - end - end - end - return path - elseif isfile(joinpath(Sys.BINDIR::String, Base.DATAROOTDIR, "julia", "base", path)) - # do the same mechanic for Base (or Core/Compiler) files as above, - # but they start from a relative path - return joinpath("@Base", normpath(path)) - else - # for non-existent relative paths (such as "REPL[1]"), just consider simplifying them - return normpath(path) # drop leading "./" - end - end -end +include("stacktrace_tools.jl") +include("memory.jl") +include("time.jl") +include("compat.jl") -""" - callers(funcname, [data, lidict], [filename=], [linerange=]) -> Vector{Tuple{count, lineinfo}} - -Given a previous profiling run, determine who called a particular function. Supplying the -filename (and optionally, range of line numbers over which the function is defined) allows -you to disambiguate an overloaded method. The returned value is a vector containing a count -of the number of calls and line information about the caller. One can optionally supply -backtrace `data` obtained from [`retrieve`](@ref); otherwise, the current internal -profile buffer is used. 
-""" -function callers end - -function callers(funcname::String, bt::Vector, lidict::LineInfoFlatDict; filename = nothing, linerange = nothing) - if filename === nothing && linerange === nothing - return callersf(li -> String(li.func) == funcname, - bt, lidict) - end - filename === nothing && throw(ArgumentError("if supplying linerange, you must also supply the filename")) - filename = String(filename) - if linerange === nothing - return callersf(li -> String(li.func) == funcname && String(li.file) == filename, - bt, lidict) - else - return callersf(li -> String(li.func) == funcname && String(li.file) == filename && in(li.line, linerange), - bt, lidict) - end -end - -callers(funcname::String, bt::Vector, lidict::LineInfoDict; kwargs...) = - callers(funcname, flatten(bt, lidict)...; kwargs...) -callers(funcname::String; kwargs...) = callers(funcname, retrieve()...; kwargs...) -callers(func::Function, bt::Vector, lidict::LineInfoFlatDict; kwargs...) = - callers(string(func), bt, lidict; kwargs...) -callers(func::Function; kwargs...) = callers(string(func), retrieve()...; kwargs...) - -## -## For --track-allocation -## -# Reset the malloc log. Used to avoid counting memory allocated during -# compilation. - -""" - clear_malloc_data() - -Clears any stored memory allocation data when running julia with `--track-allocation`. -Execute the command(s) you want to test (to force JIT-compilation), then call -[`clear_malloc_data`](@ref). Then execute your command(s) again, quit -Julia, and examine the resulting `*.mem` files. 
-""" -clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) - -# C wrappers -start_timer() = ccall(:jl_profile_start_timer, Cint, ()) - -stop_timer() = ccall(:jl_profile_stop_timer, Cvoid, ()) - -is_running() = ccall(:jl_profile_is_running, Cint, ())!=0 - -get_data_pointer() = convert(Ptr{UInt}, ccall(:jl_profile_get_data, Ptr{UInt8}, ())) - -len_data() = convert(Int, ccall(:jl_profile_len_data, Csize_t, ())) - -maxlen_data() = convert(Int, ccall(:jl_profile_maxlen_data, Csize_t, ())) - -error_codes = Dict( - -1=>"cannot specify signal action for profiling", - -2=>"cannot create the timer for profiling", - -3=>"cannot start the timer for profiling", - -4=>"cannot unblock SIGUSR1") - - -""" - fetch() -> data - -Returns a copy of the buffer of profile backtraces. Note that the -values in `data` have meaning only on this machine in the current session, because it -depends on the exact memory addresses used in JIT-compiling. This function is primarily for -internal use; [`retrieve`](@ref) may be a better choice for most users. -""" -function fetch() - maxlen = maxlen_data() - len = len_data() - if (len == maxlen) - @warn """The profile data buffer is full; profiling probably terminated - before your program finished. 
To profile for longer runs, call - `Profile.init()` with a larger buffer and/or larger delay.""" - end - data = Vector{UInt}(undef, len) - GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len) - return data -end - - -## Print as a flat list -# Counts the number of times each line appears, at any nesting level and at the topmost level -# Merging multiple equivalent entries and recursive calls -function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool) where {T} - lilist = StackFrame[] - n = Int[] - m = Int[] - lilist_idx = Dict{T, Int}() - recursive = Set{T}() - first = true - totalshots = 0 - for ip in data - if ip == 0 - totalshots += 1 - empty!(recursive) - first = true - continue - end - frames = lidict[ip] - nframes = (frames isa Vector ? length(frames) : 1) - for i = 1:nframes - frame = (frames isa Vector ? frames[i] : frames) - !C && frame.from_c && continue - key = (T === UInt64 ? ip : frame) - idx = get!(lilist_idx, key, length(lilist) + 1) - if idx > length(lilist) - push!(recursive, key) - push!(lilist, frame) - push!(n, 1) - push!(m, 0) - elseif !(key in recursive) - push!(recursive, key) - n[idx] += 1 - end - if first - m[idx] += 1 - first = false - end - end - end - @assert length(lilist) == length(n) == length(m) == length(lilist_idx) - return (lilist, n, m, totalshots) -end - -function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat) - lilist, n, m, totalshots = parse_flat(fmt.combine ? 
StackFrame : UInt64, data, lidict, fmt.C) - if isempty(lilist) - warning_empty() - return - end - if false # optional: drop the "non-interpretable" ones - keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist) - lilist = lilist[keep] - n = n[keep] - m = m[keep] - end - filenamemap = Dict{Symbol,String}() - print_flat(io, lilist, n, m, cols, filenamemap, fmt) - Base.println(io, "Total snapshots: ", totalshots) - nothing -end - -function print_flat(io::IO, lilist::Vector{StackFrame}, - n::Vector{Int}, m::Vector{Int}, - cols::Int, filenamemap::Dict{Symbol,String}, - fmt::ProfileFormat) - if fmt.sortedby == :count - p = sortperm(n) - elseif fmt.sortedby == :overhead - p = sortperm(m) - else - p = liperm(lilist) - end - lilist = lilist[p] - n = n[p] - m = m[p] - filenames = String[short_path(li.file, filenamemap) for li in lilist] - funcnames = String[string(li.func) for li in lilist] - wcounts = max(6, ndigits(maximum(n))) - wself = max(9, ndigits(maximum(m))) - maxline = 1 - maxfile = 6 - maxfunc = 10 - for i in 1:length(lilist) - li = lilist[i] - maxline = max(maxline, li.line) - maxfunc = max(maxfunc, length(funcnames[i])) - maxfile = max(maxfile, length(filenames[i])) - end - wline = max(5, ndigits(maxline)) - ntext = max(20, cols - wcounts - wself - wline - 3) - maxfunc += 25 # for type signatures - if maxfile + maxfunc <= ntext - wfile = maxfile - wfunc = ntext - maxfunc # take the full width (for type sig) - else - wfile = 2*ntext÷5 - wfunc = 3*ntext÷5 - end - println(io, lpad("Count", wcounts, " "), " ", lpad("Overhead", wself, " "), " ", - rpad("File", wfile, " "), " ", lpad("Line", wline, " "), " Function") - println(io, lpad("=====", wcounts, " "), " ", lpad("========", wself, " "), " ", - rpad("====", wfile, " "), " ", lpad("====", wline, " "), " ========") - for i = 1:length(n) - n[i] < fmt.mincount && continue - li = lilist[i] - Base.print(io, lpad(string(n[i]), wcounts, " "), " ") - Base.print(io, lpad(string(m[i]), wself, " "), " ") - if li == 
UNKNOWN - if !fmt.combine && li.pointer != 0 - Base.print(io, "@0x", string(li.pointer, base=16)) - else - Base.print(io, "[any unknown stackframes]") - end - else - file = filenames[i] - isempty(file) && (file = "[unknown file]") - Base.print(io, rpad(rtruncto(file, wfile), wfile, " "), " ") - Base.print(io, lpad(li.line > 0 ? string(li.line) : "?", wline, " "), " ") - fname = funcnames[i] - if !li.from_c && li.linfo !== nothing - fname = sprint(show_spec_linfo, li) - end - isempty(fname) && (fname = "[unknown function]") - Base.print(io, ltruncto(fname, wfunc)) - end - println(io) - end - nothing -end - -## A tree representation - -# Representation of a prefix trie of backtrace counts -mutable struct StackFrameTree{T} # where T <: Union{UInt64, StackFrame} - # content fields: - frame::StackFrame - count::Int # number of frames this appeared in - overhead::Int # number frames where this was the code being executed - flat_count::Int # number of times this frame was in the flattened representation (unlike count, this'll sum to 100% of parent) - max_recur::Int # maximum number of times this frame was the *top* of the recursion in the stack - count_recur::Int # sum of the number of times this frame was the *top* of the recursion in a stack (divide by count to get an average) - down::Dict{T, StackFrameTree{T}} - # construction workers: - recur::Int - builder_key::Vector{UInt64} - builder_value::Vector{StackFrameTree{T}} - up::StackFrameTree{T} - StackFrameTree{T}() where {T} = new(UNKNOWN, 0, 0, 0, 0, 0, Dict{T, StackFrameTree{T}}(), 0, UInt64[], StackFrameTree{T}[]) -end - - -const indent_s = " ╎"^10 -const indent_z = collect(eachindex(indent_s)) -function indent(depth::Int) - depth < 1 && return "" - depth <= length(indent_z) && return indent_s[1:indent_z[depth]] - div, rem = divrem(depth, length(indent_z)) - return (indent_s^div) * SubString(indent_s, 1, indent_z[rem]) -end - -function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, maxes, 
filenamemap::Dict{Symbol,String}, showpointer::Bool) - nindent = min(cols>>1, level) - ndigoverhead = ndigits(maxes.overhead) - ndigcounts = ndigits(maxes.count) - ndigline = ndigits(maximum(frame.frame.line for frame in frames)) + 6 - ntext = max(30, cols - ndigoverhead - nindent - ndigcounts - ndigline - 6) - widthfile = 2*ntext÷5 # min 12 - widthfunc = 3*ntext÷5 # min 18 - strs = Vector{String}(undef, length(frames)) - showextra = false - if level > nindent - nextra = level - nindent - nindent -= ndigits(nextra) + 2 - showextra = true - end - for i = 1:length(frames) - frame = frames[i] - li = frame.frame - stroverhead = lpad(frame.overhead > 0 ? string(frame.overhead) : "", ndigoverhead, " ") - base = nindent == 0 ? "" : indent(nindent - 1) * " " - if showextra - base = string(base, "+", nextra, " ") - end - strcount = rpad(string(frame.count), ndigcounts, " ") - if li != UNKNOWN - if li.line == li.pointer - strs[i] = string(stroverhead, "╎", base, strcount, " ", - "[unknown function] (pointer: 0x", - string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), - ")") - else - if !li.from_c && li.linfo !== nothing - fname = sprint(show_spec_linfo, li) - else - fname = string(li.func) - end - filename = short_path(li.file, filenamemap) - if showpointer - fname = string( - "0x", - string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), - " ", - fname) - end - strs[i] = string(stroverhead, "╎", base, strcount, " ", - rtruncto(filename, widthfile), - ":", - li.line == -1 ? "?" 
: string(li.line), - "; ", - ltruncto(fname, widthfunc)) - end - else - strs[i] = string(stroverhead, "╎", base, strcount, " [unknown stackframe]") - end - end - return strs -end - -# turn a list of backtraces into a tree (implicitly separated by NULL markers) -function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol) where {T} - parent = root - tops = Vector{StackFrameTree{T}}() - build = Vector{StackFrameTree{T}}() - startframe = length(all) - for i in startframe:-1:1 - ip = all[i] - if ip == 0 - # sentinel value indicates the start of a new backtrace - empty!(build) - root.recur = 0 - if recur !== :off - # We mark all visited nodes to so we'll only count those branches - # once for each backtrace. Reset that now for the next backtrace. - push!(tops, parent) - for top in tops - while top.recur != 0 - top.max_recur < top.recur && (top.max_recur = top.recur) - top.recur = 0 - top = top.up - end - end - empty!(tops) - end - let this = parent - while this !== root - this.flat_count += 1 - this = this.up - end - end - parent.overhead += 1 - parent = root - root.count += 1 - startframe = i - else - pushfirst!(build, parent) - if recur === :flat || recur == :flatc - # Rewind the `parent` tree back, if this exact ip was already present *higher* in the current tree - found = false - for j in 1:(startframe - i) - if ip == all[i + j] - if recur === :flat # if not flattening C frames, check that now - frames = lidict[ip] - frame = (frames isa Vector ? 
frames[1] : frames) - frame.from_c && break # not flattening this frame - end - push!(tops, parent) - parent = build[j] - parent.recur += 1 - parent.count_recur += 1 - found = true - break - end - end - found && continue - end - builder_key = parent.builder_key - builder_value = parent.builder_value - fastkey = searchsortedfirst(builder_key, ip) - if fastkey < length(builder_key) && builder_key[fastkey] === ip - # jump forward to the end of the inlining chain - # avoiding an extra (slow) lookup of `ip` in `lidict` - # and an extra chain of them in `down` - # note that we may even have this === parent (if we're ignoring this frame ip) - this = builder_value[fastkey] - let this = this - while this !== parent && (recur === :off || this.recur == 0) - this.count += 1 - this.recur = 1 - this = this.up - end - end - parent = this - continue - end - frames = lidict[ip] - nframes = (frames isa Vector ? length(frames) : 1) - this = parent - # add all the inlining frames - for i = nframes:-1:1 - frame = (frames isa Vector ? frames[i] : frames) - !C && frame.from_c && continue - key = (T === UInt64 ? 
ip : frame) - this = get!(StackFrameTree{T}, parent.down, key) - if recur === :off || this.recur == 0 - this.frame = frame - this.up = parent - this.count += 1 - this.recur = 1 - end - parent = this - end - # record where the end of this chain is for this ip - insert!(builder_key, fastkey, ip) - insert!(builder_value, fastkey, this) - end - end - function cleanup!(node::StackFrameTree) - stack = [node] - while !isempty(stack) - node = pop!(stack) - node.recur = 0 - empty!(node.builder_key) - empty!(node.builder_value) - append!(stack, values(node.down)) - end - nothing - end - cleanup!(root) - return root -end - -function maxstats(root::StackFrameTree) - maxcount = Ref(0) - maxflatcount = Ref(0) - maxoverhead = Ref(0) - maxmaxrecur = Ref(0) - stack = [root] - while !isempty(stack) - node = pop!(stack) - maxcount[] = max(maxcount[], node.count) - maxoverhead[] = max(maxoverhead[], node.overhead) - maxflatcount[] = max(maxflatcount[], node.flat_count) - maxmaxrecur[] = max(maxmaxrecur[], node.max_recur) - append!(stack, values(node.down)) - end - return (count=maxcount[], count_flat=maxflatcount[], overhead=maxoverhead[], max_recur=maxmaxrecur[]) -end - -# Print the stack frame tree starting at a particular root. Uses a worklist to -# avoid stack overflows. 
-function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat) where T - maxes = maxstats(bt) - filenamemap = Dict{Symbol,String}() - worklist = [(bt, 0, 0, "")] - println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") - println(io, "=========================================================") - while !isempty(worklist) - (bt, level, noisefloor, str) = popfirst!(worklist) - isempty(str) || println(io, str) - level > fmt.maxdepth && continue - isempty(bt.down) && continue - # Order the line information - nexts = collect(values(bt.down)) - # Generate the string for each line - strs = tree_format(nexts, level, cols, maxes, filenamemap, T === UInt64) - # Recurse to the next level - if fmt.sortedby == :count - counts = collect(frame.count for frame in nexts) - p = sortperm(counts) - elseif fmt.sortedby == :overhead - m = collect(frame.overhead for frame in nexts) - p = sortperm(m) - elseif fmt.sortedby == :flat_count - m = collect(frame.flat_count for frame in nexts) - p = sortperm(m) - else - lilist = collect(frame.frame for frame in nexts) - p = liperm(lilist) - end - for i in reverse(p) - down = nexts[i] - count = down.count - count < fmt.mincount && continue - count < noisefloor && continue - str = strs[i] - noisefloor_down = fmt.noisefloor > 0 ? 
floor(Int, fmt.noisefloor * sqrt(count)) : 0 - pushfirst!(worklist, (down, level + 1, noisefloor_down, str)) - end - end -end - -function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat) - if fmt.combine - root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur) - else - root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur) - end - if isempty(root.down) - warning_empty() - return - end - print_tree(io, root, cols, fmt) - Base.println(io, "Total snapshots: ", root.count) - nothing -end - -function callersf(matchfunc::Function, bt::Vector, lidict::LineInfoFlatDict) - counts = Dict{StackFrame, Int}() - lastmatched = false - for id in bt - if id == 0 - lastmatched = false - continue - end - li = lidict[id] - if lastmatched - if haskey(counts, li) - counts[li] += 1 - else - counts[li] = 1 - end - end - lastmatched = matchfunc(li) - end - k = collect(keys(counts)) - v = collect(values(counts)) - p = sortperm(v, rev=true) - return [(v[i], k[i]) for i in p] -end - -# Utilities -function rtruncto(str::String, w::Int) - if length(str) <= w - return str - else - return string("...", str[prevind(str, end, w-4):end]) - end -end -function ltruncto(str::String, w::Int) - if length(str) <= w - return str - else - return string(str[1:nextind(str, 1, w-4)], "...") - end -end - - -truncto(str::Symbol, w::Int) = truncto(string(str), w) - -# Order alphabetically (file, function) and then by line number -function liperm(lilist::Vector{StackFrame}) - function lt(a::StackFrame, b::StackFrame) - a == UNKNOWN && return false - b == UNKNOWN && return true - fcmp = cmp(a.file, b.file) - fcmp < 0 && return true - fcmp > 0 && return false - fcmp = cmp(a.func, b.func) - fcmp < 0 && return true - fcmp > 0 && return false - fcmp = cmp(a.line, b.line) - fcmp < 0 && return true - return false - end - return sortperm(lilist, lt = lt) -end +using .Memory, .Time -warning_empty() = @warn """ - There were 
no samples collected. Run your program longer (perhaps by - running it multiple times), or adjust the delay between samples with - `Profile.init()`.""" +# Export our two measuring tools +import .Time: @profile +import .Memory: @memprofile +export @profile, @memprofile end # module diff --git a/stdlib/Profile/src/compat.jl b/stdlib/Profile/src/compat.jl new file mode 100644 index 00000000000000..29f74b4450a84b --- /dev/null +++ b/stdlib/Profile/src/compat.jl @@ -0,0 +1,22 @@ +# Compatibility shims for old users who aren't used to the `Time` and `Memory` sub-modules + +function init(args...; kwargs...) + @warn("Profile.init() is deprecated, use Profile.Time.init() or Profile.Memory.init() directly") + Time.init(args...; kwargs...) +end + +function print(args...; kwargs...) + @warn("Profile.print() is deprecated, use Profile.Time.print() or Profile.Memory.print() directly") + Time.print(args...; kwargs...) +end + + +""" + clear_malloc_data() + +Clears any stored memory allocation data when running julia with `--track-allocation`. +Execute the command(s) you want to test (to force JIT-compilation), then call +[`clear_malloc_data`](@ref). Then execute your command(s) again, quit +Julia, and examine the resulting `*.mem` files. +""" +clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) diff --git a/stdlib/Profile/src/memory.jl b/stdlib/Profile/src/memory.jl new file mode 100644 index 00000000000000..8e331b11698a17 --- /dev/null +++ b/stdlib/Profile/src/memory.jl @@ -0,0 +1,349 @@ +module Memory + +using ..Profile: getdict + +using Base.StackTraces + + +## +## For --track-allocation (unrelated to @memprofile) +## +# Reset the malloc log. Used to avoid counting memory allocated during +# compilation. + +""" + clear_malloc_data() + +Clears any stored memory allocation data when running julia with `--track-allocation`. +Execute the command(s) you want to test (to force JIT-compilation), then call +[`clear_malloc_data`](@ref). 
Then execute your command(s) again, quit +Julia, and examine the resulting `*.mem` files. +""" +clear_malloc_data() = ccall(:jl_clear_malloc_data, Cvoid, ()) + + +tag_lookup(d::Dict, s::Symbol) = d[s] +tag_lookup(d::Dict, v::Vector{Symbol}) = mapreduce(s->d[s], |, v) +tag_lookup(d::Dict, n::Number) = [t for (t, v) in d if (v & n == v) && (t != :all)] + +const memory_domain_map = Dict( + :cpu => 0x0001, + :gpu => 0x0002, + :external => 0x0080, + + :all => 0x00ff, +) +const allocator_map = Dict( + :std => 0x0100, + :pool => 0x0200, + :big => 0x0400, + + :all => 0xff00, +) +const closing_tag = 0x8000 + +""" + build_tag_filter(memory_domain = :all, allocator = :all) + +Build a memory profile tag filter that will capture only events matching the given +filter parameters. This method is meant to mirror the `#define`'d `JL_MEMPROF_TAG_*` +values within `julia_internal.h`. Valid memory domains are `:cpu`, `:gpu`, `:external` +and `:all`. Valid allocators are `:std`, `:pool`, `:big`, and `:all`. + +You can build a union of multiple allocators and memory domains by passing in a +vector of symbols, e.g. `build_tag_filter(:all, [:std, :pool])`. +""" +function build_tag_filter(memory_domain = :all, allocator = :all) + return tag_lookup(memory_domain_map, memory_domain) | tag_lookup(allocator_map, allocator) +end + +""" + init(; bt_size::Integer, alloc_size::Integer, tag_filter::UInt16) + +Configure the number `bt_size` of instruction pointers that may be stored for backtraces +of memory allocation and deallocation locations, as well as the number of allocation event +structures that are prepared for when profiling memory usage. Each instruction pointer +corresponds to a single line of code; backtraces generally consist of a long list of +instruction pointers. Default settings can be obtained by calling this function with no +arguments, and each can be set independently using keywords or in the order `(bt_size, alloc_size)`.
+ +`tag_filter` can be used to filter what kind of events are captured by the memory profiler, +see `build_tag_filter()` for more. +""" +function init(; bt_size::Union{Nothing,Integer} = nothing, + alloc_size::Union{Nothing,Integer} = nothing, + tag_filter::UInt16 = build_tag_filter()) + bt_size = something(bt_size, get_memprofile_bt_data_maxlen()) + alloc_size = something(alloc_size, get_memprofile_alloc_data_maxlen()) + + # Sub off to our friend + return init(bt_size, alloc_size, tag_filter) +end + +function init(bt_size::Integer, alloc_size::Integer, tag_filter::UInt16) + status = ccall(:jl_memprofile_init, Cint, (Csize_t, Csize_t, UInt16), bt_size, alloc_size, tag_filter) + if status == -1 + error("could not allocate space for ", bt_size, " instruction pointers and ", alloc_size, " allocation event structures") + end +end + +# This struct must stay in sync with `src/memprofile.c` +struct allocation_info_t + address::Ptr{Cvoid} + T::Ptr{Cvoid} + time::Float64 + allocsz::Csize_t + tag::UInt16 +end + +# This struct is a combined version of the above `allocation_info_t` with backtrace data +# and automatic GC marking for the objects that are still open so that they don't get +# collected while we're modifying them. +struct AllocationInfo + address::UInt + T::Any + alloc_time::Float64 + free_time::Float64 + alloc_stacktrace::Vector{StackFrame} + free_stacktrace::Vector{StackFrame} + allocsz::UInt64 + tag::UInt16 + + # For an "open" allocation, this is the object that is still alive. For a "closed" + # allocation, this is just `nothing`. + object::Any +end + +# If a chunk is "open", we know it's live so actually grab the object and its type +# The only subtlety is when it's a malloc'd list.
+function update_live_object(a::AllocationInfo) + array_addr = ccall(:jl_memprofile_find_malloc_array, Ptr{Cvoid}, (Ptr{Cvoid},), Ptr{Cvoid}(a.address)) + if array_addr != C_NULL + obj = unsafe_pointer_to_objref(array_addr) + else + # Not doing this right now, because it appears that they can ~~disappear~~ + obj = nothing # unsafe_pointer_to_objref(Ptr{Cvoid}(a.address)) + end + + return AllocationInfo( + a.address, + a.T, + a.alloc_time, + a.free_time, + a.alloc_stacktrace, + a.free_stacktrace, + a.allocsz, + a.tag, + obj + ) +end + +function Base.show(io::IO, a::AllocationInfo) + md = join(tag_lookup(memory_domain_map, a.tag), ", ") + at = join(tag_lookup(allocator_map, a.tag), ", ") + T = a.T === nothing ? "(?)" : string(a.T) + print(io, "AllocationInfo (@$(repr(a.address)): $(a.allocsz) B, domain: ($(md)), allocator: ($(at)), type: $(T))") +end + +""" + open_AI(a::allocation_info_t, alloc_stacktrace) + +Create an `AllocationInfo` that represents an allocated (and not yet deallocated) chunk +of memory. This `AllocationInfo` will later be "closed" to represent the full lifecycle +of an allocated and then freed piece of memory. This function has also been known to +create highly competitive dota agents. +""" +function open_AI(a::allocation_info_t, alloc_stacktrace) + # If a_type is still C_NULL, just set the type as `nothing`: + if a.T == C_NULL + a_type = nothing + else + a_type = unsafe_pointer_to_objref(a.T) + end + + return AllocationInfo( + UInt(a.address), + a_type, + a.time, + 0.0, + alloc_stacktrace, + StackFrame[], + UInt64(a.allocsz), + a.tag, + + # We initialize this as `nothing` because we don't know if this + # object is actually still live or not. + nothing, + ) +end + +""" + close_AI(a::AllocationInfo, d::allocation_info_t, dealloc_stacktrace) + +Mark a previously opened `AllocationInfo` as closed, denoting that we know both the +allocation and deallocation points for this chunk of memory. 
+""" +function close_AI(a::AllocationInfo, d::allocation_info_t, dealloc_stacktrace) + # Ensure that we're dealing with the same memory location + @assert a.address == UInt(d.address) "Attempting to close different memory locations! ($(a.address) != $(UInt(d.address)))" + @assert (a.tag | closing_tag) == d.tag "Attempting to close memory from different allocators/domains! ($(a.tag | closing_tag) != $(d.tag))" + + # If a_type is still C_NULL, just set the type as `nothing`: + if d.T == C_NULL + d_type = nothing + else + d_type = unsafe_pointer_to_objref(d.T) + end + + nstr(x) = x === nothing ? "nothing" : string(x) + @assert ((d_type == nothing || a.T == nothing) || d_type == a.T) "Mismatched types! ($(nstr(d_type)) != $(nstr(a.T)))" + + return AllocationInfo( + a.address, + something(a.T, d_type, Some(nothing)), + a.alloc_time, + d.time, + a.alloc_stacktrace, + dealloc_stacktrace, + a.allocsz, + a.tag, + nothing, + ) +end + +macro memprofile(ex) + return quote + try + # Run a GC before we start profiling, to eliminate "ghost" chunks + Base.GC.gc() + start_memprofile() + + # Explicit `begin/end` to help GC scope resolution + begin + $(esc(ex)) + end + finally + # Run a GC afterwards to eliminate things that aren't actually leaked, + # they just weren't cleaned up before we stopped profiling. 
+ Base.GC.gc() + stop_memprofile() + end + end +end + +start_memprofile() = ccall(:jl_memprofile_start, Cvoid, ()) +stop_memprofile() = ccall(:jl_memprofile_stop, Cvoid, ()) + +get_memprofile_bt_data() = ccall(:jl_memprofile_get_bt_data, Ptr{UInt}, ()) +get_memprofile_bt_data_len() = convert(Int, ccall(:jl_memprofile_len_bt_data, Csize_t, ())) +get_memprofile_bt_data_maxlen() = convert(Int, ccall(:jl_memprofile_maxlen_bt_data, Csize_t, ())) + +get_memprofile_alloc_data() = ccall(:jl_memprofile_get_alloc_data, Ptr{allocation_info_t}, ()) +get_memprofile_alloc_data_len() = convert(Int, ccall(:jl_memprofile_len_alloc_data, Csize_t, ())) +get_memprofile_alloc_data_maxlen() = convert(Int, ccall(:jl_memprofile_maxlen_alloc_data, Csize_t, ())) + +get_memprofile_overflow() = ccall(:jl_memprofile_overflow, Cint, ()) +get_memprofile_tag_filter() = UInt16(ccall(:jl_memprofile_tag_filter, Cint, ())) + +""" + read_and_coalesce_memprofile_data() + +Load in the backtrace and allocation data buffers from the C side of Julia-land, returns two +vectors, `open_chunks` and `closed_chunks`, which represent all allocations that are yet to +be freed, and all allocations that were properly freed. +""" +function read_and_coalesce_memprofile_data() + overflow = get_memprofile_overflow() + if overflow & 0x01 != 0 + @warn """The memory profile backtrace buffer overflowed; profiling terminated + before your program finished. To profile for longer runs, call + `Profile.init()` with a larger buffer and/or larger delay.""" + end + if overflow & 0x02 != 0 + @warn """The memory profile allocation info buffer overflowed; profiling terminated + before your program finished. 
To profile for longer runs, call + `Profile.init()` with a larger buffer and/or larger delay.""" + end + + bt_data = unsafe_wrap(Array, get_memprofile_bt_data(), (get_memprofile_bt_data_len(),)) + alloc_data = unsafe_wrap(Array, get_memprofile_alloc_data(), (get_memprofile_alloc_data_len(),)) + + # Build backtrace lookup table + bt_data = Base._reformat_bt(bt_data) + bt_lookup = getdict(bt_data) + + # These are chunks of memory that have been allocated, but not yet freed. They are keyed by + # (memory_address, tag) for easy lookup + open_chunks_map = Dict{Tuple{UInt,UInt16},AllocationInfo}() + + # These are chunks of memory that have been allocated and then freed. They will start in + # open_chunks_map, then get migrated over here once they are freed. + closed_chunks = AllocationInfo[] + + # These are chunks of memory that were free'd but we didn't see them get created. + ghost_chunks = Tuple[] + + # Loop over every event in `alloc_data`, looking for identical memory pointers; attempt to pair + # them together to create "closed" chunks: + bt_idx = 1 + for a in alloc_data + # Helper function to extract the next backtrace from the buffer + function next_bt_trace(bt_data, bt_idx) + bt_end_idx = bt_idx + while bt_data[bt_end_idx] != 0x0 + bt_end_idx += 1 + end + bt = [bt_lookup[x][1] for x in bt_data[bt_idx:bt_end_idx-1]] + return (bt, bt_end_idx + 1) + end + + # Extract the backtrace associated with this allocation event + bt, bt_idx = next_bt_trace(bt_data, bt_idx) + + # Construct the "chunk identifier" that we use to look up within open_chunks_map, etc... + chunk_id = (UInt(a.address), a.tag | closing_tag) + + # Are we an allocation? + if a.allocsz != 0 + @info("Opening $(chunk_id) $(a.T) $(a.allocsz)") + + # Assert that we're not inserting an identical chunk + @assert !(chunk_id in keys(open_chunks_map)) "Doubly-opened memory chunk!"
+ open_chunks_map[chunk_id] = open_AI(a, bt) + else + @info("Closing $(chunk_id) $(a.T)") + + # If this `a` represents a free(), let's see if we're closing a previously opened chunk. + if !(chunk_id in keys(open_chunks_map)) + push!(ghost_chunks, (a, bt)) + else + # "Close" the chunk by adding deallocation information to it, then remove + # that chunk from the open_chunks_map! + push!(closed_chunks, close_AI(open_chunks_map[chunk_id], a, bt)) + delete!(open_chunks_map, chunk_id) + end + end + end + + if !isempty(ghost_chunks) + @warn "Attempted to close $(length(ghost_chunks)) ghost memory chunks!" + end + + # Any leftover "open" chunks are just plain old leaked memory. + open_chunks = sort(collect(values(open_chunks_map)); by = x -> x.alloc_time) + closed_chunks = sort(closed_chunks, by = x -> x.alloc_time) + + # Leftover "open" chunks still have their memory, so let's get their types! + open_chunks = update_live_object.(open_chunks) + return open_chunks, closed_chunks, ghost_chunks +end + + +""" + clear() + +Clear any existing memory profile data from the internal buffers. 
+""" +clear_memprofile_data() = ccall(:jl_memprofile_clear_data, Cvoid, ()) + +end # module Memory diff --git a/stdlib/Profile/src/stacktrace_tools.jl b/stdlib/Profile/src/stacktrace_tools.jl new file mode 100644 index 00000000000000..8d0ef7dad182ad --- /dev/null +++ b/stdlib/Profile/src/stacktrace_tools.jl @@ -0,0 +1,628 @@ +import Base.StackTraces: lookup, UNKNOWN, show_spec_linfo, StackFrame + +# deprecated functions: use `getdict` instead +lookup(ip::UInt) = lookup(convert(Ptr{Cvoid}, ip)) + + +const IP = Union{UInt,Base.InterpreterIP} +const LineInfoDict = Dict{IP, Vector{StackFrame}} +const LineInfoFlatDict = Dict{IP, StackFrame} + +struct ProfileFormat + maxdepth::Int + mincount::Int + noisefloor::Float64 + sortedby::Symbol + combine::Bool + C::Bool + recur::Symbol + function ProfileFormat(; + C = false, + combine = true, + maxdepth::Int = typemax(Int), + mincount::Int = 0, + noisefloor = 0, + sortedby::Symbol = :filefuncline, + recur::Symbol = :off) + return new(maxdepth, mincount, noisefloor, sortedby, combine, C, recur) + end +end + +""" + getdict(data::Vector{UInt}) +Given a list of instruction pointers, build a dictionary mapping from those pointers to +LineInfo objects (as returned by `StackTraces.lookup()`). This allows us to quickly take +a list of instruction pointers and convert it to `LineInfo` objects. +""" +function getdict(data) + return LineInfoDict(x=>lookup(x) for x in unique(data)) +end + +""" + flatten(btdata::Vector, lidict::LineInfoDict) -> (newdata::Vector{UInt64}, newdict::LineInfoFlatDict) + +Produces "flattened" backtrace data. Individual instruction pointers +sometimes correspond to a multi-frame backtrace due to inlining; in +such cases, this function inserts fake instruction pointers for the +inlined calls, and returns a dictionary that is a 1-to-1 mapping +between instruction pointers and a single StackFrame. 
+""" +function flatten(data::Vector, lidict::LineInfoDict) + # Makes fake instruction pointers, counting down from typemax(UInt) + newip = typemax(UInt64) - 1 + taken = Set(keys(lidict)) # make sure we don't pick one that's already used + newdict = Dict{IP,StackFrame}() + newmap = Dict{IP,Vector{UInt64}}() + for (ip, trace) in lidict + if length(trace) == 1 + newdict[ip] = trace[1] + else + newm = UInt64[] + for sf in trace + while newip ∈ taken && newip > 0 + newip -= 1 + end + newip == 0 && error("all possible instruction pointers used") + push!(newm, newip) + newdict[newip] = sf + newip -= 1 + end + newmap[ip] = newm + end + end + newdata = UInt64[] + for ip::UInt64 in data + if haskey(newmap, ip) + append!(newdata, newmap[ip]) + else + push!(newdata, ip) + end + end + return (newdata, newdict) +end + +# Take a file-system path and try to form a concise representation of it +# based on the package ecosystem +function short_path(spath::Symbol, filenamecache::Dict{Symbol, String}) + return get!(filenamecache, spath) do + path = string(spath) + if isabspath(path) + if ispath(path) + # try to replace the file-system prefix with a short "@Module" one, + # assuming that profile came from the current machine + # (or at least has the same file-system layout) + root = path + while !isempty(root) + root, base = splitdir(root) + isempty(base) && break + @assert startswith(path, root) + for proj in Base.project_names + project_file = joinpath(root, proj) + if Base.isfile_casesensitive(project_file) + pkgid = Base.project_file_name_uuid(project_file, "") + isempty(pkgid.name) && return path # bad Project file + # return the joined the module name prefix and path suffix + path = path[nextind(path, sizeof(root)):end] + return string("@", pkgid.name, path) + end + end + end + end + return path + elseif isfile(joinpath(Sys.BINDIR::String, Base.DATAROOTDIR, "julia", "base", path)) + # do the same mechanic for Base (or Core/Compiler) files as above, + # but they start from a 
relative path + return joinpath("@Base", normpath(path)) + else + # for non-existent relative paths (such as "REPL[1]"), just consider simplifying them + return normpath(path) # drop leading "./" + end + end +end + +""" + callers(funcname, [data, lidict], [filename=], [linerange=]) -> Vector{Tuple{count, lineinfo}} + +Given a previous profiling run, determine who called a particular function. Supplying the +filename (and optionally, range of line numbers over which the function is defined) allows +you to disambiguate an overloaded method. The returned value is a vector containing a count +of the number of calls and line information about the caller. One can optionally supply +backtrace `data` obtained from [`Time.retrieve`](@ref); otherwise, the current internal +profile buffer is used. +""" +function callers end + +function callers(funcname::String, bt::Vector, lidict::LineInfoFlatDict; filename = nothing, linerange = nothing) + if filename === nothing && linerange === nothing + return callersf(li -> String(li.func) == funcname, + bt, lidict) + end + filename === nothing && throw(ArgumentError("if supplying linerange, you must also supply the filename")) + filename = String(filename) + if linerange === nothing + return callersf(li -> String(li.func) == funcname && String(li.file) == filename, + bt, lidict) + else + return callersf(li -> String(li.func) == funcname && String(li.file) == filename && in(li.line, linerange), + bt, lidict) + end +end + +callers(funcname::String, bt::Vector, lidict::LineInfoDict; kwargs...) = + callers(funcname, flatten(bt, lidict)...; kwargs...) +callers(func::Function, bt::Vector, lidict::LineInfoFlatDict; kwargs...) = + callers(string(func), bt, lidict; kwargs...) 
+ + +## Print as a flat list +# Counts the number of times each line appears, at any nesting level and at the topmost level +# Merging multiple equivalent entries and recursive calls +function parse_flat(::Type{T}, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, C::Bool) where {T} + lilist = StackFrame[] + n = Int[] + m = Int[] + lilist_idx = Dict{T, Int}() + recursive = Set{T}() + first = true + totalshots = 0 + for ip in data + if ip == 0 + totalshots += 1 + empty!(recursive) + first = true + continue + end + frames = lidict[ip] + nframes = (frames isa Vector ? length(frames) : 1) + for i = 1:nframes + frame = (frames isa Vector ? frames[i] : frames) + !C && frame.from_c && continue + key = (T === UInt64 ? ip : frame) + idx = get!(lilist_idx, key, length(lilist) + 1) + if idx > length(lilist) + push!(recursive, key) + push!(lilist, frame) + push!(n, 1) + push!(m, 0) + elseif !(key in recursive) + push!(recursive, key) + n[idx] += 1 + end + if first + m[idx] += 1 + first = false + end + end + end + @assert length(lilist) == length(n) == length(m) == length(lilist_idx) + return (lilist, n, m, totalshots) +end + +function flat(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoDict, LineInfoFlatDict}, cols::Int, fmt::ProfileFormat) + lilist, n, m, totalshots = parse_flat(fmt.combine ? 
StackFrame : UInt64, data, lidict, fmt.C) + if isempty(lilist) + warning_empty() + return + end + if false # optional: drop the "non-interpretable" ones + keep = map(frame -> frame != UNKNOWN && frame.line != 0, lilist) + lilist = lilist[keep] + n = n[keep] + m = m[keep] + end + filenamemap = Dict{Symbol,String}() + print_flat(io, lilist, n, m, cols, filenamemap, fmt) + Base.println(io, "Total snapshots: ", totalshots) + nothing +end + +function print_flat(io::IO, lilist::Vector{StackFrame}, + n::Vector{Int}, m::Vector{Int}, + cols::Int, filenamemap::Dict{Symbol,String}, + fmt::ProfileFormat) + if fmt.sortedby == :count + p = sortperm(n) + elseif fmt.sortedby == :overhead + p = sortperm(m) + else + p = liperm(lilist) + end + lilist = lilist[p] + n = n[p] + m = m[p] + filenames = String[short_path(li.file, filenamemap) for li in lilist] + funcnames = String[string(li.func) for li in lilist] + wcounts = max(6, ndigits(maximum(n))) + wself = max(9, ndigits(maximum(m))) + maxline = 1 + maxfile = 6 + maxfunc = 10 + for i in 1:length(lilist) + li = lilist[i] + maxline = max(maxline, li.line) + maxfunc = max(maxfunc, length(funcnames[i])) + maxfile = max(maxfile, length(filenames[i])) + end + wline = max(5, ndigits(maxline)) + ntext = max(20, cols - wcounts - wself - wline - 3) + maxfunc += 25 # for type signatures + if maxfile + maxfunc <= ntext + wfile = maxfile + wfunc = ntext - maxfunc # take the full width (for type sig) + else + wfile = 2*ntext÷5 + wfunc = 3*ntext÷5 + end + println(io, lpad("Count", wcounts, " "), " ", lpad("Overhead", wself, " "), " ", + rpad("File", wfile, " "), " ", lpad("Line", wline, " "), " Function") + println(io, lpad("=====", wcounts, " "), " ", lpad("========", wself, " "), " ", + rpad("====", wfile, " "), " ", lpad("====", wline, " "), " ========") + for i = 1:length(n) + n[i] < fmt.mincount && continue + li = lilist[i] + Base.print(io, lpad(string(n[i]), wcounts, " "), " ") + Base.print(io, lpad(string(m[i]), wself, " "), " ") + if li == 
UNKNOWN + if !fmt.combine && li.pointer != 0 + Base.print(io, "@0x", string(li.pointer, base=16)) + else + Base.print(io, "[any unknown stackframes]") + end + else + file = filenames[i] + isempty(file) && (file = "[unknown file]") + Base.print(io, rpad(rtruncto(file, wfile), wfile, " "), " ") + Base.print(io, lpad(li.line > 0 ? string(li.line) : "?", wline, " "), " ") + fname = funcnames[i] + if !li.from_c && li.linfo !== nothing + fname = sprint(show_spec_linfo, li) + end + isempty(fname) && (fname = "[unknown function]") + Base.print(io, ltruncto(fname, wfunc)) + end + println(io) + end + nothing +end + +## A tree representation + +# Representation of a prefix trie of backtrace counts +mutable struct StackFrameTree{T} # where T <: Union{UInt64, StackFrame} + # content fields: + frame::StackFrame + count::Int # number of frames this appeared in + overhead::Int # number frames where this was the code being executed + flat_count::Int # number of times this frame was in the flattened representation (unlike count, this'll sum to 100% of parent) + max_recur::Int # maximum number of times this frame was the *top* of the recursion in the stack + count_recur::Int # sum of the number of times this frame was the *top* of the recursion in a stack (divide by count to get an average) + down::Dict{T, StackFrameTree{T}} + # construction workers: + recur::Int + builder_key::Vector{UInt64} + builder_value::Vector{StackFrameTree{T}} + up::StackFrameTree{T} + StackFrameTree{T}() where {T} = new(UNKNOWN, 0, 0, 0, 0, 0, Dict{T, StackFrameTree{T}}(), 0, UInt64[], StackFrameTree{T}[]) +end + + +const indent_s = " ╎"^10 +const indent_z = collect(eachindex(indent_s)) +function indent(depth::Int) + depth < 1 && return "" + depth <= length(indent_z) && return indent_s[1:indent_z[depth]] + div, rem = divrem(depth, length(indent_z)) + return (indent_s^div) * SubString(indent_s, 1, indent_z[rem]) +end + +function tree_format(frames::Vector{<:StackFrameTree}, level::Int, cols::Int, maxes, 
filenamemap::Dict{Symbol,String}, showpointer::Bool) + nindent = min(cols>>1, level) + ndigoverhead = ndigits(maxes.overhead) + ndigcounts = ndigits(maxes.count) + ndigline = ndigits(maximum(frame.frame.line for frame in frames)) + 6 + ntext = max(30, cols - ndigoverhead - nindent - ndigcounts - ndigline - 6) + widthfile = 2*ntext÷5 # min 12 + widthfunc = 3*ntext÷5 # min 18 + strs = Vector{String}(undef, length(frames)) + showextra = false + if level > nindent + nextra = level - nindent + nindent -= ndigits(nextra) + 2 + showextra = true + end + for i = 1:length(frames) + frame = frames[i] + li = frame.frame + stroverhead = lpad(frame.overhead > 0 ? string(frame.overhead) : "", ndigoverhead, " ") + base = nindent == 0 ? "" : indent(nindent - 1) * " " + if showextra + base = string(base, "+", nextra, " ") + end + strcount = rpad(string(frame.count), ndigcounts, " ") + if li != UNKNOWN + if li.line == li.pointer + strs[i] = string(stroverhead, "╎", base, strcount, " ", + "[unknown function] (pointer: 0x", + string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), + ")") + else + if !li.from_c && li.linfo !== nothing + fname = sprint(show_spec_linfo, li) + else + fname = string(li.func) + end + filename = short_path(li.file, filenamemap) + if showpointer + fname = string( + "0x", + string(li.pointer, base = 16, pad = 2*sizeof(Ptr{Cvoid})), + " ", + fname) + end + strs[i] = string(stroverhead, "╎", base, strcount, " ", + rtruncto(filename, widthfile), + ":", + li.line == -1 ? "?" 
: string(li.line), + "; ", + ltruncto(fname, widthfunc)) + end + else + strs[i] = string(stroverhead, "╎", base, strcount, " [unknown stackframe]") + end + end + return strs +end + +# turn a list of backtraces into a tree (implicitly separated by NULL markers) +function tree!(root::StackFrameTree{T}, all::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, C::Bool, recur::Symbol) where {T} + parent = root + tops = Vector{StackFrameTree{T}}() + build = Vector{StackFrameTree{T}}() + startframe = length(all) + for i in startframe:-1:1 + ip = all[i] + if ip == 0 + # sentinel value indicates the start of a new backtrace + empty!(build) + root.recur = 0 + if recur !== :off + # We mark all visited nodes to so we'll only count those branches + # once for each backtrace. Reset that now for the next backtrace. + push!(tops, parent) + for top in tops + while top.recur != 0 + top.max_recur < top.recur && (top.max_recur = top.recur) + top.recur = 0 + top = top.up + end + end + empty!(tops) + end + let this = parent + while this !== root + this.flat_count += 1 + this = this.up + end + end + parent.overhead += 1 + parent = root + root.count += 1 + startframe = i + else + pushfirst!(build, parent) + if recur === :flat || recur == :flatc + # Rewind the `parent` tree back, if this exact ip was already present *higher* in the current tree + found = false + for j in 1:(startframe - i) + if ip == all[i + j] + if recur === :flat # if not flattening C frames, check that now + frames = lidict[ip] + frame = (frames isa Vector ? 
frames[1] : frames) + frame.from_c && break # not flattening this frame + end + push!(tops, parent) + parent = build[j] + parent.recur += 1 + parent.count_recur += 1 + found = true + break + end + end + found && continue + end + builder_key = parent.builder_key + builder_value = parent.builder_value + fastkey = searchsortedfirst(builder_key, ip) + if fastkey < length(builder_key) && builder_key[fastkey] === ip + # jump forward to the end of the inlining chain + # avoiding an extra (slow) lookup of `ip` in `lidict` + # and an extra chain of them in `down` + # note that we may even have this === parent (if we're ignoring this frame ip) + this = builder_value[fastkey] + let this = this + while this !== parent && (recur === :off || this.recur == 0) + this.count += 1 + this.recur = 1 + this = this.up + end + end + parent = this + continue + end + frames = lidict[ip] + nframes = (frames isa Vector ? length(frames) : 1) + this = parent + # add all the inlining frames + for i = nframes:-1:1 + frame = (frames isa Vector ? frames[i] : frames) + !C && frame.from_c && continue + key = (T === UInt64 ? 
ip : frame) + this = get!(StackFrameTree{T}, parent.down, key) + if recur === :off || this.recur == 0 + this.frame = frame + this.up = parent + this.count += 1 + this.recur = 1 + end + parent = this + end + # record where the end of this chain is for this ip + insert!(builder_key, fastkey, ip) + insert!(builder_value, fastkey, this) + end + end + function cleanup!(node::StackFrameTree) + stack = [node] + while !isempty(stack) + node = pop!(stack) + node.recur = 0 + empty!(node.builder_key) + empty!(node.builder_value) + append!(stack, values(node.down)) + end + nothing + end + cleanup!(root) + return root +end + +function maxstats(root::StackFrameTree) + maxcount = Ref(0) + maxflatcount = Ref(0) + maxoverhead = Ref(0) + maxmaxrecur = Ref(0) + stack = [root] + while !isempty(stack) + node = pop!(stack) + maxcount[] = max(maxcount[], node.count) + maxoverhead[] = max(maxoverhead[], node.overhead) + maxflatcount[] = max(maxflatcount[], node.flat_count) + maxmaxrecur[] = max(maxmaxrecur[], node.max_recur) + append!(stack, values(node.down)) + end + return (count=maxcount[], count_flat=maxflatcount[], overhead=maxoverhead[], max_recur=maxmaxrecur[]) +end + +# Print the stack frame tree starting at a particular root. Uses a worklist to +# avoid stack overflows. 
+function print_tree(io::IO, bt::StackFrameTree{T}, cols::Int, fmt::ProfileFormat) where T + maxes = maxstats(bt) + filenamemap = Dict{Symbol,String}() + worklist = [(bt, 0, 0, "")] + println(io, "Overhead ╎ [+additional indent] Count File:Line; Function") + println(io, "=========================================================") + while !isempty(worklist) + (bt, level, noisefloor, str) = popfirst!(worklist) + isempty(str) || println(io, str) + level > fmt.maxdepth && continue + isempty(bt.down) && continue + # Order the line information + nexts = collect(values(bt.down)) + # Generate the string for each line + strs = tree_format(nexts, level, cols, maxes, filenamemap, T === UInt64) + # Recurse to the next level + if fmt.sortedby == :count + counts = collect(frame.count for frame in nexts) + p = sortperm(counts) + elseif fmt.sortedby == :overhead + m = collect(frame.overhead for frame in nexts) + p = sortperm(m) + elseif fmt.sortedby == :flat_count + m = collect(frame.flat_count for frame in nexts) + p = sortperm(m) + else + lilist = collect(frame.frame for frame in nexts) + p = liperm(lilist) + end + for i in reverse(p) + down = nexts[i] + count = down.count + count < fmt.mincount && continue + count < noisefloor && continue + str = strs[i] + noisefloor_down = fmt.noisefloor > 0 ? 
floor(Int, fmt.noisefloor * sqrt(count)) : 0 + pushfirst!(worklist, (down, level + 1, noisefloor_down, str)) + end + end +end + +function tree(io::IO, data::Vector{UInt64}, lidict::Union{LineInfoFlatDict, LineInfoDict}, cols::Int, fmt::ProfileFormat) + if fmt.combine + root = tree!(StackFrameTree{StackFrame}(), data, lidict, fmt.C, fmt.recur) + else + root = tree!(StackFrameTree{UInt64}(), data, lidict, fmt.C, fmt.recur) + end + if isempty(root.down) + warning_empty() + return + end + print_tree(io, root, cols, fmt) + Base.println(io, "Total snapshots: ", root.count) + nothing +end + +function callersf(matchfunc::Function, bt::Vector, lidict::LineInfoFlatDict) + counts = Dict{StackFrame, Int}() + lastmatched = false + for id in bt + if id == 0 + lastmatched = false + continue + end + li = lidict[id] + if lastmatched + if haskey(counts, li) + counts[li] += 1 + else + counts[li] = 1 + end + end + lastmatched = matchfunc(li) + end + k = collect(keys(counts)) + v = collect(values(counts)) + p = sortperm(v, rev=true) + return [(v[i], k[i]) for i in p] +end + +# Utilities +function rtruncto(str::String, w::Int) + if length(str) <= w + return str + else + return string("...", str[prevind(str, end, w-4):end]) + end +end +function ltruncto(str::String, w::Int) + if length(str) <= w + return str + else + return string(str[1:nextind(str, 1, w-4)], "...") + end +end + + +truncto(str::Symbol, w::Int) = truncto(string(str), w) + +# Order alphabetically (file, function) and then by line number +function liperm(lilist::Vector{StackFrame}) + function lt(a::StackFrame, b::StackFrame) + a == UNKNOWN && return false + b == UNKNOWN && return true + fcmp = cmp(a.file, b.file) + fcmp < 0 && return true + fcmp > 0 && return false + fcmp = cmp(a.func, b.func) + fcmp < 0 && return true + fcmp > 0 && return false + fcmp = cmp(a.line, b.line) + fcmp < 0 && return true + return false + end + return sortperm(lilist, lt = lt) +end + +warning_empty() = @warn """ + There were no samples collected. 
Run your program longer (perhaps by + running it multiple times), or adjust the delay between samples with + `Profile.init()`.""" diff --git a/stdlib/Profile/src/time.jl b/stdlib/Profile/src/time.jl new file mode 100644 index 00000000000000..76479df2e23575 --- /dev/null +++ b/stdlib/Profile/src/time.jl @@ -0,0 +1,207 @@ +module Time + +using ..Profile: LineInfoDict, LineInfoFlatDict, ProfileFormat +using ..Profile: getdict, flatten, tree, flat +import ..Profile: callers + +export @profile + +""" + @profile + +`@profile ` runs your expression while taking periodic backtraces. These are +appended to an internal buffer of backtraces. +""" +macro profile(ex) + return quote + try + status = start_timer() + if status < 0 + error(error_codes[status]) + end + $(esc(ex)) + finally + stop_timer() + end + end +end + +#### +#### User-level functions +#### + +""" + init(; n::Integer, delay::Real)) + +Configure the `delay` between backtraces (measured in seconds), and the number `n` of +instruction pointers that may be stored. Each instruction pointer corresponds to a single +line of code; backtraces generally consist of a long list of instruction pointers. Default +settings can be obtained by calling this function with no arguments, and each can be set +independently using keywords or in the order `(n, delay)`. +""" +function init(; n::Union{Nothing,Integer} = nothing, delay::Union{Nothing,Real} = nothing) + n_cur = ccall(:jl_profile_maxlen_data, Csize_t, ()) + delay_cur = ccall(:jl_profile_delay_nsec, UInt64, ())/10^9 + if n === nothing && delay === nothing + return Int(n_cur), delay_cur + end + nnew = (n === nothing) ? n_cur : n + delaynew = (delay === nothing) ? 
delay_cur : delay + init(nnew, delaynew) +end + +function init(n::Integer, delay::Real) + status = ccall(:jl_profile_init, Cint, (Csize_t, UInt64), n, round(UInt64,10^9*delay)) + if status == -1 + error("could not allocate space for ", n, " instruction pointers") + end +end + +""" + clear() + +Clear any existing backtraces from the internal buffer. +""" +clear() = ccall(:jl_profile_clear_data, Cvoid, ()) + +""" + print([io::IO = stdout,] [data::Vector]; kwargs...) + +Prints profiling results to `io` (by default, `stdout`). If you do not +supply a `data` vector, the internal buffer of accumulated backtraces +will be used. + +The keyword arguments can be any combination of: + + - `format` -- Determines whether backtraces are printed with (default, `:tree`) or without (`:flat`) + indentation indicating tree structure. + + - `C` -- If `true`, backtraces from C and Fortran code are shown (normally they are excluded). + + - `combine` -- If `true` (default), instruction pointers are merged that correspond to the same line of code. + + - `maxdepth` -- Limits the depth higher than `maxdepth` in the `:tree` format. + + - `sortedby` -- Controls the order in `:flat` format. `:filefuncline` (default) sorts by the source + line, `:count` sorts in order of number of collected samples, and `:overhead` sorts by the number of samples + incurred by each function by itself. + + - `noisefloor` -- Limits frames that exceed the heuristic noise floor of the sample (only applies to format `:tree`). + A suggested value to try for this is 2.0 (the default is 0). This parameter hides samples for which `n <= noisefloor * √N`, + where `n` is the number of samples on this line, and `N` is the number of samples for the callee. + + - `mincount` -- Limits the printout to only those lines with at least `mincount` occurrences. + + - `recur` -- Controls the recursion handling in `:tree` format. `:off` (default) prints the tree as normal. 
`:flat` instead + compresses any recursion (by ip), showing the approximate effect of converting any self-recursion into an iterator. + `:flatc` does the same but also includes collapsing of C frames (may do odd things around `jl_apply`). +""" +function print(io::IO, + data::Vector = fetch(), + lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data) + ; + format = :tree, + C = false, + combine = true, + maxdepth::Int = typemax(Int), + mincount::Int = 0, + noisefloor = 0, + sortedby::Symbol = :filefuncline, + recur::Symbol = :off) + print(io, data, lidict, ProfileFormat( + C = C, + combine = combine, + maxdepth = maxdepth, + mincount = mincount, + noisefloor = noisefloor, + sortedby = sortedby, + recur = recur), + format) +end + +function print(io::IO, data::Vector, lidict::Union{LineInfoDict, LineInfoFlatDict}, fmt::ProfileFormat, format::Symbol) + cols::Int = Base.displaysize(io)[2] + data = convert(Vector{UInt64}, data) + fmt.recur ∈ (:off, :flat, :flatc) || throw(ArgumentError("recur value not recognized")) + if format === :tree + tree(io, data, lidict, cols, fmt) + elseif format === :flat + fmt.recur === :off || throw(ArgumentError("format flat only implements recur=:off")) + flat(io, data, lidict, cols, fmt) + else + throw(ArgumentError("output format $(repr(format)) not recognized")) + end +end + +""" + print([io::IO = stdout,] data::Vector, lidict::LineInfoDict; kwargs...) + +Prints profiling results to `io`. This variant is used to examine results exported by a +previous call to [`retrieve`](@ref). Supply the vector `data` of backtraces and +a dictionary `lidict` of line information. + +See `Profile.print([io], data)` for an explanation of the valid keyword arguments. +""" +print(data::Vector = fetch(), lidict::Union{LineInfoDict, LineInfoFlatDict} = getdict(data); kwargs...) = + print(stdout, data, lidict; kwargs...) 
+ +""" + retrieve() -> data, lidict + +"Exports" profiling results in a portable format, returning the set of all backtraces +(`data`) and a dictionary that maps the (session-specific) instruction pointers in `data` to +`LineInfo` values that store the file name, function name, and line number. This function +allows you to save profiling results for future analysis. +""" +function retrieve() + data = fetch() + return (data, getdict(data)) +end + + +# C wrappers +start_timer() = ccall(:jl_profile_start_timer, Cint, ()) + +stop_timer() = ccall(:jl_profile_stop_timer, Cvoid, ()) + +is_running() = ccall(:jl_profile_is_running, Cint, ())!=0 + + did_overflow() = ccall(:jl_profile_overflow, Cint, ()) != 0 + +get_data_pointer() = convert(Ptr{UInt}, ccall(:jl_profile_get_data, Ptr{UInt8}, ())) + +len_data() = convert(Int, ccall(:jl_profile_len_data, Csize_t, ())) + +maxlen_data() = convert(Int, ccall(:jl_profile_maxlen_data, Csize_t, ())) + +error_codes = Dict( + -1=>"cannot specify signal action for profiling", + -2=>"cannot create the timer for profiling", + -3=>"cannot start the timer for profiling", + -4=>"cannot unblock SIGUSR1") + + +""" + fetch() -> data + +Returns a copy of the buffer of profile backtraces. Note that the +values in `data` have meaning only on this machine in the current session, because it +depends on the exact memory addresses used in JIT-compiling. This function is primarily for +internal use; [`retrieve`](@ref) may be a better choice for most users. +""" +function fetch() + len = len_data() + if did_overflow() + @warn """The time profile data buffer overflowed; profiling terminated + before your program finished. To profile for longer runs, call + `Profile.init()` with a larger buffer and/or larger delay.""" + end + data = Vector{UInt}(undef, len) + GC.@preserve data unsafe_copyto!(pointer(data), get_data_pointer(), len) + return data +end + +callers(funcname::String; kwargs...) = callers(funcname, retrieve()...; kwargs...) 
+callers(func::Function; kwargs...) = callers(string(func), retrieve()...; kwargs...) + +end # module diff --git a/stdlib/Profile/test/memory.jl b/stdlib/Profile/test/memory.jl new file mode 100644 index 00000000000000..f8be2a127c5092 --- /dev/null +++ b/stdlib/Profile/test/memory.jl @@ -0,0 +1,49 @@ +using Profile + +import Profile.Memory: clear_memprofile_data, read_and_coalesce_memprofile_data, get_memprofile_bt_data, get_memprofile_bt_data_len, get_memprofile_alloc_data, get_memprofile_alloc_data_len, AllocationInfo, get_memprofile_overflow, getdict, close_AI, open_AI, start_memprofile, stop_memprofile, closing_tag + +forever_chunks = [] +function foo() + global forever_chunks + + # Create a chunk that will live forever + push!(forever_chunks, Array{UInt8,2}(undef,1000,1000)) + + # Create a chunk that......will not. + Array{UInt8,2}(undef,1000,100) + + # Create lots of little objects + tups = Any[(1,), (2,3)] + for idx in 1:20 + addition = (tups[end-1]..., tups[end]...) + addition = addition[2:end] + push!(tups, addition) + end + # Keep a few of them around + push!(forever_chunks, tups[1]) + push!(forever_chunks, tups[end]) + return nothing +end + +function test(should_profile::Bool = false) + # We use the trick of setting the tag filter to 0x0000 to disable + # memory profiling so that we don't ever have to recompile. + Profile.Memory.init(50_000_000, 1_000_000, 0xffff * should_profile) + global forever_chunks = [] + @memprofile foo() +end + +@info("Precompiling test()") +test(false) + +@info("Running test()") +test(true) + +@info("Reading memprofile data...") +open_chunks, closed_chunks, ghost_chunks = read_and_coalesce_memprofile_data() +println("open_chunks:") +display(open_chunks) + +# This often crashes us, if we've held on to a bad object address. +# Explicitly run it to make sure everything is on the up-and-up. 
+Base.GC.gc() diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl index b26f1f70e5d465..c25af5b0eda2bb 100644 --- a/stdlib/Profile/test/runtests.jl +++ b/stdlib/Profile/test/runtests.jl @@ -2,94 +2,11 @@ using Test, Profile, Serialization -Profile.clear() -Profile.init() - -let iobuf = IOBuffer() - for fmt in (:tree, :flat) - Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.print(iobuf, format=fmt, C=true) - Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.print(iobuf, [0x0000000000000001], Dict(0x0000000000000001 => [Base.StackTraces.UNKNOWN]), format=fmt, C=false) - end -end - -@noinline function busywait(t, n_tries) - iter = 0 - init_data = Profile.len_data() - while iter < n_tries && Profile.len_data() == init_data - iter += 1 - tend = time() + t - while time() < tend end - end +@testset "time" begin + include("time.jl") end -busywait(0, 0) # compile -@profile busywait(1, 20) - -let r = Profile.retrieve() - mktemp() do path, io - serialize(io, r) - close(io) - open(path) do io - @test isa(deserialize(io), Tuple{Vector{UInt},Dict{UInt64,Vector{Base.StackTraces.StackFrame}}}) - end - end +@testset "memory" begin + include("memory.jl") end -let iobuf = IOBuffer() - Profile.print(iobuf, format=:tree, C=true) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) - Profile.print(iobuf, format=:tree, maxdepth=2) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) - Profile.print(iobuf, format=:flat, C=true) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) - Profile.print(iobuf) - @test !isempty(String(take!(iobuf))) - truncate(iobuf, 0) - Profile.print(iobuf, format=:flat, sortedby=:count) - @test !isempty(String(take!(iobuf))) - Profile.print(iobuf, format=:tree, recur=:flat) - str = String(take!(iobuf)) - @test !isempty(str) - truncate(iobuf, 0) -end - -Profile.clear() -@test isempty(Profile.fetch()) - -let - @test Profile.callers("\\") 
!== nothing - @test Profile.callers(\) !== nothing - # linerange with no filename provided should fail - @test_throws ArgumentError Profile.callers(\; linerange=10:50) -end - -# issue #13229 -module I13229 -using Test, Profile -global z = 0 -@timed @profile for i = 1:5 - function f(x) - return x + i - end - global z = f(i) -end -@test z == 10 -end - -@testset "setting sample count and delay in init" begin - n_, delay_ = Profile.init() - @test n_ == 1_000_000 - def_delay = Sys.iswindows() ? 0.01 : 0.001 - @test delay_ == def_delay - Profile.init(n=1_000_001, delay=0.0005) - n_, delay_ = Profile.init() - @test n_ == 1_000_001 - @test delay_ == 0.0005 - Profile.init(n=1_000_000, delay=def_delay) -end diff --git a/stdlib/Profile/test/time.jl b/stdlib/Profile/test/time.jl new file mode 100644 index 00000000000000..2d774bb2de8bbf --- /dev/null +++ b/stdlib/Profile/test/time.jl @@ -0,0 +1,90 @@ +Profile.Time.init() + +let iobuf = IOBuffer() + for fmt in (:tree, :flat) + Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.Time.print(iobuf, format=fmt, C=true) + Test.@test_logs (:warn, r"^There were no samples collected\.") Profile.Time.print(iobuf, [0x0000000000000001], Profile.LineInfoDict(0x0000000000000001 => [Base.StackTraces.UNKNOWN]), format=fmt, C=false) + end +end + +@noinline function busywait(t, n_tries) + iter = 0 + init_data = Profile.Time.len_data() + while iter < n_tries && Profile.Time.len_data() == init_data + iter += 1 + tend = time() + t + while time() < tend end + end +end + +busywait(0, 0) # compile +@profile busywait(1, 20) + +let r = Profile.Time.retrieve() + mktemp() do path, io + serialize(io, r) + close(io) + open(path) do io + @test isa(deserialize(io), Tuple{Vector{UInt},Profile.LineInfoDict}) + end + end +end + +let iobuf = IOBuffer() + Profile.Time.print(iobuf, format=:tree, C=true) + str = String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) + Profile.Time.print(iobuf, format=:tree, maxdepth=2) + str = 
String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) + Profile.Time.print(iobuf, format=:flat, C=true) + str = String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) + Profile.Time.print(iobuf) + @test !isempty(String(take!(iobuf))) + truncate(iobuf, 0) + Profile.Time.print(iobuf, format=:flat, sortedby=:count) + @test !isempty(String(take!(iobuf))) + Profile.Time.print(iobuf, format=:tree, recur=:flat) + str = String(take!(iobuf)) + @test !isempty(str) + truncate(iobuf, 0) +end + +Profile.Time.clear() +@test isempty(Profile.Time.fetch()) + +let + @test Profile.Time.callers("\\") !== nothing + @test Profile.Time.callers(\) !== nothing + # linerange with no filename provided should fail + @test_throws ArgumentError Profile.Time.callers(\; linerange=10:50) +end + +# issue #13229 +module I13229 +using Test, Profile +global z = 0 +@timed @profile for i = 1:5 + function f(x) + return x + i + end + global z = f(i) +end +@test z == 10 +end + +@testset "setting sample count and delay in init" begin + n_, delay_ = Profile.Time.init() + @test n_ == 1_000_000 + def_delay = Sys.iswindows() ? 0.01 : 0.001 + @test delay_ == def_delay + Profile.Time.init(n=1_000_001, delay=0.0005) + n_, delay_ = Profile.Time.init() + @test n_ == 1_000_001 + @test delay_ == 0.0005 + Profile.Time.init(n=1_000_000, delay=def_delay) +end