diff --git a/src/gc-alloc-profiler.h b/src/gc-alloc-profiler.h
index 3cfe7237dd183..8be6fed21a899 100644
--- a/src/gc-alloc-profiler.h
+++ b/src/gc-alloc-profiler.h
@@ -35,7 +35,7 @@ void _maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t
 
 extern int g_alloc_profile_enabled;
 
-#define jl_gc_unknown_type_tag ((uintptr_t)0xdeadaa03)
+#define jl_gc_unknown_type_tag ((jl_datatype_t*)0xdeadaa03)
 
 static inline void maybe_record_alloc_to_profile(jl_value_t *val, size_t size, jl_datatype_t *typ) JL_NOTSAFEPOINT {
     if (__unlikely(g_alloc_profile_enabled)) {
diff --git a/stdlib/Profile/src/Allocs.jl b/stdlib/Profile/src/Allocs.jl
index 70a9978b456ca..15a7afbff76fe 100644
--- a/stdlib/Profile/src/Allocs.jl
+++ b/stdlib/Profile/src/Allocs.jl
@@ -44,6 +44,18 @@ julia> results = Profile.Allocs.fetch()
 julia> last(sort(results.allocs, by=x->x.size))
 Profile.Allocs.Alloc(Vector{Any}, Base.StackTraces.StackFrame[_new_array_ at array.c:127, ...], 5576)
 ```
+
+!!! note
+    The current implementation of the Allocations Profiler does not
+    capture types for all allocations. Allocations for which the profiler
+    could not capture the type are represented as having type
+    `Profile.Allocs.UnknownType`.
+
+    You can read more about the missing types and the plan to improve this, here:
+    https://github.com/JuliaLang/julia/issues/43688.
+
+!!! compat "Julia 1.8"
+    The allocation profiler was added in Julia 1.8.
 """
 macro profile(opts, ex)
     _prof_expr(ex, opts)
@@ -52,18 +64,14 @@ macro profile(ex)
     _prof_expr(ex, :(sample_rate=0.0001))
 end
 
-# globals used for tracking how many allocs we're missing
-# vs the alloc counters used by @time
-const _g_gc_num_before = Ref{Base.GC_Num}()
-const _g_sample_rate = Ref{Real}()
-const _g_expected_sampled_allocs = Ref{Float64}(0)
-
 function _prof_expr(expr, opts)
     quote
         $start(; $(esc(opts)))
-        local res = $(esc(expr))
-        $stop()
-        res
+        try
+            $(esc(expr))
+        finally
+            $stop()
+        end
     end
 end
 
@@ -75,9 +83,6 @@ A sample rate of 1.0 will record everything; 0.0 will record nothing.
 """
 function start(; sample_rate::Real)
     ccall(:jl_start_alloc_profile, Cvoid, (Cdouble,), Float64(sample_rate))
-
-    _g_sample_rate[] = sample_rate
-    _g_gc_num_before[] = Base.gc_num()
 end
 
 """
@@ -87,15 +92,6 @@ Stop recording allocations.
 """
 function stop()
     ccall(:jl_stop_alloc_profile, Cvoid, ())
-
-    # increment a counter of how many allocs we would expect
-    # the memory profiler to see, based on how many allocs
-    # actually happened.
-    gc_num_after = Base.gc_num()
-    gc_diff = Base.GC_Diff(gc_num_after, _g_gc_num_before[])
-    alloc_count = Base.gc_alloc_count(gc_diff)
-    expected_samples = alloc_count * _g_sample_rate[]
-    _g_expected_sampled_allocs[] += expected_samples
 end
 
 """
@@ -105,8 +101,6 @@ Clear all previously profiled allocation information from memory.
 """
 function clear()
     ccall(:jl_free_alloc_profile, Cvoid, ())
-
-    _g_expected_sampled_allocs[] = 0
     return nothing
 end
 
@@ -118,25 +112,7 @@ objects which can be analyzed.
 """
 function fetch()
     raw_results = ccall(:jl_fetch_alloc_profile, RawResults, ())
-    decoded_results = decode(raw_results)
-
-    # avoid divide-by-0 errors
-    if _g_expected_sampled_allocs[] > 0
-        missed_allocs = max(0, _g_expected_sampled_allocs[] - length(decoded_results.allocs))
-        missed_percentage = max(0, round(Int, missed_allocs / _g_expected_sampled_allocs[] * 100))
-        if missed_percentage > 0
-            @warn("The allocation profiler is not fully implemented, and missed approximately" *
-            " $(missed_percentage)% (estimated $(round(Int, missed_allocs)) / $(round(Int,
-            _g_expected_sampled_allocs[]))) " *
-                    "of sampled allocs in the last run. " *
-                    "For more info see https://github.com/JuliaLang/julia/issues/43688")
-        else
-            @warn("The allocation profiler is not fully implemented, and may have missed" *
-            " some of the allocs. " *
-                    "For more info see https://github.com/JuliaLang/julia/issues/43688")
-        end
-    end
-    return decoded_results
+    return decode(raw_results)
 end
 
 # decoded results
diff --git a/stdlib/Profile/test/allocs.jl b/stdlib/Profile/test/allocs.jl
new file mode 100644
index 0000000000000..b8d6222d07567
--- /dev/null
+++ b/stdlib/Profile/test/allocs.jl
@@ -0,0 +1,122 @@
+using Test
+using Profile: Allocs
+
+@testset "alloc profiler doesn't segfault" begin
+    res = Allocs.@profile sample_rate=1.0 begin
+        # test the allocations during compilation
+        using Base64
+    end
+    profile = Allocs.fetch()
+
+    @test length(profile.allocs) > 0
+    first_alloc = profile.allocs[1]
+    @test first_alloc.size > 0
+    @test length(first_alloc.stacktrace) > 0
+    @test length(string(first_alloc.type)) > 0
+end
+
+@testset "alloc profiler works when there are multiple tasks on multiple threads" begin
+    NUM_TASKS = 1000
+
+    # This test is only really meaningful if we're running on
+    # multiple threads, but this isn't true on the windows tests,
+    # causing them to fail. So, commenting this assertion out.
+    # @test Threads.nthreads() > 1
+
+    function do_work()
+        ch = Channel{Vector{Float64}}(Inf)
+        @sync for i in 1:NUM_TASKS
+            Threads.@spawn begin
+                # generate garbage
+                put!(ch, zeros(100))
+            end
+        end
+        close(ch)
+    end
+
+    # call once to make sure it's compiled
+    precompile(do_work, ())
+    do_work()
+
+    res = Allocs.@profile sample_rate=1 begin
+        do_work()
+    end
+    profile = Allocs.fetch()
+
+    # expecting at least 2 allocations per task:
+    # 1. the task
+    # 2. the vector
+    @test length(profile.allocs) >= 2*NUM_TASKS
+    first_alloc = profile.allocs[1]
+    @test first_alloc.size > 0
+    @test length(first_alloc.stacktrace) > 0
+    @test length(string(first_alloc.type)) > 0
+
+    @testset for type in (Task, Vector{Float64},)
+        @test length(filter(a->a.type <: type, profile.allocs)) >= NUM_TASKS
+    end
+
+    # TODO: it would be nice to assert that these tasks
+    # were actually scheduled onto multiple threads,
+    # and we see allocs from all threads in the profile
+end
+
+@testset "alloc profiler start stop fetch clear" begin
+    function do_work()
+        # Compiling allocates a lot
+        for f in (gensym() for _ in 1:10)
+            @eval begin
+                $f() = 10
+                $f()
+            end
+        end
+    end
+
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    Allocs.@profile sample_rate=1 do_work()
+    curr_allocs = length(Allocs.fetch().allocs)
+    @test curr_allocs > 10
+
+    # Do _more_ work, adding into the same profile
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > curr_allocs
+
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    # Clear without fetching
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.clear()
+    @test length(Allocs.fetch().allocs) == 0
+
+    # And things still work like normal afterwards
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.@profile sample_rate=1 do_work()
+    Allocs.@profile sample_rate=1 do_work()
+    @test length(Allocs.fetch().allocs) > 10
+
+    Allocs.clear()
+end
+
+@testset "alloc profiler catches strings" begin
+    Allocs.@profile sample_rate=1 "$(rand())"
+
+    prof = Allocs.fetch()
+    Allocs.clear()
+
+    @test length(prof.allocs) >= 1
+    @test length([a for a in prof.allocs if a.type == String]) >= 1
+end