diff --git a/.gitattributes b/.gitattributes index 18b5a6ee4f9..0f0445cc7f9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -193,7 +193,7 @@ tools/ocaml-objcopy-macosx text eol=lf tools/ocamlsize text eol=lf tools/pre-commit-githook text eol=lf tools/markdown-add-pr-links.sh text eol=lf -runtime/caml/sizeclasses.h typo.missing-header typo.white-at-eol +runtime/caml/sizeclasses.h typo.missing-header # Tests which include references spanning multiple lines fail with \r\n # endings, so use \n endings only, even on Windows. diff --git a/otherlibs/runtime_events/runtime_events.ml b/otherlibs/runtime_events/runtime_events.ml index b07b0c206ff..cdbd6ac4a79 100644 --- a/otherlibs/runtime_events/runtime_events.ml +++ b/otherlibs/runtime_events/runtime_events.ml @@ -72,6 +72,10 @@ type runtime_phase = | EV_MINOR_LOCAL_ROOTS_PROMOTE | EV_DOMAIN_CONDITION_WAIT | EV_DOMAIN_RESIZE_HEAP_RESERVATION +| EV_COMPACT +| EV_COMPACT_EVACUATE +| EV_COMPACT_FORWARD +| EV_COMPACT_RELEASE type lifecycle = EV_RING_START @@ -153,6 +157,10 @@ let runtime_phase_name phase = | EV_DOMAIN_CONDITION_WAIT -> "domain_condition_wait" | EV_MAJOR_FINISH_CYCLE -> "major_finish_cycle" | EV_DOMAIN_RESIZE_HEAP_RESERVATION -> "domain_resize_heap_reservation" + | EV_COMPACT -> "compaction" + | EV_COMPACT_EVACUATE -> "compaction_evacuate" + | EV_COMPACT_FORWARD -> "compaction_forward" + | EV_COMPACT_RELEASE -> "compaction_release" let lifecycle_name lifecycle = match lifecycle with diff --git a/otherlibs/runtime_events/runtime_events.mli b/otherlibs/runtime_events/runtime_events.mli index dc13257907e..2e2c67a30b9 100644 --- a/otherlibs/runtime_events/runtime_events.mli +++ b/otherlibs/runtime_events/runtime_events.mli @@ -130,6 +130,10 @@ type runtime_phase = | EV_MINOR_LOCAL_ROOTS_PROMOTE | EV_DOMAIN_CONDITION_WAIT | EV_DOMAIN_RESIZE_HEAP_RESERVATION +| EV_COMPACT +| EV_COMPACT_EVACUATE +| EV_COMPACT_FORWARD +| EV_COMPACT_RELEASE (** Lifecycle events for the ring itself *) type lifecycle = diff --git a/runtime/caml/compact.h b/runtime/caml/compact.h deleted file mode 100644 index c2022c9020c..00000000000 --- a/runtime/caml/compact.h +++ /dev/null @@ -1,36 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Damien Doligez, projet Para, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. */ -/* */ -/**************************************************************************/ - -#ifndef CAML_COMPACT_H -#define CAML_COMPACT_H - -#ifdef CAML_INTERNALS - -#include "config.h" -#include "misc.h" -#include "mlvalues.h" - -/* [caml_compact_heap] compacts the heap and optionally changes the - allocation policy. - if [new_allocation_policy] is -1, the policy is not changed. 
-*/ -void caml_compact_heap (intnat new_allocation_policy); - -void caml_compact_heap_maybe (double previous_overhead); -void caml_invert_root (value v, value *p); - -#endif /* CAML_INTERNALS */ - -#endif /* CAML_COMPACT_H */ diff --git a/runtime/caml/dune b/runtime/caml/dune index 03dc3fe46d9..dbd7ff58700 100644 --- a/runtime/caml/dune +++ b/runtime/caml/dune @@ -40,7 +40,6 @@ (callback.h as caml/callback.h) (camlatomic.h as caml/camlatomic.h) (codefrag.h as caml/codefrag.h) - (compact.h as caml/compact.h) (compare.h as caml/compare.h) (config.h as caml/config.h) (custom.h as caml/custom.h) diff --git a/runtime/caml/major_gc.h b/runtime/caml/major_gc.h index dbcd42126b8..9defb6de3d1 100644 --- a/runtime/caml/major_gc.h +++ b/runtime/caml/major_gc.h @@ -53,8 +53,10 @@ void caml_darken(void*, value, volatile value* ignored); void caml_darken_cont(value); void caml_mark_root(value, value*); void caml_empty_mark_stack(void); -void caml_finish_major_cycle(void); - +void caml_finish_major_cycle(int force_compaction); +#ifdef DEBUG +int caml_mark_stack_is_empty(void); +#endif /* Ephemerons and finalisers */ void caml_orphan_allocated_words(void); void caml_add_to_orphaned_ephe_list(struct caml_ephe_info* ephe_info); diff --git a/runtime/caml/osdeps.h b/runtime/caml/osdeps.h index be6d192d5ef..b916a62f149 100644 --- a/runtime/caml/osdeps.h +++ b/runtime/caml/osdeps.h @@ -108,7 +108,7 @@ extern void caml_print_timestamp(FILE* channel, int formatted); /* Memory management platform-specific operations */ -void *caml_plat_mem_map(uintnat, uintnat, int); +void *caml_plat_mem_map(uintnat, int); void *caml_plat_mem_commit(void *, uintnat); void caml_plat_mem_decommit(void *, uintnat); void caml_plat_mem_unmap(void *, uintnat); diff --git a/runtime/caml/platform.h b/runtime/caml/platform.h index 373419e3c9e..b21d2bcc611 100644 --- a/runtime/caml/platform.h +++ b/runtime/caml/platform.h @@ -122,13 +122,8 @@ uintnat caml_mem_round_up_pages(uintnat size); /* The size given to caml_mem_map and caml_mem_commit must be a multiple of caml_plat_pagesize. The size given to caml_mem_unmap and caml_mem_decommit must match the size given to caml_mem_map/caml_mem_commit for mem. - - The Windows and Cygwin implementations do not support arbitrary alignment - and will fail for alignment values greater than caml_plat_mmap_alignment. - Luckily, this value is rather large on those platforms: 64KiB. This is enough - for all alignments used in the runtime system so far, the larger being the - major heap pools aligned on 32KiB boundaries. 
*/ -void* caml_mem_map(uintnat size, uintnat alignment, int reserve_only); +*/ +void* caml_mem_map(uintnat size, int reserve_only); void* caml_mem_commit(void* mem, uintnat size); void caml_mem_decommit(void* mem, uintnat size); void caml_mem_unmap(void* mem, uintnat size); diff --git a/runtime/caml/runtime_events.h b/runtime/caml/runtime_events.h index 0c591f57d6c..b6238d9ea2f 100644 --- a/runtime/caml/runtime_events.h +++ b/runtime/caml/runtime_events.h @@ -110,7 +110,11 @@ typedef enum { EV_MINOR_REMEMBERED_SET_PROMOTE, EV_MINOR_LOCAL_ROOTS_PROMOTE, EV_DOMAIN_CONDITION_WAIT, - EV_DOMAIN_RESIZE_HEAP_RESERVATION + EV_DOMAIN_RESIZE_HEAP_RESERVATION, + EV_COMPACT, + EV_COMPACT_EVACUATE, + EV_COMPACT_FORWARD, + EV_COMPACT_RELEASE } ev_runtime_phase; typedef enum { diff --git a/runtime/caml/shared_heap.h b/runtime/caml/shared_heap.h index 768415c586f..a2fb1a81498 100644 --- a/runtime/caml/shared_heap.h +++ b/runtime/caml/shared_heap.h @@ -24,6 +24,8 @@ #include "misc.h" #include "gc_stats.h" +CAMLextern atomic_uintnat caml_compactions_count; + struct caml_heap_state; struct pool; @@ -31,7 +33,7 @@ struct caml_heap_state* caml_init_shared_heap(void); void caml_teardown_shared_heap(struct caml_heap_state* heap); value* caml_shared_try_alloc(struct caml_heap_state*, - mlsize_t, tag_t, reserved_t, int); + mlsize_t, tag_t, reserved_t); /* Copy the domain-local heap stats into a heap stats sample. */ void caml_collect_heap_stats_sample( @@ -45,7 +47,9 @@ uintnat caml_heap_size(struct caml_heap_state*); uintnat caml_top_heap_words(struct caml_heap_state*); uintnat caml_heap_blocks(struct caml_heap_state*); -struct pool* caml_pool_of_shared_block(value v); +void caml_compact_heap(caml_domain_state* domain_state, + int participating_count, + caml_domain_state** participants); void caml_shared_unpin(value v); diff --git a/runtime/caml/sizeclasses.h b/runtime/caml/sizeclasses.h index 83d9ccf2f55..012d1d649ca 100644 --- a/runtime/caml/sizeclasses.h +++ b/runtime/caml/sizeclasses.h @@ -3,13 +3,13 @@ #define POOL_HEADER_WSIZE 4 #define SIZECLASS_MAX 128 #define NUM_SIZECLASSES 32 -static const unsigned int wsize_sizeclass[NUM_SIZECLASSES] = +static const unsigned int wsize_sizeclass[NUM_SIZECLASSES] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 23, 26, 29, 33, 37, 42, 47, 53, 59, 65, 73, 81, 89, 99, 108, 118, 128 }; -static const unsigned char wastage_sizeclass[NUM_SIZECLASSES] = +static const unsigned char wastage_sizeclass[NUM_SIZECLASSES] = { 0, 0, 0, 0, 2, 0, 4, 4, 6, 2, 0, 4, 12, 6, 12, 21, 10, 3, 0, 22, 18, 3, 11, 21, 62, 4, 42, 87, 33, 96, 80, 124 }; -static const unsigned char sizeclass_wsize[SIZECLASS_MAX + 1] = +static const unsigned char sizeclass_wsize[SIZECLASS_MAX + 1] = { 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, diff --git a/runtime/domain.c b/runtime/domain.c index dcee16d0ce3..1b2f136b2d3 100644 --- a/runtime/domain.c +++ b/runtime/domain.c @@ -735,8 +735,7 @@ static void reserve_minor_heaps(void) { minor_heap_reservation_bsize = minor_heap_max_bsz * Max_domains; /* reserve memory space for minor heaps */ - heaps_base = caml_mem_map(minor_heap_reservation_bsize, caml_plat_pagesize, - 1 /* reserve_only */); + heaps_base = caml_mem_map(minor_heap_reservation_bsize, 1 /* reserve_only */); if (heaps_base == NULL) caml_fatal_error("Not enough heap memory to reserve minor heaps"); @@ -1747,7 +1746,7 @@ 
static void handover_finalisers(caml_domain_state* domain_state) if (caml_gc_phase != Phase_sweep_and_mark_main) { /* Force a major GC cycle to simplify constraints for * handing over finalisers. */ - caml_finish_major_cycle(); + caml_finish_major_cycle(0); CAMLassert(caml_gc_phase == Phase_sweep_and_mark_main); } caml_add_orphaned_finalisers (f); diff --git a/runtime/gc_ctrl.c b/runtime/gc_ctrl.c index e74e038b0c1..acbee38dd7d 100644 --- a/runtime/gc_ctrl.c +++ b/runtime/gc_ctrl.c @@ -56,11 +56,12 @@ CAMLprim value caml_gc_quick_stat(value v) CAMLlocal1 (res); /* get a copy of these before allocating anything... */ - intnat majcoll, mincoll; + intnat majcoll, mincoll, compactions; struct gc_stats s; caml_compute_gc_stats(&s); majcoll = caml_major_cycles_completed; mincoll = atomic_load(&caml_minor_collections_count); + compactions = atomic_load(&caml_compactions_count); res = caml_alloc_tuple (17); Store_field (res, 0, caml_copy_double ((double)s.alloc_stats.minor_words)); @@ -81,7 +82,7 @@ CAMLprim value caml_gc_quick_stat(value v) Store_field (res, 10, Val_long (0)); Store_field (res, 11, Val_long (0)); Store_field (res, 12, Val_long (s.heap_stats.pool_frag_words)); - Store_field (res, 13, Val_long (0)); + Store_field (res, 13, Val_long (compactions)); Store_field (res, 14, Val_long ( s.heap_stats.pool_max_words + s.heap_stats.large_max_words)); Store_field (res, 15, Val_long (0)); @@ -239,12 +240,12 @@ CAMLprim value caml_gc_minor(value v) return caml_raise_if_exception(exn); } -static value gc_major_exn(void) +static value gc_major_exn(int force_compaction) { CAML_EV_BEGIN(EV_EXPLICIT_GC_MAJOR); caml_gc_log ("Major GC cycle requested"); caml_empty_minor_heaps_once(); - caml_finish_major_cycle(); + caml_finish_major_cycle(force_compaction); value exn = caml_process_pending_actions_exn(); CAML_EV_END(EV_EXPLICIT_GC_MAJOR); return exn; @@ -254,7 +255,7 @@ CAMLprim value caml_gc_major(value v) { Caml_check_caml_state(); CAMLassert (v == Val_unit); - return caml_raise_if_exception(gc_major_exn()); + return caml_raise_if_exception(gc_major_exn(0)); } static value gc_full_major_exn(void) @@ -267,7 +268,7 @@ static value gc_full_major_exn(void) currently-unreachable object to be collected. 
*/ for (i = 0; i < 3; i++) { caml_empty_minor_heaps_once(); - caml_finish_major_cycle(); + caml_finish_major_cycle(0); exn = caml_process_pending_actions_exn(); if (Is_exception_result(exn)) break; } @@ -296,13 +297,12 @@ CAMLprim value caml_gc_major_slice (value v) CAMLprim value caml_gc_compaction(value v) { Caml_check_caml_state(); - value exn = Val_unit; CAML_EV_BEGIN(EV_EXPLICIT_GC_COMPACT); CAMLassert (v == Val_unit); - exn = gc_major_exn(); + value exn = gc_major_exn(1); ++ Caml_state->stat_forced_major_collections; CAML_EV_END(EV_EXPLICIT_GC_COMPACT); - return exn; + return caml_raise_if_exception(exn); } CAMLprim value caml_gc_stat(value v) diff --git a/runtime/intern.c b/runtime/intern.c index 87d5d346e97..b7df4bcd54b 100644 --- a/runtime/intern.c +++ b/runtime/intern.c @@ -401,8 +401,7 @@ static value intern_alloc_obj(struct caml_intern_state* s, caml_domain_state* d, s->intern_dest += 1 + wosize; } else { p = caml_shared_try_alloc(d->shared_heap, wosize, tag, - 0, /* no reserved bits */ - 0 /* not pinned */); + 0 /* no reserved bits */); d->allocated_words += Whsize_wosize(wosize); if (p == NULL) { intern_cleanup (s); diff --git a/runtime/major_gc.c b/runtime/major_gc.c index 5c099105536..4b05f02299e 100644 --- a/runtime/major_gc.c +++ b/runtime/major_gc.c @@ -1189,11 +1189,18 @@ static intnat ephe_sweep (caml_domain_state* domain_state, intnat budget) return budget; } -static void cycle_all_domains_callback(caml_domain_state* domain, void* unused, - int participating_count, - caml_domain_state** participating) +struct cycle_callback_params { + int force_compaction; +}; + +static void stw_cycle_all_domains(caml_domain_state* domain, void* args, + int participating_count, + caml_domain_state** participating) { uintnat num_domains_in_stw; + /* We copy params because the stw leader may leave early. No barrier needed + because there's one in the minor gc and after. */ + struct cycle_callback_params params = *((struct cycle_callback_params*)args); CAML_EV_BEGIN(EV_MAJOR_GC_CYCLE_DOMAINS); @@ -1314,6 +1321,12 @@ static void cycle_all_domains_callback(caml_domain_state* domain, void* unused, caml_cycle_heap(domain->shared_heap); + /* Compact here if requested (or, in some future version, if the heap overhead + is too high). */ + if (params.force_compaction) { + caml_compact_heap(domain, participating_count, participating); + } + /* Collect domain-local stats to emit to runtime events */ struct heap_stats local_stats; caml_collect_heap_stats_sample(Caml_state->shared_heap, &local_stats); @@ -1477,7 +1490,8 @@ static char collection_slice_mode_char(collection_slice_mode mode) static void major_collection_slice(intnat howmuch, int participant_count, caml_domain_state** barrier_participants, - collection_slice_mode mode) + collection_slice_mode mode, + int force_compaction) { caml_domain_state* domain_state = Caml_state; intnat sweep_work = 0, mark_work = 0; @@ -1682,13 +1696,17 @@ static void major_collection_slice(intnat howmuch, cycle simultaneously, we loop until the current cycle has ended, ignoring whether caml_try_run_on_all_domains succeeds. 
*/ + struct cycle_callback_params params; + params.force_compaction = force_compaction; while (saved_major_cycle == caml_major_cycles_completed) { if (barrier_participants) { - cycle_all_domains_callback - (domain_state, (void*)0, participant_count, barrier_participants); + stw_cycle_all_domains + (domain_state, (void*)¶ms, + participant_count, barrier_participants); } else { - caml_try_run_on_all_domains(&cycle_all_domains_callback, 0, 0); + caml_try_run_on_all_domains + (&stw_cycle_all_domains, (void*)¶ms, 0); } } } @@ -1696,7 +1714,7 @@ static void major_collection_slice(intnat howmuch, void caml_opportunistic_major_collection_slice(intnat howmuch) { - major_collection_slice(howmuch, 0, 0, Slice_opportunistic); + major_collection_slice(howmuch, 0, 0, Slice_opportunistic, 0); } void caml_major_collection_slice(intnat howmuch) @@ -1709,7 +1727,8 @@ void caml_major_collection_slice(intnat howmuch) AUTO_TRIGGERED_MAJOR_SLICE, 0, 0, - Slice_interruptible + Slice_interruptible, + 0 ); if (caml_incoming_interrupts_queued()) { caml_gc_log("Major slice interrupted, rescheduling major slice"); @@ -1718,41 +1737,62 @@ void caml_major_collection_slice(intnat howmuch) } else { /* TODO: could make forced API slices interruptible, but would need to do accounting or pass up interrupt */ - major_collection_slice(howmuch, 0, 0, Slice_uninterruptible); + major_collection_slice(howmuch, 0, 0, Slice_uninterruptible, 0); } /* Record that this domain has completed a major slice for this minor cycle. */ Caml_state->major_slice_epoch = major_slice_epoch; } -static void finish_major_cycle_callback (caml_domain_state* domain, void* arg, - int participating_count, - caml_domain_state** participating) +struct finish_major_cycle_params { + uintnat saved_major_cycles; + int force_compaction; +}; + +static void stw_finish_major_cycle (caml_domain_state* domain, void* arg, + int participating_count, + caml_domain_state** participating) { - uintnat saved_major_cycles = (uintnat)arg; + /* We must copy params because the leader may exit this + before other domains do. There is at least one barrier somewhere + in the major cycle ending, so we don't need one immediately + after this. 
*/ + struct finish_major_cycle_params params = + *((struct finish_major_cycle_params*)arg); + CAMLassert (domain == Caml_state); caml_empty_minor_heap_no_major_slice_from_stw (domain, (void*)0, participating_count, participating); CAML_EV_BEGIN(EV_MAJOR_FINISH_CYCLE); - while (saved_major_cycles == caml_major_cycles_completed) { + while (params.saved_major_cycles == caml_major_cycles_completed) { major_collection_slice(10000000, participating_count, participating, - Slice_uninterruptible); + Slice_uninterruptible, params.force_compaction); } CAML_EV_END(EV_MAJOR_FINISH_CYCLE); } -void caml_finish_major_cycle (void) +void caml_finish_major_cycle (int force_compaction) { uintnat saved_major_cycles = caml_major_cycles_completed; while( saved_major_cycles == caml_major_cycles_completed ) { - caml_try_run_on_all_domains - (&finish_major_cycle_callback, (void*)caml_major_cycles_completed, 0); + struct finish_major_cycle_params params; + params.force_compaction = force_compaction; + params.saved_major_cycles = caml_major_cycles_completed; + + caml_try_run_on_all_domains(&stw_finish_major_cycle, (void*)¶ms, 0); } } +#ifdef DEBUG +int caml_mark_stack_is_empty(void) +{ + return Caml_state->mark_stack->count == 0; +} +#endif + void caml_empty_mark_stack (void) { while (!Caml_state->marking_done){ diff --git a/runtime/memory.c b/runtime/memory.c index a057ab46d6d..0cf3f866363 100644 --- a/runtime/memory.c +++ b/runtime/memory.c @@ -339,7 +339,7 @@ Caml_inline value alloc_shr(mlsize_t wosize, tag_t tag, reserved_t reserved, Caml_check_caml_state(); caml_domain_state *dom_st = Caml_state; value *v = caml_shared_try_alloc(dom_st->shared_heap, - wosize, tag, reserved, 0); + wosize, tag, reserved); if (v == NULL) { if (!noexc) caml_raise_out_of_memory(); diff --git a/runtime/memprof.c b/runtime/memprof.c index c7bd1b4cfd3..1464cca40cc 100644 --- a/runtime/memprof.c +++ b/runtime/memprof.c @@ -42,7 +42,6 @@ CAMLprim value caml_memprof_stop(value unit) #include "caml/weak.h" #include "caml/stack.h" #include "caml/misc.h" -#include "caml/compact.h" #include "caml/printexc.h" #include "caml/runtime_events.h" diff --git a/runtime/minor_gc.c b/runtime/minor_gc.c index 84b163d1ef4..ca35c2a491c 100644 --- a/runtime/minor_gc.c +++ b/runtime/minor_gc.c @@ -161,7 +161,7 @@ static value alloc_shared(caml_domain_state* d, mlsize_t wosize, tag_t tag, reserved_t reserved) { void* mem = caml_shared_try_alloc(d->shared_heap, wosize, tag, - reserved, 0 /* not pinned */); + reserved); d->allocated_words += Whsize_wosize(wosize); if (mem == NULL) { caml_fatal_error("allocation failure during minor GC"); @@ -750,6 +750,7 @@ caml_stw_empty_minor_heap_no_major_slice(caml_domain_state* domain, CAML_EV_BEGIN(EV_MINOR_CLEAR); caml_gc_log("running stw empty_minor_heap_domain_clear"); caml_empty_minor_heap_domain_clear(domain); + #ifdef DEBUG { for (uintnat* p = initial_young_ptr; p < (uintnat*)domain->young_end; ++p) diff --git a/runtime/platform.c b/runtime/platform.c index b3bf88a7aaf..0ce94b0fea2 100644 --- a/runtime/platform.c +++ b/runtime/platform.c @@ -155,12 +155,8 @@ static struct lf_skiplist mmap_blocks = {NULL}; #ifndef _WIN32 #endif -void* caml_mem_map(uintnat size, uintnat alignment, int reserve_only) +void* caml_mem_map(uintnat size, int reserve_only) { - CAMLassert(Is_power_of_2(alignment)); - CAMLassert(Is_page_aligned(size)); - alignment = round_up(alignment, caml_plat_mmap_alignment); - #ifdef DEBUG if (mmap_blocks.head == NULL) { /* The first call to caml_mem_map should be during caml_init_domains, called @@ 
-170,7 +166,7 @@ void* caml_mem_map(uintnat size, uintnat alignment, int reserve_only) } #endif - void* mem = caml_plat_mem_map(size, alignment, reserve_only); + void* mem = caml_plat_mem_map(size, reserve_only); if (mem == 0) { caml_gc_message(0x1000, "mmap %" ARCH_INTNAT_PRINTF_FORMAT "d bytes failed", diff --git a/runtime/shared_heap.c b/runtime/shared_heap.c index 50613446843..5fa2758943c 100644 --- a/runtime/shared_heap.c +++ b/runtime/shared_heap.c @@ -25,6 +25,7 @@ #include "caml/fiber.h" /* for verification */ #include "caml/gc.h" #include "caml/globroots.h" +#include "caml/major_gc.h" #include "caml/memory.h" #include "caml/mlvalues.h" #include "caml/platform.h" @@ -32,6 +33,9 @@ #include "caml/shared_heap.h" #include "caml/sizeclasses.h" #include "caml/startup_aux.h" +#include "caml/weak.h" + +CAMLexport atomic_uintnat caml_compactions_count; typedef unsigned int sizeclass; @@ -51,6 +55,12 @@ typedef struct pool { CAML_STATIC_ASSERT(sizeof(pool) == Bsize_wsize(POOL_HEADER_WSIZE)); #define POOL_HEADER_SZ sizeof(pool) +#define POOL_SLAB_WOFFSET(sz) (POOL_HEADER_WSIZE + wastage_sizeclass[sz]) +#define POOL_FIRST_BLOCK(p, sz) ((header_t*)(p) + POOL_SLAB_WOFFSET(sz)) +#define POOL_END(p) ((header_t*)(p) + POOL_WSIZE) +#define POOL_BLOCKS(p) ((POOL_WSIZE - POOL_HEADER_WSIZE) / \ + wsize_sizeclass[(p)->sz]) + typedef struct large_alloc { caml_domain_state* owner; struct large_alloc* next; @@ -93,6 +103,10 @@ struct caml_heap_state { struct heap_stats stats; }; +struct compact_pool_stat { + int free_blocks; + int live_blocks; +}; /* You need to hold the [pool_freelist] lock to call these functions. */ static void orphan_heap_stats_with_lock(struct caml_heap_state *); @@ -113,6 +127,7 @@ struct caml_heap_state* caml_init_shared_heap (void) { heap->swept_large = NULL; heap->unswept_large = NULL; heap->owner = Caml_state; + memset(&heap->stats, 0, sizeof(heap->stats)); } return heap; @@ -166,23 +181,20 @@ void caml_teardown_shared_heap(struct caml_heap_state* heap) { /* Allocating and deallocating pools from the global freelist. */ -#define POOLS_PER_ALLOCATION 16 static pool* pool_acquire(struct caml_heap_state* local) { pool* r; caml_plat_lock(&pool_freelist.lock); if (!pool_freelist.free) { - void* mem = caml_mem_map(Bsize_wsize(POOL_WSIZE) * POOLS_PER_ALLOCATION, - Bsize_wsize(POOL_WSIZE), 0 /* allocate */); - int i; + void* mem = caml_mem_map(Bsize_wsize(POOL_WSIZE), 0); + if (mem) { CAMLassert(pool_freelist.free == NULL); - for (i=0; inext = pool_freelist.free; - r->owner = NULL; - pool_freelist.free = r; - } + + r = (pool*)mem; + r->next = pool_freelist.free; + r->owner = NULL; + pool_freelist.free = r; } } r = pool_freelist.free; @@ -194,23 +206,37 @@ static pool* pool_acquire(struct caml_heap_state* local) { return r; } +/* release [pool] to the current free list of pools */ static void pool_release(struct caml_heap_state* local, pool* pool, - sizeclass sz) { + sizeclass sz) +{ pool->owner = NULL; CAMLassert(pool->sz == sz); local->stats.pool_words -= POOL_WSIZE; local->stats.pool_frag_words -= POOL_HEADER_WSIZE + wastage_sizeclass[sz]; - /* TODO: give free pools back to the OS. 
Issue #698 */ caml_plat_lock(&pool_freelist.lock); pool->next = pool_freelist.free; pool_freelist.free = pool; caml_plat_unlock(&pool_freelist.lock); } -static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s) { - value* p = (value*)((char*)a + POOL_HEADER_SZ); - value* end = (value*)a + POOL_WSIZE; + +/* free the memory of [pool], giving it back to the OS */ +static void pool_free(struct caml_heap_state* local, + pool* pool, + sizeclass sz) +{ + CAMLassert(pool->sz == sz); + local->stats.pool_words -= POOL_WSIZE; + local->stats.pool_frag_words -= POOL_HEADER_WSIZE + wastage_sizeclass[sz]; + caml_mem_unmap(pool, Bsize_wsize(POOL_WSIZE)); +} + +static void calc_pool_stats(pool* a, sizeclass sz, struct heap_stats* s) +{ + header_t* p = POOL_FIRST_BLOCK(a, sz); + header_t* end = POOL_END(a); mlsize_t wh = wsize_sizeclass[sz]; s->pool_frag_words += Wsize_bsize(POOL_HEADER_SZ); @@ -235,8 +261,8 @@ Caml_inline void pool_initialize(pool* r, caml_domain_state* owner) { mlsize_t wh = wsize_sizeclass[sz]; - value* p = (value*)((char*)r + POOL_HEADER_SZ); - value* end = (value*)((char*)r + Bsize_wsize(POOL_WSIZE)); + header_t* p = POOL_FIRST_BLOCK(r, sz); + header_t* end = POOL_END(r); r->next = 0; r->owner = owner; @@ -250,9 +276,14 @@ Caml_inline void pool_initialize(pool* r, while (p + wh <= end) { p[0] = 0; /* zero header indicates free object */ p[1] = (value)(p - wh); + #ifdef DEBUG + for (int w = 2 ; w < wh; w++) { + p[w] = Debug_free_major; + } + #endif p += wh; } - r->next_obj = p - wh; + r->next_obj = (value*)(p - wh); } /* Allocating an object from a pool */ @@ -395,7 +426,7 @@ static void* large_allocate(struct caml_heap_state* local, mlsize_t sz) { } value* caml_shared_try_alloc(struct caml_heap_state* local, mlsize_t wosize, - tag_t tag, reserved_t reserved, int pinned) + tag_t tag, reserved_t reserved) { mlsize_t whsize = Whsize_wosize(wosize); value* p; @@ -420,7 +451,7 @@ value* caml_shared_try_alloc(struct caml_heap_state* local, mlsize_t wosize, p = large_allocate(local, Bsize_wsize(whsize)); if (!p) return 0; } - colour = pinned ? 
NOT_MARKABLE : caml_global_heap_state.MARKED; + colour = caml_global_heap_state.MARKED; Hd_hp (p) = Make_header_with_reserved(wosize, tag, colour, reserved); #ifdef DEBUG { @@ -433,18 +464,6 @@ value* caml_shared_try_alloc(struct caml_heap_state* local, mlsize_t wosize, return p; } -struct pool* caml_pool_of_shared_block(value v) -{ - mlsize_t whsize; - CAMLassert (Is_block(v) && !Is_young(v)); - whsize = Whsize_wosize(Wosize_val(v)); - if (whsize > 0 && whsize <= SIZECLASS_MAX) { - return (pool*)((uintnat)v &~(POOL_WSIZE * sizeof(value) - 1)); - } else { - return 0; - } -} - /* Sweeping */ static intnat pool_sweep(struct caml_heap_state* local, pool** plist, @@ -455,8 +474,8 @@ static intnat pool_sweep(struct caml_heap_state* local, pool** plist, *plist = a->next; { - value* p = (value*)((char*)a + POOL_HEADER_SZ); - value* end = (value*)a + POOL_WSIZE; + header_t* p = POOL_FIRST_BLOCK(a, sz); + header_t* end = POOL_END(a); mlsize_t wh = wsize_sizeclass[sz]; int all_used = 1; struct heap_stats* s = &local->stats; @@ -485,7 +504,7 @@ static intnat pool_sweep(struct caml_heap_state* local, pool** plist, } } #endif - a->next_obj = p; + a->next_obj = (value*)p; all_used = 0; /* update stats */ s->pool_live_blocks--; @@ -594,8 +613,8 @@ uintnat caml_heap_blocks(struct caml_heap_state* local) { void caml_redarken_pool(struct pool* r, scanning_action f, void* fdata) { mlsize_t wh = wsize_sizeclass[r->sz]; - value* p = (value*)((char*)r + POOL_HEADER_SZ); - value* end = (value*)((char*)r + Bsize_wsize(POOL_WSIZE)); + header_t* p = POOL_FIRST_BLOCK(r, r->sz); + header_t* end = POOL_END(r); while (p + wh <= end) { header_t hd = p[0]; @@ -785,6 +804,492 @@ void caml_verify_heap(caml_domain_state *domain) { caml_stat_free(st); } +/* Compaction starts here. See [caml_compact_heap] for entry. */ + +/* Given a single value `v`, found at `p`, check if it points to an + evacuated block, and if so update it using the forwarding pointer + created by the compactor. */ +static inline void compact_update_value(void* ignored, + value v, + volatile value* p) +{ + if (Is_block(v)) { + CAMLassert(!Is_young(v)); + + tag_t tag = Tag_val(v); + + int infix_offset = 0; + if (tag == Infix_tag) { + infix_offset = Infix_offset_val(v); + /* v currently points to an Infix_tag inside of a Closure_tag. + The forwarding pointer we want is in the first field of the + Closure_tag. */ + v -= infix_offset; + CAMLassert(Tag_val(v) == Closure_tag); + } + + /* non-markable blocks can't move */ + if (Has_status_val(v, NOT_MARKABLE)) + return; + + if (Whsize_val(v) <= SIZECLASS_MAX) { + /* MARKED header status means the location `p` points to a block that + has been evacuated. Use the forwarding pointer in the first field + to update to the new location. */ + if (Has_status_val(v, caml_global_heap_state.MARKED)) { + value fwd = Field(v, 0) + infix_offset; + CAMLassert(Is_block(fwd)); + CAMLassert(Tag_val(fwd) == tag); + *p = fwd; + } + } + } +} + +/* Given a value found at `p` check if it points to an evacuated + block, and if so update it using the forwarding pointer created by + the compactor. */ +static inline void compact_update_value_at(volatile value* p) +{ + compact_update_value(NULL, *p, p); +} + +/* For each pointer in the block pointed to by `p`, check if it points + to an evacuated block and if so update it using the forwarding + pointer created by the compactor. 
*/ +static void compact_update_block(header_t* p) +{ + header_t hd = Hd_hp(p); + + /* We should never be called with a block that has a zero header (this would + indicate a bug in traversing the shared pools). */ + CAMLassert(hd != 0); + + tag_t tag = Tag_hd(hd); + + /* We should never encounter an Infix tag iterating over the shared pools or + large allocations. We could find it in roots but those use + [compact_update_value]. */ + CAMLassert(tag != Infix_tag); + + if (tag == Cont_tag) { + value stk = Field(Val_hp(p), 0); + if (Ptr_val(stk)) { + caml_scan_stack(&compact_update_value, 0, NULL, Ptr_val(stk), 0); + } + } else { + uintnat offset = 0; + + if (tag == Closure_tag) { + offset = Start_env_closinfo(Closinfo_val(Val_hp(p))); + } + + if (tag < No_scan_tag) { + mlsize_t wosz = Wosize_hd(hd); + for (mlsize_t i = offset; i < wosz; i++) { + compact_update_value_at(&Field(Val_hp(p), i)); + } + } + } +} + +/* Update all the live blocks in a list of pools. */ + +static void compact_update_pools(pool *cur_pool) +{ + while (cur_pool) { + header_t* p = POOL_FIRST_BLOCK(cur_pool, cur_pool->sz); + header_t* end = POOL_END(cur_pool); + mlsize_t wh = wsize_sizeclass[cur_pool->sz]; + + while (p + wh <= end) { + if (*p && + Has_status_val(Val_hp(p), caml_global_heap_state.UNMARKED)) { + compact_update_block(p); + } + p += wh; + } + cur_pool = cur_pool->next; + } +} + +/* Update all the fields in the list of ephemerons found at `*ephe_p` */ + +static void compact_update_ephe_list(volatile value *ephe_p) +{ + while (*ephe_p) { + compact_update_value_at(ephe_p); + + value ephe = *ephe_p; + mlsize_t wosize = Wosize_val(ephe); + compact_update_value_at(&Field(ephe, CAML_EPHE_DATA_OFFSET)); + + for (int i = CAML_EPHE_FIRST_KEY ; i < wosize ; i++) { + compact_update_value_at(&Field(ephe, i)); + } + + ephe_p = &Field(ephe, CAML_EPHE_LINK_OFFSET); + } +} + +/* Compact the heap for the given domain. Run in parallel for all domains. */ + +void caml_compact_heap(caml_domain_state* domain_state, + int participating_count, + caml_domain_state** participants) +{ + caml_gc_log("Compacting heap start"); + CAML_EV_BEGIN(EV_COMPACT); + /* Warning: caml_compact_heap must only be called from + [cycle_all_domains_callback] in major_gc.c as there are + very specific conditions the compaction algorithm expects. + + The following code implements a compaction algorithm that is similar to + Edward's Two-Finger algorithm from the original 1974 LISP book (The + Programming Language LISP). At a high level the algorithm works as a series + of parallel (using all running domains) phases separated by global barriers: + + 1. For each size class + a. Compute the number of live blocks in partially filled pools + b. Keep enough pools to fully contain the number of live blocks and + set the rest to be evacuated + c. For each live block in each pool in the evacuation list, + allocate and copy into a non-evacuating pool. + 2. Proceed through the roots and the heap, updating pointers to evacuated + blocks to point to the new location of the block. Update finalisers and + ephemerons too. + 3. Go through pools evacuated and release them. Finally free all but + one pool in the freelist. + 4. One domain needs to release the pools in the freelist back to the OS. + + The algorithm requires one full pass through the whole heap (pools and large + allocations) to rewrite pointers, as well as two passes through the + partially-occupied pools in the heap to compute the number of live blocks + and evacuate them. + */ + + /* First phase. 
Here we compute the number of live blocks in partially + filled pools, determine pools to be evacuated and then evacuate from them. + For the first phase we need not consider full pools, they + cannot be evacuated to or from. */ + caml_global_barrier(); + CAML_EV_BEGIN(EV_COMPACT_EVACUATE); + + struct caml_heap_state* heap = Caml_state->shared_heap; + + #ifdef DEBUG + /* Check preconditions for the heap: */ + for (int sz_class = 1; sz_class < NUM_SIZECLASSES; sz_class++) { + /* No sweeping has happened yet */ + CAMLassert(heap->avail_pools[sz_class] == NULL); + CAMLassert(heap->full_pools[sz_class] == NULL); + CAMLassert(heap->swept_large == NULL); + /* No pools waiting for adoption */ + if (participants[0] == Caml_state) { + CAMLassert(pool_freelist.global_avail_pools[sz_class] == NULL); + CAMLassert(pool_freelist.global_full_pools[sz_class] == NULL); + } + /* The minor heap is empty */ + CAMLassert(Caml_state->young_ptr == Caml_state->young_end); + /* The mark stack is empty */ + CAMLassert(caml_mark_stack_is_empty()); + } + #endif + + /* All evacuated pools (of every size class) */ + pool *evacuated_pools = NULL; + + for (int sz_class = 1; sz_class < NUM_SIZECLASSES; sz_class++) { + /* We only care about moving things in pools that aren't full (we cannot + evacuate to or from a full pool) */ + pool* cur_pool = heap->unswept_avail_pools[sz_class]; + + if (!cur_pool) { + /* No partially filled pools for this size, nothing to do */ + continue; + } + + /* count the number of pools */ + int num_pools = 0; + + while (cur_pool) { + num_pools++; + cur_pool = cur_pool->next; + } + + struct compact_pool_stat* pool_stats = caml_stat_alloc_noexc( + sizeof(struct compact_pool_stat) * num_pools); + + /* if we're unable to allocate pool_stats here then we should avoid + evacuating this size class. It's unlikely but it may be that we had + better success with an earlier size class and that results in some + memory being freed up. */ + if( pool_stats == NULL ) { + caml_gc_log("Unable to allocate pool_stats for size class %d", sz_class); + + continue; + } + + cur_pool = heap->unswept_avail_pools[sz_class]; + + /* Count the number of free and live blocks in each pool. Note that a live + block here currently has the header status UNMARKED (because it was + MARKED in the previous cycle). After compaction the shared pools will + contain UNMARKED and GARBAGE from the "to" pools and UNMARKED from the + "from" pools which were evacuated. + + At the cost of some complexity or an additional pass we could compute the + exact amount of space needed or even sweep all pools in this counting + pass. 
+ */ + int k = 0; + int total_live_blocks = 0; +#ifdef DEBUG + int total_free_blocks = 0; +#endif + while (cur_pool) { + header_t* p = POOL_FIRST_BLOCK(cur_pool, sz_class); + header_t* end = POOL_END(cur_pool); + mlsize_t wh = wsize_sizeclass[sz_class]; + + pool_stats[k].free_blocks = 0; + pool_stats[k].live_blocks = 0; + + while (p + wh <= end) { + header_t h = (header_t)atomic_load_relaxed((atomic_uintnat*)p); + + /* A zero header in a shared heap pool indicates an empty space */ + if (!h) { + pool_stats[k].free_blocks++; +#ifdef DEBUG + total_free_blocks++; +#endif + } else if (Has_status_hd(h, caml_global_heap_state.UNMARKED)) { + total_live_blocks++; + pool_stats[k].live_blocks++; + } + p += wh; + } + + cur_pool = cur_pool->next; + k++; + } + + /* Note that partially filled pools must have at least some free space*/ +#ifdef DEBUG + CAMLassert(total_free_blocks > 0); +#endif + + if (!total_live_blocks) { + /* No live (i.e unmarked) blocks in partially filled pools, nothing to do + for this size class */ + continue; + } + + /* Now we use the pool stats to calculate which pools will be evacuated. We + want to walk through the pools and check whether we have enough free + blocks in the pools behind us to accommodate all the remaining live + blocks. */ + int free_blocks = 0; + int j = 0; + int remaining_live_blocks = total_live_blocks; + + cur_pool = heap->unswept_avail_pools[sz_class]; + /* [last_pool_p] will be a pointer to the next field of the last + non-evacuating pool. We need this so we can snip the list of evacuating + pools from [unswept_avail_pools] and eventually attach them all to + [evacuated_pools]. */ + pool **last_pool_p = &heap->unswept_avail_pools[sz_class]; + while (cur_pool) { + if (free_blocks >= remaining_live_blocks) { + break; + } + + free_blocks += pool_stats[j].free_blocks; + remaining_live_blocks -= pool_stats[j].live_blocks; + last_pool_p = &cur_pool->next; + cur_pool = cur_pool->next; + j++; + } + + /* We're done with the pool stats. */ + caml_stat_free(pool_stats); + + /* `cur_pool` now points to the first pool we are evacuating, or NULL if + we could not compact this particular size class (for this domain) */ + + /* Snip the evacuating pools from list of pools we are retaining */ + *last_pool_p = NULL; + + /* Evacuate marked blocks from the evacuating pools into the + avail pools. */ + while (cur_pool) { + header_t* p = POOL_FIRST_BLOCK(cur_pool, sz_class); + header_t* end = POOL_END(cur_pool); + mlsize_t wh = wsize_sizeclass[sz_class]; + + while (p + wh <= end) { + header_t hd = (header_t)atomic_load_relaxed((atomic_uintnat*)p); + + /* A zero header in a shared heap pool indicates an empty space */ + if (hd) { + CAMLassert (!Has_status_hd(hd, caml_global_heap_state.MARKED)); + CAMLassert (!Has_status_hd(hd, NOT_MARKABLE)); + + /* Reminder: since colours have rotated, UNMARKED indicates a MARKED + (i.e live) block */ + if (Has_status_hd(hd, caml_global_heap_state.UNMARKED)) { + /* live block in an evacuating pool, so we evacuate it to + * the first available block */ + pool* to_pool = heap->unswept_avail_pools[sz_class]; + value* new_p = to_pool->next_obj; + CAMLassert(new_p); + value *next = (value*)new_p[1]; + to_pool->next_obj = next; + + if (!next) { + /* This pool is full. 
Move it to unswept_full_pools */ + heap->unswept_avail_pools[sz_class] = to_pool->next; + to_pool->next = heap->unswept_full_pools[sz_class]; + heap->unswept_full_pools[sz_class] = to_pool; + } + + /* Copy the block to the new location */ + memcpy(new_p, p, Whsize_hd(hd) * sizeof(value)); + + /* Set first field of p to a forwarding pointer */ + Field(Val_hp(p), 0) = Val_hp(new_p); + + /* Since there can be no blocks with the MARKED status, we use this + to indicate that a block has been evacuated and any pointers to + it should be updated. */ + *p = With_status_hd(hd, caml_global_heap_state.MARKED); + } else if (Has_status_hd(hd, caml_global_heap_state.GARBAGE)) { + /* We are implicitly sweeping pools in the evacuation set and thus + we must remember to call finalisers for Custom blocks that would + have been swept in a subsequent major cycle. */ + if (Tag_hd (hd) == Custom_tag) { + void (*final_fun)(value) = Custom_ops_val(Val_hp(p))->finalize; + if (final_fun) final_fun(Val_hp(p)); + } + + /* In the DEBUG runtime, we should overwrite the fields of swept + blocks. Note: this pool can't be allocated in to again and so + we overwrite the header and first fields too. */ + #ifdef DEBUG + for (int w = 0 ; w < wh ; w++) { + Field(p, w) = Debug_free_major; + } + #endif + } + } + + p += wh; + } + /* move pool to evacuated pools list, continue to next pool */ + pool *next = cur_pool->next; + cur_pool->next = evacuated_pools; + evacuated_pools = cur_pool; + cur_pool = next; + } + } + + CAML_EV_END(EV_COMPACT_EVACUATE); + caml_global_barrier(); + CAML_EV_BEGIN(EV_COMPACT_FORWARD); + + /* Second phase: at this point all live blocks in evacuated pools + have been moved and their old locations' first fields now point to + their new locations. We now go through all pools again (including + full ones this time) and for each field we check if the block the + field points to has the header status MARKED - if it does then the block + has been evacuated and we need to update the field to point to the new + location. We do this by using the forwarding pointer that is in the first + field of the evacuated block. */ + + /* First we do roots (locals and finalisers) */ + caml_do_roots(&compact_update_value, 0, NULL, Caml_state, 1); + + /* Next, one domain does the global roots */ + if (participants[0] == Caml_state) { + caml_scan_global_roots(&compact_update_value, NULL); + } + + /* Shared heap pools. */ + for (int sz_class = 1; sz_class < NUM_SIZECLASSES; sz_class++) { + compact_update_pools(heap->unswept_avail_pools[sz_class]); + compact_update_pools(heap->unswept_full_pools[sz_class]); + } + + /* Large allocations */ + for (large_alloc* la = heap->unswept_large; la != NULL; la = la->next) { + header_t* p = (header_t*)((char*)la + LARGE_ALLOC_HEADER_SZ); + if (Has_status_val(Val_hp(p), caml_global_heap_state.UNMARKED)) { + compact_update_block(p); + } + } + + /* Ephemerons */ + struct caml_ephe_info* ephe_info = Caml_state->ephe_info; + compact_update_ephe_list(&ephe_info->todo); + compact_update_ephe_list(&ephe_info->live); + + CAML_EV_END(EV_COMPACT_FORWARD); + caml_global_barrier(); + CAML_EV_BEGIN(EV_COMPACT_RELEASE); + + /* Third phase: free all evacuated pools and release the mappings back to + the OS. + + Note that we may have no "available" pools left, if all + remaining pools have been filled up by evacuated blocks. 
*/ + + pool* cur_pool = evacuated_pools; + while (cur_pool) { + pool* next_pool = cur_pool->next; + + #ifdef DEBUG + for (header_t *p = POOL_FIRST_BLOCK(cur_pool, cur_pool->sz); + p < POOL_END(cur_pool); p++) { + *p = Debug_free_major; + } + #endif + + pool_free(heap, cur_pool, cur_pool->sz); + cur_pool = next_pool; + } + + CAML_EV_END(EV_COMPACT_RELEASE); + caml_global_barrier(); + + /* Fourth phase: one domain also needs to release the free list */ + if( participants[0] == Caml_state ) { + pool* cur_pool; + pool* next_pool; + + caml_plat_lock(&pool_freelist.lock); + cur_pool = pool_freelist.free; + + while( cur_pool ) { + next_pool = cur_pool->next; + /* No stats to update so just unmap */ + caml_mem_unmap(cur_pool, Bsize_wsize(POOL_WSIZE)); + cur_pool = next_pool; + } + + pool_freelist.free = NULL; + + caml_plat_unlock(&pool_freelist.lock); + + /* We are done, increment our compaction count */ + atomic_fetch_add(&caml_compactions_count, 1); + } + + caml_gc_log("Compacting heap complete"); + CAML_EV_END(EV_COMPACT); +} + +/* Compaction end */ struct mem_stats { /* unit is words */ @@ -803,8 +1308,8 @@ static void verify_pool(pool* a, sizeclass sz, struct mem_stats* s) { } { - value* p = (value*)((char*)a + POOL_HEADER_SZ); - value* end = (value*)a + POOL_WSIZE; + header_t* p = POOL_FIRST_BLOCK(a, sz); + header_t* end = POOL_END(a); mlsize_t wh = wsize_sizeclass[sz]; s->overhead += Wsize_bsize(POOL_HEADER_SZ); diff --git a/runtime/unix.c b/runtime/unix.c index a421cad188d..0e17a4a0969 100644 --- a/runtime/unix.c +++ b/runtime/unix.c @@ -512,23 +512,9 @@ void caml_init_os_params(void) #ifndef __CYGWIN__ -/* Standard Unix implementation: reserve with mmap (and trim to alignment) with - commit done using mmap as well. */ - -Caml_inline void safe_munmap(uintnat addr, uintnat size) +void *caml_plat_mem_map(uintnat size, int reserve_only) { - if (size > 0) { - caml_gc_message(0x1000, "munmap %" ARCH_INTNAT_PRINTF_FORMAT "d" - " bytes at %" ARCH_INTNAT_PRINTF_FORMAT "x" - " for heaps\n", size, addr); - munmap((void*)addr, size); - } -} - -void *caml_plat_mem_map(uintnat size, uintnat alignment, int reserve_only) -{ - uintnat alloc_sz = size + alignment; - uintnat base, aligned_start, aligned_end; + uintnat alloc_sz = size; void* mem; mem = mmap(0, alloc_sz, reserve_only ? PROT_NONE : (PROT_READ | PROT_WRITE), @@ -536,14 +522,6 @@ void *caml_plat_mem_map(uintnat size, uintnat alignment, int reserve_only) if (mem == MAP_FAILED) return 0; - /* trim to an aligned region */ - base = (uintnat)mem; - aligned_start = (base + alignment - 1) & ~(alignment - 1); - aligned_end = aligned_start + size; - safe_munmap(base, aligned_start - base); - safe_munmap(aligned_end, (base + alloc_sz) - aligned_end); - mem = (void*)aligned_start; - return mem; } @@ -565,13 +543,10 @@ static void* map_fixed(void* mem, uintnat size, int prot) done using mprotect, since Cygwin's mmap doesn't implement the required functions for committing using mmap. */ -void *caml_plat_mem_map(uintnat size, uintnat alignment, int reserve_only) +void *caml_plat_mem_map(uintnat size, int reserve_only) { void* mem; - if (alignment > caml_plat_mmap_alignment) - caml_fatal_error("Cannot align memory to %lx on this platform", alignment); - mem = mmap(0, size, reserve_only ? 
PROT_NONE : (PROT_READ | PROT_WRITE), MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (mem == MAP_FAILED) diff --git a/runtime/win32.c b/runtime/win32.c index f1332822c24..84bf11bbfb3 100644 --- a/runtime/win32.c +++ b/runtime/win32.c @@ -1156,13 +1156,8 @@ int64_t caml_time_counter(void) return (int64_t)(now.QuadPart * clock_period); } -void *caml_plat_mem_map(uintnat size, uintnat alignment, int reserve_only) +void *caml_plat_mem_map(uintnat size, int reserve_only) { - /* VirtualAlloc returns an address aligned to caml_plat_mmap_alignment, so - trimming will not be required. VirtualAlloc returns 0 on error. */ - if (alignment > caml_plat_mmap_alignment) - caml_fatal_error("Cannot align memory to %" ARCH_INTNAT_PRINTF_FORMAT "x" - " on this platform", alignment); return VirtualAlloc(NULL, size, MEM_RESERVE | (reserve_only ? 0 : MEM_COMMIT), diff --git a/testsuite/tests/compaction/test_freelist_free.ml b/testsuite/tests/compaction/test_freelist_free.ml new file mode 100644 index 00000000000..4162452eafb --- /dev/null +++ b/testsuite/tests/compaction/test_freelist_free.ml @@ -0,0 +1,19 @@ +(* TEST +*) + +(* tests that we correctly empty the shared pool's freelist. This requires a + bunch of garbage to be generated, a major cycle and two compactions to + test. If we can do that without segfault, we're good. *) + +let () = + let arr = ref (Some (Array.init 1000000 (fun x -> (Some x)))) in + Gc.minor (); + (* Now arr should be promoted to the major heap *) + arr := None; + Gc.full_major (); + (* Now arr should be garbage and the pools in the shared heap allocated + for it should be on the free list *) + Gc.compact (); + (* Now the pools should be compacted but also the freelist should have + been reset correctly *) + Gc.compact () diff --git a/testsuite/tests/lib-runtime-events/test_compact.ml b/testsuite/tests/lib-runtime-events/test_compact.ml new file mode 100644 index 00000000000..890a28901ec --- /dev/null +++ b/testsuite/tests/lib-runtime-events/test_compact.ml @@ -0,0 +1,75 @@ +(* TEST + include runtime_events; +*) +open Runtime_events + +type state = + | INIT + | BEGIN + | EVACUATING_BEGIN + | EVACUATING_END + | FORWARDING_BEGIN + | FORWARDING_END + | RELEASING_BEGIN + | RELEASING_END + | END + +let compact_state = ref INIT + +let () = + start (); + let cursor = create_cursor None in + let runtime_begin domain_id ts phase = + match phase with + | EV_COMPACT -> + begin + match !compact_state with + | INIT -> compact_state := BEGIN + | _ -> assert(false) + end + | EV_COMPACT_EVACUATE -> begin + match !compact_state with + | BEGIN -> compact_state := EVACUATING_BEGIN + | _ -> assert(false) + end + | EV_COMPACT_FORWARD -> begin + match !compact_state with + | EVACUATING_END -> compact_state := FORWARDING_BEGIN + | _ -> assert(false) + end + | EV_COMPACT_RELEASE -> begin + match !compact_state with + | FORWARDING_END -> compact_state := RELEASING_BEGIN + | _ -> assert(false) + end + | _ -> () in + let runtime_end domain_id ts phase = + match phase with + | EV_COMPACT -> + begin + match !compact_state with + | RELEASING_END -> compact_state := END + | _ -> assert(false) + end + | EV_COMPACT_EVACUATE -> begin + match !compact_state with + | EVACUATING_BEGIN -> compact_state := EVACUATING_END + | _ -> assert(false) + end + | EV_COMPACT_FORWARD -> begin + match !compact_state with + | FORWARDING_BEGIN -> compact_state := FORWARDING_END + | _ -> assert(false) + end + | EV_COMPACT_RELEASE -> begin + match !compact_state with + | RELEASING_BEGIN -> compact_state := RELEASING_END + | _ -> assert(false) + 
end + | _ -> () + in + let callbacks = Callbacks.create ~runtime_begin ~runtime_end () + in + Gc.compact (); + ignore(read_poll cursor callbacks (Some 1_000)); + assert(!compact_state = END) diff --git a/tools/gen_sizeclasses.ml b/tools/gen_sizeclasses.ml index eb0a80d3bcd..35169e1f769 100644 --- a/tools/gen_sizeclasses.ml +++ b/tools/gen_sizeclasses.ml @@ -73,9 +73,9 @@ let _ = printf "#define SIZECLASS_MAX %d\n" max_slot; printf "#define NUM_SIZECLASSES %d\n" (List.length sizes); printf "static const unsigned int \ -wsize_sizeclass[NUM_SIZECLASSES] = @[<2>{ %a };@]\n" print_list sizes; +wsize_sizeclass[NUM_SIZECLASSES] =@[<2>{ %a };@]\n" print_list sizes; printf "static const unsigned char \ -wastage_sizeclass[NUM_SIZECLASSES] = @[<2>{ %a };@]\n" print_list wastage; +wastage_sizeclass[NUM_SIZECLASSES] =@[<2>{ %a };@]\n" print_list wastage; printf "static const unsigned char \ -sizeclass_wsize[SIZECLASS_MAX + 1] = @[<2>{ %a };@]\n" +sizeclass_wsize[SIZECLASS_MAX + 1] =@[<2>{ %a };@]\n" print_list (255 :: size_slots 1);
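
A minimal usage sketch, not part of the patch above, assuming a single-domain program running on a runtime built with this change: caml_gc_quick_stat now stores the new caml_compactions_count counter in field 13 instead of a hard-coded 0, so the compactions field of Gc.quick_stat should advance across a call to Gc.compact (which, per gc_ctrl.c above, now forces a compacting major cycle). The new EV_COMPACT* phases are exercised separately by test_compact.ml.

    let () =
      (* [compactions] was always reported as 0 before this patch; after
         [Gc.compact] it should advance, since caml_gc_compaction now requests
         a compacting major cycle and one domain bumps caml_compactions_count. *)
      let before = (Gc.quick_stat ()).Gc.compactions in
      Gc.compact ();
      let after = (Gc.quick_stat ()).Gc.compactions in
      Printf.printf "compactions: %d -> %d\n" before after;
      assert (after > before)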