@@ -948,6 +948,196 @@ _PyMem_Strdup(const char *str)
    return copy;
}

/***********************************************/
/* Delayed freeing support for Py_GIL_DISABLED */
/***********************************************/

// So that sizeof(struct _mem_work_chunk) is 4096 bytes on 64-bit platforms.
#define WORK_ITEMS_PER_CHUNK 254

// A pointer to be freed once the QSBR read sequence reaches qsbr_goal.
struct _mem_work_item {
    void *ptr;
    uint64_t qsbr_goal;
};

// A fixed-size buffer of pointers to be freed
struct _mem_work_chunk {
    // Linked list node of chunks in queue
    struct llist_node node;

    Py_ssize_t rd_idx;  // index of next item to read
    Py_ssize_t wr_idx;  // index of next item to write
    struct _mem_work_item array[WORK_ITEMS_PER_CHUNK];
};
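// Worked check of the sizing comment above (added illustration, not part of
// the original change): on LP64 platforms the chunk header is 32 bytes (two
// llist_node pointers plus two 8-byte indices) and each _mem_work_item is
// 16 bytes (pointer plus 64-bit goal), so 32 + 254 * 16 == 4096, exactly one
// 4 KiB page. Assuming a C11 toolchain, a static assertion could enforce it:
//
//     _Static_assert(sizeof(struct _mem_work_chunk) == 4096,
//                    "_mem_work_chunk should fill one 4 KiB page");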
973+
void
_PyMem_FreeDelayed(void *ptr)
{
#ifndef Py_GIL_DISABLED
    PyMem_Free(ptr);
#else
    if (_PyRuntime.stoptheworld.world_stopped) {
        // Free immediately if the world is stopped, including during
        // interpreter shutdown.
        PyMem_Free(ptr);
        return;
    }

    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
    struct llist_node *head = &tstate->mem_free_queue;

    struct _mem_work_chunk *buf = NULL;
    if (!llist_empty(head)) {
        // Try to re-use the last buffer
        buf = llist_data(head->prev, struct _mem_work_chunk, node);
        if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
            // already full
            buf = NULL;
        }
    }

    if (buf == NULL) {
        buf = PyMem_Calloc(1, sizeof(*buf));
        if (buf != NULL) {
            llist_insert_tail(head, &buf->node);
        }
    }

    if (buf == NULL) {
        // failed to allocate a buffer, free immediately
        _PyEval_StopTheWorld(tstate->base.interp);
        PyMem_Free(ptr);
        _PyEval_StartTheWorld(tstate->base.interp);
        return;
    }

    assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
    uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
    buf->array[buf->wr_idx].ptr = ptr;
    buf->array[buf->wr_idx].qsbr_goal = seq;
    buf->wr_idx++;

    if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
        _PyMem_ProcessDelayed((PyThreadState *)tstate);
    }
#endif
}
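// Usage sketch (hypothetical caller, not from this change): a writer that
// replaces a shared buffer still visible to lock-free readers routes the old
// pointer through the delayed path instead of calling PyMem_Free() directly.
// The names "table" and "new_entries" are invented for illustration:
//
//     void *old = table->entries;
//     _Py_atomic_store_ptr_release(&table->entries, new_entries);
//     _PyMem_FreeDelayed(old);   // reclaimed only after every thread
//                                // passes a quiescent state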
1026+
static struct _mem_work_chunk *
work_queue_first(struct llist_node *head)
{
    return llist_data(head->next, struct _mem_work_chunk, node);
}
1032+
static void
process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr,
              bool keep_empty)
{
    while (!llist_empty(head)) {
        struct _mem_work_chunk *buf = work_queue_first(head);

        while (buf->rd_idx < buf->wr_idx) {
            struct _mem_work_item *item = &buf->array[buf->rd_idx];
            if (!_Py_qsbr_poll(qsbr, item->qsbr_goal)) {
                return;
            }

            PyMem_Free(item->ptr);
            buf->rd_idx++;
        }

        assert(buf->rd_idx == buf->wr_idx);
        if (keep_empty && buf->node.next == head) {
            // Keep the last buffer in the queue to reduce re-allocations
            buf->rd_idx = buf->wr_idx = 0;
            return;
        }

        llist_remove(&buf->node);
        PyMem_Free(buf);
    }
}
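// Note on the early return above (added commentary): items are appended to a
// chunk with non-decreasing QSBR goals, so once _Py_qsbr_poll() fails for the
// item at rd_idx, no later item in that chunk can be safe yet. Returning
// early never loses work; unsatisfied items simply stay queued for the next
// call.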
1061+
static void
process_interp_queue(struct _Py_mem_interp_free_queue *queue,
                     struct _qsbr_thread_state *qsbr)
{
    if (!_Py_atomic_load_int_relaxed(&queue->has_work)) {
        return;
    }

    // Try to acquire the lock, but don't block if it's already held.
    if (_PyMutex_LockTimed(&queue->mutex, 0, 0) == PY_LOCK_ACQUIRED) {
        process_queue(&queue->head, qsbr, false);

        int more_work = !llist_empty(&queue->head);
        _Py_atomic_store_int_relaxed(&queue->has_work, more_work);

        PyMutex_Unlock(&queue->mutex);
    }
}
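// Design note (added commentary): the zero timeout turns the lock attempt
// into a try-lock. If another thread holds the mutex, it is already draining
// the shared queue, so blocking here would only serialize threads on cleanup
// that is in progress; the has_work flag ensures the queue is revisited.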
1080+
void
_PyMem_ProcessDelayed(PyThreadState *tstate)
{
    PyInterpreterState *interp = tstate->interp;
    _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;

    // Process thread-local work
    process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true);

    // Process shared interpreter work
    process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr);
}
1093+
void
_PyMem_AbandonDelayed(PyThreadState *tstate)
{
    PyInterpreterState *interp = tstate->interp;
    struct llist_node *queue = &((_PyThreadStateImpl *)tstate)->mem_free_queue;

    if (llist_empty(queue)) {
        return;
    }

    // Check if the queue contains one empty buffer
    struct _mem_work_chunk *buf = work_queue_first(queue);
    if (buf->rd_idx == buf->wr_idx) {
        llist_remove(&buf->node);
        PyMem_Free(buf);
        assert(llist_empty(queue));
        return;
    }

    // Merge the thread's work queue into the interpreter's work queue.
    PyMutex_Lock(&interp->mem_free_queue.mutex);
    llist_concat(&interp->mem_free_queue.head, queue);
    _Py_atomic_store_int_relaxed(&interp->mem_free_queue.has_work, 1);
    PyMutex_Unlock(&interp->mem_free_queue.mutex);

    assert(llist_empty(queue));  // the thread's queue is now empty
}
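// Rationale (added commentary): a detaching thread can no longer poll its
// own QSBR state, so any frees still pending in its local queue would
// otherwise be stranded. Handing the chunks to the interpreter-wide queue
// lets whichever thread next runs _PyMem_ProcessDelayed() finish the work.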
1121+
void
_PyMem_FiniDelayed(PyInterpreterState *interp)
{
    struct llist_node *head = &interp->mem_free_queue.head;
    while (!llist_empty(head)) {
        struct _mem_work_chunk *buf = work_queue_first(head);

        while (buf->rd_idx < buf->wr_idx) {
            // Free the remaining items immediately. There should be no other
            // threads accessing the memory at this point during shutdown.
            struct _mem_work_item *item = &buf->array[buf->rd_idx];
            PyMem_Free(item->ptr);
            buf->rd_idx++;
        }

        llist_remove(&buf->node);
        PyMem_Free(buf);
    }
}

/**************************/
/* the "object" allocator */