GH-133136: Revise QSBR to reduce excess memory held #135473

Draft · wants to merge 5 commits into main
2 changes: 1 addition & 1 deletion Include/internal/pycore_pymem.h
@@ -88,7 +88,7 @@ extern wchar_t *_PyMem_DefaultRawWcsdup(const wchar_t *str);
extern int _PyMem_DebugEnabled(void);

// Enqueue a pointer to be freed possibly after some delay.
extern void _PyMem_FreeDelayed(void *ptr);
extern void _PyMem_FreeDelayed(void *ptr, size_t size);

// Enqueue an object to be freed possibly after some delay
#ifdef Py_GIL_DISABLED
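For orientation, a minimal hypothetical call site (not part of this diff) showing what the new size argument is for: the caller reports how many bytes are being deferred along with the pointer, so QSBR can bound the total memory it holds.

```c
/* Hypothetical caller, for illustration only. 'old_size' is whatever the
 * caller knows about the old allocation, mirroring the call sites below. */
static void
swap_and_free(void **slot, void *new_buf, size_t old_size)
{
    void *old = *slot;
    *slot = new_buf;                    /* concurrent readers may still see 'old' */
    _PyMem_FreeDelayed(old, old_size);  /* actually freed once readers catch up */
}
```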
41 changes: 34 additions & 7 deletions Include/internal/pycore_qsbr.h
@@ -48,8 +48,21 @@ struct _qsbr_thread_state {
// Thread state (or NULL)
PyThreadState *tstate;

// Used to defer advancing write sequence a fixed number of times
int deferrals;
// Number of deferred items added by this thread since the last write
// sequence advance
int deferred_count;

// Estimate of the amount of memory held by this thread since the last
// write sequence advance
size_t deferred_memory;

// Amount of memory in mimalloc pages deferred from collection. While
// deferred, these pages cannot be reused for a different size class or
// by a different thread.
size_t deferred_page_memory;

// If non-zero, processing of deferred memory should be performed once the
// read sequence reaches this value.
uint64_t process_seq;

// Is this thread state allocated?
bool allocated;
@@ -59,7 +72,7 @@
// Padding to avoid false sharing
struct _qsbr_pad {
struct _qsbr_thread_state qsbr;
char __padding[64 - sizeof(struct _qsbr_thread_state)];
char __padding[128 - sizeof(struct _qsbr_thread_state)];
};

// Per-interpreter state
@@ -109,11 +122,25 @@ _Py_qbsr_goal_reached(struct _qsbr_thread_state *qsbr, uint64_t goal)
extern uint64_t
_Py_qsbr_advance(struct _qsbr_shared *shared);

// Batches requests to advance the write sequence. This advances the write
// sequence every N calls, which reduces overhead but increases time to
// reclamation. Returns the new goal.
// Advance the write sequence as required and return the sequence goal to use
// for memory to be freed. The 'free_size' argument is the size in bytes of
// the memory scheduled to be freed. If that size is not available, pass zero
// as the value.
extern uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr,
size_t free_size);

// Advance the write sequence as required and return the sequence goal to use
// for a mimalloc page to be collected. The 'page_size' argument is the size
// of the mimalloc page being deferred from collection.
extern uint64_t
_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr,
size_t page_size);

// Return true if memory held by QSBR should be processed to determine if it
// can be safely freed.
extern bool
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr);

// Have the read sequences advanced to the given goal? If this returns true,
// it is safe to reclaim any memory tagged with the goal (or an earlier goal).
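Read together, the new declarations describe one lifecycle. A condensed sketch of the intended flow (the `_Py_qsbr_*` names are from this header; the queue helpers are hypothetical):

```c
/* Defer a free: obtain a sequence goal for this pointer. */
uint64_t goal = _Py_qsbr_deferred_advance_for_free(tstate->qsbr, size);
enqueue_deferred(ptr, goal);            /* hypothetical: remember (ptr, goal) */

/* ... later, typically right after another deferred free ... */
if (_Py_qsbr_should_process(tstate->qsbr)) {
    /* drain every queued item whose goal the readers have reached */
    drain_deferred(tstate->qsbr);       /* hypothetical queue helper */
}
```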
@@ -0,0 +1,2 @@
Limit excess memory usage in the :term:`free threading` build when a
large dictionary or list is resized and accessed by multiple threads.
2 changes: 1 addition & 1 deletion Objects/codeobject.c
@@ -3350,7 +3350,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
}
memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
_Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
_PyMem_FreeDelayed(tlbc);
_PyMem_FreeDelayed(tlbc, tlbc->size * sizeof(void *));
tlbc = new_tlbc;
}
char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
4 changes: 2 additions & 2 deletions Objects/dictobject.c
@@ -813,7 +813,7 @@ free_keys_object(PyDictKeysObject *keys, bool use_qsbr)
{
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(keys);
_PyMem_FreeDelayed(keys, _PyDict_KeysSize(keys));
return;
}
#endif
@@ -858,7 +858,7 @@ free_values(PyDictValues *values, bool use_qsbr)
assert(values->embedded == 0);
#ifdef Py_GIL_DISABLED
if (use_qsbr) {
_PyMem_FreeDelayed(values);
_PyMem_FreeDelayed(values, values_size_from_count(values->capacity));
return;
}
#endif
3 changes: 2 additions & 1 deletion Objects/listobject.c
@@ -61,7 +61,8 @@ free_list_items(PyObject** items, bool use_qsbr)
#ifdef Py_GIL_DISABLED
_PyListArray *array = _Py_CONTAINER_OF(items, _PyListArray, ob_item);
if (use_qsbr) {
_PyMem_FreeDelayed(array);
size_t size = sizeof(_PyListArray) + array->allocated * sizeof(PyObject *);
_PyMem_FreeDelayed(array, size);
}
else {
PyMem_Free(array);
20 changes: 13 additions & 7 deletions Objects/obmalloc.c
@@ -139,7 +139,10 @@ _PyMem_mi_page_maybe_free(mi_page_t *page, mi_page_queue_t *pq, bool force)

_PyMem_mi_page_clear_qsbr(page);
page->retire_expire = 0;
page->qsbr_goal = _Py_qsbr_deferred_advance(tstate->qsbr);

size_t bsize = mi_page_block_size(page);
page->qsbr_goal = _Py_qsbr_deferred_advance_for_page(tstate->qsbr, page->capacity * bsize);

llist_insert_tail(&tstate->mimalloc.page_list, &page->qsbr_node);
return false;
}
@@ -1142,7 +1145,7 @@ free_work_item(uintptr_t ptr, delayed_dealloc_cb cb, void *state)
}

static void
free_delayed(uintptr_t ptr)
free_delayed(uintptr_t ptr, size_t size)
{
#ifndef Py_GIL_DISABLED
free_work_item(ptr, NULL, NULL);
@@ -1200,23 +1203,23 @@ free_delayed(uintptr_t ptr)
}

assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
uint64_t seq = _Py_qsbr_deferred_advance_for_free(tstate->qsbr, size);
buf->array[buf->wr_idx].ptr = ptr;
buf->array[buf->wr_idx].qsbr_goal = seq;
buf->wr_idx++;

if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
if (_Py_qsbr_should_process(tstate->qsbr)) {
_PyMem_ProcessDelayed((PyThreadState *)tstate);
}
#endif
}

void
_PyMem_FreeDelayed(void *ptr)
_PyMem_FreeDelayed(void *ptr, size_t size)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed((uintptr_t)ptr);
free_delayed((uintptr_t)ptr, size);
}
}

@@ -1226,7 +1229,10 @@ _PyObject_XDecRefDelayed(PyObject *ptr)
{
assert(!((uintptr_t)ptr & 0x01));
if (ptr != NULL) {
free_delayed(((uintptr_t)ptr)|0x01);
// We use 0 as the size since we don't have an easy way to know the
// actual size. If we are freeing many objects, the write sequence
// will be advanced due to QSBR_DEFERRED_LIMIT.
free_delayed(((uintptr_t)ptr)|0x01, 0);
}
}
#endif
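The `0x01` bit distinguishes the two kinds of work items that share the delayed-free queue: allocations are at least 2-byte aligned (hence the asserts above), so the low pointer bit is free to carry a tag. A simplified sketch of the dispatch (the real `free_work_item` also supports a callback; the function name here is illustrative):

```c
/* Simplified rendering of how a queued work item is consumed. */
static void
handle_work_item(uintptr_t ptr)
{
    if (ptr & 0x01) {
        /* tagged: a deferred reference-count decrement */
        Py_DECREF((PyObject *)(ptr & ~(uintptr_t)0x01));
    }
    else {
        /* untagged: a deferred raw free */
        PyMem_Free((void *)ptr);
    }
}
```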
65 changes: 56 additions & 9 deletions Python/qsbr.c
@@ -41,9 +41,19 @@
// Starting size of the array of qsbr thread states
#define MIN_ARRAY_SIZE 8

// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
// the write sequence.
#define QSBR_DEFERRED_LIMIT 10
// For deferred advance on free: the number of deferred items before advancing
// the write sequence. This is based on WORK_ITEMS_PER_CHUNK. We ideally
// want to process a chunk before it overflows.
#define QSBR_DEFERRED_LIMIT 127

// If the deferred memory exceeds 1 MiB, force an advance of the shared
// QSBR write sequence to limit excess memory usage.
#define QSBR_FREE_MEM_LIMIT (1024 * 1024)

// If more than this amount of memory is held in mimalloc pages deferred
// from collection, advance the write sequence. Advancing allows these
// pages to be reused in a different thread or for a different size class.
#define QSBR_PAGE_MEM_LIMIT (4096 * 10)

// Allocate a QSBR thread state from the freelist
static struct _qsbr_thread_state *
@@ -113,17 +123,54 @@ _Py_qsbr_advance(struct _qsbr_shared *shared)
// NOTE: with 64-bit sequence numbers, we don't have to worry too much
// about the wr_seq getting too far ahead of rd_seq, but if we ever use
// 32-bit sequence numbers, we'll need to be more careful.
return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR) + QSBR_INCR;
return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR);
Review discussion:

Contributor:
Why was this modified?

Member (author):
I think using "current value + QSBR_INCR" is unnecessarily conservative. We have just advanced the write sequence. If the caller of free_delayed() has removed the memory from the structure (which it should have), using the current (newly advanced) wr_seq value as the goal is safe. That will free the memory a little faster. This probably doesn't make a large difference, but with the previous code we are waiting one extra advance, and I don't think that's required in order to be safe.

Contributor:
_Py_atomic_add_uint64 returns the previous value, not the result of the addition (matching the C11 behavior of atomic_fetch_add), so this change means that it's using the old value of wr_seq as the goal.
}
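The point the reviewer raises is the C11 fetch-add contract: the return value is the pre-increment value, so the post-advance sequence requires an explicit `+ QSBR_INCR`. A standalone demonstration (plain C11, independent of CPython internals):

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    _Atomic uint64_t wr_seq = 10;
    /* atomic_fetch_add returns the value *before* the addition */
    uint64_t old = atomic_fetch_add(&wr_seq, 2);
    printf("old=%llu new=%llu\n", (unsigned long long)old,
           (unsigned long long)atomic_load(&wr_seq));  /* old=10 new=12 */
    /* So "fetch_add(...) + INCR" yields the post-advance sequence value,
     * while a bare "fetch_add(...)" yields the pre-advance one. */
    return 0;
}
```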

uint64_t
_Py_qsbr_deferred_advance_for_page(struct _qsbr_thread_state *qsbr, size_t page_size)
{
qsbr->deferred_page_memory += page_size;
if (qsbr->deferred_page_memory > QSBR_PAGE_MEM_LIMIT) {
qsbr->deferred_page_memory = 0;
// Advance the write sequence and return the updated value as the goal.
return _Py_qsbr_advance(qsbr->shared);
}
// Don't advance, return the next sequence value as the goal.
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
}

uint64_t
_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
_Py_qsbr_deferred_advance_for_free(struct _qsbr_thread_state *qsbr, size_t free_size)
{
qsbr->deferred_count++;
qsbr->deferred_memory += free_size;
if (qsbr->deferred_count >= QSBR_DEFERRED_LIMIT ||
qsbr->deferred_memory > QSBR_FREE_MEM_LIMIT) {
qsbr->deferred_count = 0;
qsbr->deferred_memory = 0;
// Advance the write sequence
uint64_t seq = _Py_qsbr_advance(qsbr->shared);
if (qsbr->process_seq == 0) {
// Process the queue of deferred frees when the read sequence
// reaches this value. We don't process immediately because
// we want to give readers a chance to advance their sequence.
qsbr->process_seq = seq;
}
// Return current (just advanced) sequence as the goal.
return seq;
}
// Don't advance, return the next sequence value as the goal.
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
}

bool
_Py_qsbr_should_process(struct _qsbr_thread_state *qsbr)
{
if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
if (qsbr->process_seq == 0 || qsbr->seq < qsbr->process_seq) {
return false;
}
qsbr->deferrals = 0;
return _Py_qsbr_advance(qsbr->shared);
qsbr->process_seq = 0;
return true;
}

static uint64_t
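Finally, a toy single-threaded model of the advance-on-free heuristic added in this file. The thresholds are copied from the patch and QSBR_INCR is 2 as in pycore_qsbr.h; the harness itself is illustrative, not CPython code:

```c
/* Toy model: each deferred free either reuses the upcoming sequence value
 * as its goal or, once enough items or bytes accumulate, forces an advance. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define QSBR_INCR            2            /* sequence step, as in pycore_qsbr.h */
#define QSBR_DEFERRED_LIMIT  127
#define QSBR_FREE_MEM_LIMIT  (1024 * 1024)

static uint64_t wr_seq = 1;
static int deferred_count;
static size_t deferred_memory;

static uint64_t
goal_for_free(size_t free_size)
{
    deferred_count++;
    deferred_memory += free_size;
    if (deferred_count >= QSBR_DEFERRED_LIMIT
        || deferred_memory > QSBR_FREE_MEM_LIMIT) {
        deferred_count = 0;
        deferred_memory = 0;
        return wr_seq += QSBR_INCR;       /* advance; goal is the new value */
    }
    return wr_seq + QSBR_INCR;            /* no advance; goal is the next value */
}

int
main(void)
{
    /* A single 2 MiB free trips the memory limit immediately... */
    printf("goal=%llu\n", (unsigned long long)goal_for_free(2u << 20));
    /* ...while small frees only force an advance every 127th call. */
    for (int i = 0; i < 200; i++)
        goal_for_free(64);
    printf("wr_seq=%llu\n", (unsigned long long)wr_seq);
    return 0;
}
```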