Skip to content

Commit

Permalink
gh-112529: Make the GC scheduling thread-safe (#114880)
Browse files Browse the repository at this point in the history
The GC keeps track of the number of allocations (less deallocations)
since the last GC. This buffers the count in thread-local state and uses
atomic operations to modify the per-interpreter count. The thread-local
buffering avoids contention on shared state.

A consequence is that the GC scheduling is not as precise, so
"test_sneaky_frame_object" is skipped because it requires that the GC be
run exactly after allocating a frame object.
  • Loading branch information
colesbury authored Feb 16, 2024
1 parent f92857a commit b24c916
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 16 deletions.
7 changes: 7 additions & 0 deletions Include/internal/pycore_gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,13 @@ struct _gc_runtime_state {
Py_ssize_t long_lived_pending;
};

#ifdef Py_GIL_DISABLED
struct _gc_thread_state {
/* Thread-local allocation count. */
Py_ssize_t alloc_count;
};
#endif


extern void _PyGC_InitState(struct _gc_runtime_state *);

Expand Down
1 change: 1 addition & 0 deletions Include/internal/pycore_tstate.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ typedef struct _PyThreadStateImpl {
PyThreadState base;

#ifdef Py_GIL_DISABLED
struct _gc_thread_state gc;
struct _mimalloc_thread_state mimalloc;
struct _Py_object_freelists freelists;
struct _brc_thread_state brc;
Expand Down
3 changes: 2 additions & 1 deletion Lib/test/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
_testcapi = None

from test import support
from test.support import threading_helper
from test.support import threading_helper, Py_GIL_DISABLED
from test.support.script_helper import assert_python_ok


Expand Down Expand Up @@ -294,6 +294,7 @@ def gen():
assert_python_ok("-c", code)

@support.cpython_only
@unittest.skipIf(Py_GIL_DISABLED, "test requires precise GC scheduling")
def test_sneaky_frame_object(self):

def trace(frame, event, arg):
Expand Down
1 change: 1 addition & 0 deletions Lib/test/test_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ def __del__(self):
# To minimize variations, though, we first store the get_count() results
# and check them at the end.
@refcount_test
@unittest.skipIf(Py_GIL_DISABLED, 'needs precise allocation counts')
def test_get_count(self):
gc.collect()
a, b, c = gc.get_count()
Expand Down
10 changes: 10 additions & 0 deletions Modules/gcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,16 @@ gc_get_count_impl(PyObject *module)
/*[clinic end generated code: output=354012e67b16398f input=a392794a08251751]*/
{
GCState *gcstate = get_gc_state();

#ifdef Py_GIL_DISABLED
_PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
struct _gc_thread_state *gc = &tstate->gc;

// Flush the local allocation count to the global count
_Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
gc->alloc_count = 0;
#endif

return Py_BuildValue("(iii)",
gcstate->generations[0].count,
gcstate->generations[1].count,
Expand Down
2 changes: 2 additions & 0 deletions Objects/typeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1835,6 +1835,8 @@ _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems)
if (presize) {
((PyObject **)alloc)[0] = NULL;
((PyObject **)alloc)[1] = NULL;
}
if (PyType_IS_GC(type)) {
_PyObject_GC_Link(obj);
}
memset(obj, '\0', size);
Expand Down
63 changes: 48 additions & 15 deletions Python/gc_free_threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ typedef struct _gc_runtime_state GCState;
# define GC_DEBUG
#endif

// Each thread buffers the count of allocated objects in a thread-local
// variable up to +/- this amount to reduce the overhead of updating
// the global count.
#define LOCAL_ALLOC_COUNT_THRESHOLD 512

// Automatically choose the generation that needs collecting.
#define GENERATION_AUTO (-1)

Expand Down Expand Up @@ -959,6 +964,41 @@ gc_should_collect(GCState *gcstate)
gcstate->generations[1].threshold == 0);
}

static void
record_allocation(PyThreadState *tstate)
{
struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;

// We buffer the allocation count to avoid the overhead of atomic
// operations for every allocation.
gc->alloc_count++;
if (gc->alloc_count >= LOCAL_ALLOC_COUNT_THRESHOLD) {
// TODO: Use Py_ssize_t for the generation count.
GCState *gcstate = &tstate->interp->gc;
_Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
gc->alloc_count = 0;

if (gc_should_collect(gcstate) &&
!_Py_atomic_load_int_relaxed(&gcstate->collecting))
{
_Py_ScheduleGC(tstate->interp);
}
}
}

static void
record_deallocation(PyThreadState *tstate)
{
struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc;

gc->alloc_count--;
if (gc->alloc_count <= -LOCAL_ALLOC_COUNT_THRESHOLD) {
GCState *gcstate = &tstate->interp->gc;
_Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count);
gc->alloc_count = 0;
}
}

static void
gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
{
Expand All @@ -981,6 +1021,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state)
}
}

// Record the number of live GC objects
interp->gc.long_lived_total = state->long_lived_total;

// Clear weakrefs and enqueue callbacks (but do not call them).
clear_weakrefs(state);
_PyEval_StartTheWorld(interp);
Expand Down Expand Up @@ -1090,7 +1133,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason)

m = state.collected;
n = state.uncollectable;
gcstate->long_lived_total = state.long_lived_total;

if (gcstate->debug & _PyGC_DEBUG_STATS) {
double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
Expand Down Expand Up @@ -1530,15 +1572,7 @@ _Py_ScheduleGC(PyInterpreterState *interp)
void
_PyObject_GC_Link(PyObject *op)
{
PyThreadState *tstate = _PyThreadState_GET();
GCState *gcstate = &tstate->interp->gc;
gcstate->generations[0].count++;

if (gc_should_collect(gcstate) &&
!_Py_atomic_load_int_relaxed(&gcstate->collecting))
{
_Py_ScheduleGC(tstate->interp);
}
record_allocation(_PyThreadState_GET());
}

void
Expand All @@ -1564,7 +1598,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize)
((PyObject **)mem)[1] = NULL;
}
PyObject *op = (PyObject *)(mem + presize);
_PyObject_GC_Link(op);
record_allocation(tstate);
return op;
}

Expand Down Expand Up @@ -1646,10 +1680,9 @@ PyObject_GC_Del(void *op)
PyErr_SetRaisedException(exc);
#endif
}
GCState *gcstate = get_gc_state();
if (gcstate->generations[0].count > 0) {
gcstate->generations[0].count--;
}

record_deallocation(_PyThreadState_GET());

PyObject_Free(((char *)op)-presize);
}

Expand Down

0 comments on commit b24c916

Please sign in to comment.