From 52df553a7eeea5425544cd6405b8fee14e9841a4 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 16 Feb 2024 11:22:27 -0500 Subject: [PATCH] gh-112529: Make the GC scheduling thread-safe (#114880) The GC keeps track of the number of allocations (less deallocations) since the last GC. This buffers the count in thread-local state and uses atomic operations to modify the per-interpreter count. The thread-local buffering avoids contention on shared state. A consequence is that the GC scheduling is not as precise, so "test_sneaky_frame_object" is skipped because it requires that the GC be run exactly after allocating a frame object. --- Include/internal/pycore_gc.h | 7 ++++ Include/internal/pycore_tstate.h | 1 + Lib/test/test_frame.py | 3 +- Lib/test/test_gc.py | 1 + Modules/gcmodule.c | 10 +++++ Objects/typeobject.c | 2 + Python/gc_free_threading.c | 63 ++++++++++++++++++++++++-------- 7 files changed, 71 insertions(+), 16 deletions(-) diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index 582a16bf5218cef..a98864f74313987 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -260,6 +260,13 @@ struct _gc_runtime_state { Py_ssize_t long_lived_pending; }; +#ifdef Py_GIL_DISABLED +struct _gc_thread_state { + /* Thread-local allocation count. */ + Py_ssize_t alloc_count; +}; +#endif + extern void _PyGC_InitState(struct _gc_runtime_state *); diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 97aa85a659fa7b4..7fb9ab2056704e1 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -28,6 +28,7 @@ typedef struct _PyThreadStateImpl { PyThreadState base; #ifdef Py_GIL_DISABLED + struct _gc_thread_state gc; struct _mimalloc_thread_state mimalloc; struct _Py_object_freelists freelists; struct _brc_thread_state brc; diff --git a/Lib/test/test_frame.py b/Lib/test/test_frame.py index baed03d92b9e561..f88206de550da08 100644 --- a/Lib/test/test_frame.py +++ b/Lib/test/test_frame.py @@ -13,7 +13,7 @@ _testcapi = None from test import support -from test.support import threading_helper +from test.support import threading_helper, Py_GIL_DISABLED from test.support.script_helper import assert_python_ok @@ -294,6 +294,7 @@ def gen(): assert_python_ok("-c", code) @support.cpython_only + @unittest.skipIf(Py_GIL_DISABLED, "test requires precise GC scheduling") def test_sneaky_frame_object(self): def trace(frame, event, arg): diff --git a/Lib/test/test_gc.py b/Lib/test/test_gc.py index b01f344cb14a1a7..dd09643788d62fb 100644 --- a/Lib/test/test_gc.py +++ b/Lib/test/test_gc.py @@ -363,6 +363,7 @@ def __del__(self): # To minimize variations, though, we first store the get_count() results # and check them at the end. @refcount_test + @unittest.skipIf(Py_GIL_DISABLED, 'needs precise allocation counts') def test_get_count(self): gc.collect() a, b, c = gc.get_count() diff --git a/Modules/gcmodule.c b/Modules/gcmodule.c index a2b66b9b78c169d..961165e16a0fee9 100644 --- a/Modules/gcmodule.c +++ b/Modules/gcmodule.c @@ -201,6 +201,16 @@ gc_get_count_impl(PyObject *module) /*[clinic end generated code: output=354012e67b16398f input=a392794a08251751]*/ { GCState *gcstate = get_gc_state(); + +#ifdef Py_GIL_DISABLED + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET(); + struct _gc_thread_state *gc = &tstate->gc; + + // Flush the local allocation count to the global count + _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count); + gc->alloc_count = 0; +#endif + return Py_BuildValue("(iii)", gcstate->generations[0].count, gcstate->generations[1].count, diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 0118ee255ef0177..2e25c207c643829 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -1835,6 +1835,8 @@ _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems) if (presize) { ((PyObject **)alloc)[0] = NULL; ((PyObject **)alloc)[1] = NULL; + } + if (PyType_IS_GC(type)) { _PyObject_GC_Link(obj); } memset(obj, '\0', size); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 3dc1dc19182eb41..a758c99285a5390 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -23,6 +23,11 @@ typedef struct _gc_runtime_state GCState; # define GC_DEBUG #endif +// Each thread buffers the count of allocated objects in a thread-local +// variable up to +/- this amount to reduce the overhead of updating +// the global count. +#define LOCAL_ALLOC_COUNT_THRESHOLD 512 + // Automatically choose the generation that needs collecting. #define GENERATION_AUTO (-1) @@ -959,6 +964,41 @@ gc_should_collect(GCState *gcstate) gcstate->generations[1].threshold == 0); } +static void +record_allocation(PyThreadState *tstate) +{ + struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc; + + // We buffer the allocation count to avoid the overhead of atomic + // operations for every allocation. + gc->alloc_count++; + if (gc->alloc_count >= LOCAL_ALLOC_COUNT_THRESHOLD) { + // TODO: Use Py_ssize_t for the generation count. + GCState *gcstate = &tstate->interp->gc; + _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count); + gc->alloc_count = 0; + + if (gc_should_collect(gcstate) && + !_Py_atomic_load_int_relaxed(&gcstate->collecting)) + { + _Py_ScheduleGC(tstate->interp); + } + } +} + +static void +record_deallocation(PyThreadState *tstate) +{ + struct _gc_thread_state *gc = &((_PyThreadStateImpl *)tstate)->gc; + + gc->alloc_count--; + if (gc->alloc_count <= -LOCAL_ALLOC_COUNT_THRESHOLD) { + GCState *gcstate = &tstate->interp->gc; + _Py_atomic_add_int(&gcstate->generations[0].count, (int)gc->alloc_count); + gc->alloc_count = 0; + } +} + static void gc_collect_internal(PyInterpreterState *interp, struct collection_state *state) { @@ -981,6 +1021,9 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state) } } + // Record the number of live GC objects + interp->gc.long_lived_total = state->long_lived_total; + // Clear weakrefs and enqueue callbacks (but do not call them). clear_weakrefs(state); _PyEval_StartTheWorld(interp); @@ -1090,7 +1133,6 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) m = state.collected; n = state.uncollectable; - gcstate->long_lived_total = state.long_lived_total; if (gcstate->debug & _PyGC_DEBUG_STATS) { double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1); @@ -1530,15 +1572,7 @@ _Py_ScheduleGC(PyInterpreterState *interp) void _PyObject_GC_Link(PyObject *op) { - PyThreadState *tstate = _PyThreadState_GET(); - GCState *gcstate = &tstate->interp->gc; - gcstate->generations[0].count++; - - if (gc_should_collect(gcstate) && - !_Py_atomic_load_int_relaxed(&gcstate->collecting)) - { - _Py_ScheduleGC(tstate->interp); - } + record_allocation(_PyThreadState_GET()); } void @@ -1564,7 +1598,7 @@ gc_alloc(PyTypeObject *tp, size_t basicsize, size_t presize) ((PyObject **)mem)[1] = NULL; } PyObject *op = (PyObject *)(mem + presize); - _PyObject_GC_Link(op); + record_allocation(tstate); return op; } @@ -1646,10 +1680,9 @@ PyObject_GC_Del(void *op) PyErr_SetRaisedException(exc); #endif } - GCState *gcstate = get_gc_state(); - if (gcstate->generations[0].count > 0) { - gcstate->generations[0].count--; - } + + record_deallocation(_PyThreadState_GET()); + PyObject_Free(((char *)op)-presize); }