python · savannahostrowski · Jul 26, 2024 · Jun 25, 2024 · Jul 26, 2024 · Aug 12, 2024
@@ -259,7 +259,7 @@ struct _is {
     struct callable_cache callable_cache;
     _PyOptimizerObject *optimizer;
     _PyExecutorObject *executor_list_head;
-
+    size_t executors_created;
     _rare_events rare_events;
     PyDict_WatchCallback builtins_dict_watcher;
 

diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
@@ -31,7 +31,8 @@ typedef struct {
     uint8_t oparg;
     uint16_t valid:1;
     uint16_t linked:1;
-    uint16_t chain_depth:14;  // Must be big engough for MAX_CHAIN_DEPTH - 1.
+    bool was_run:1;
+    uint16_t chain_depth:13;  // Must be big enough for MAX_CHAIN_DEPTH - 1.
     int index;           // Index of ENTER_EXECUTOR (if code isn't NULL, below).
     _PyBloomFilter bloom;
     _PyExecutorLinkListNode links;
@@ -123,11 +124,16 @@ PyAPI_FUNC(PyObject *) _PyOptimizer_NewUOpOptimizer(void);
 #ifdef _Py_TIER2
 PyAPI_FUNC(void) _Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation);
 PyAPI_FUNC(void) _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation);
+PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
+
 #else
 #  define _Py_Executors_InvalidateDependency(A, B, C) ((void)0)
 #  define _Py_Executors_InvalidateAll(A, B) ((void)0)
+#  define _Py_Executors_InvalidateCold(A) ((void)0)
+
 #endif
 
+// Number of executors created (counted per interpreter in
+// interp->executors_created) that triggers a sweep of cold executors via
+// _Py_Executors_InvalidateCold(); the counter is reset after each sweep.
+#define JIT_CLEANUP_THRESHOLD 10
 
 // This is the length of the trace we project initially.
 #define UOP_MAX_TRACE_LENGTH 800

diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h
diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-08-27-21-44-14.gh-issue-116017.ZY3yBY.rst
@@ -0,0 +1,2 @@
+Improved JIT memory consumption by periodically freeing memory used by infrequently-executed code.
+This change is especially likely to improve the memory footprint of long-running programs.
@@ -4844,6 +4844,10 @@ dummy_func(
             assert(((_PyExecutorObject *)executor)->vm_data.valid);
         }
 
+        // Flag the current executor as having run. _Py_Executors_InvalidateCold
+        // invalidates executors whose was_run flag is still false and resets
+        // the flag on the ones it keeps.
+        tier2 op(_SET_EXECUTOR_RUN_STATE, (--)) {
+            current_executor->vm_data.was_run = true;
+        }
+
         tier2 op(_FATAL_ERROR, (--)) {
             assert(0);
             Py_FatalError("Fatal error uop executed.");

diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h
@@ -182,6 +182,12 @@ _PyOptimizer_Optimize(
     if (err <= 0) {
         return err;
     }
+
+    if (++interp->executors_created >= JIT_CLEANUP_THRESHOLD) {
+        interp->executors_created = 0;
+        _Py_Executors_InvalidateCold(interp);
+    }
+
     assert(*executor_ptr != NULL);
     if (progress_needed) {
         int index = get_index_for_executor(code, start);
@@ -565,6 +571,7 @@ translate_bytecode_to_trace(
             code->co_firstlineno,
             2 * INSTR_IP(initial_instr, code));
     ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code));
+    ADD_TO_TRACE(_SET_EXECUTOR_RUN_STATE, 0, 0, 0);
     uint32_t target = 0;
 
     for (;;) {
@@ -1194,6 +1201,9 @@ make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFil
     executor->jit_code = NULL;
     executor->jit_side_entry = NULL;
     executor->jit_size = 0;
+    // This is initialized to true so we can prevent the executor
+    // from being immediately detected as cold and invalidated.
+    executor->vm_data.was_run = true;
     if (_PyJIT_Compile(executor, executor->trace, length)) {
         Py_DECREF(executor);
         return NULL;
@@ -1657,6 +1667,49 @@ _Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
             OPT_STAT_INC(executors_invalidated);
         }
     }
+    interp->executors_created=0;
+}
+
+/* Invalidate every executor that has not run since the previous sweep.
+ *
+ * An executor whose `was_run` flag is false is treated as cold: it is
+ * unlinked from the interpreter's executor list and cleared. Executors whose
+ * flag is set are kept, and the flag is reset so that they must run again
+ * before the next sweep in order to survive it.
+ *
+ * If the scratch list cannot be allocated or grown, the pending exception is
+ * cleared and every executor is invalidated instead. */
+void
+_Py_Executors_InvalidateCold(PyInterpreterState *interp)
+{
+    /* Walk the list of executors */
+    /* TODO -- Use a tree to avoid traversing as many objects */
+    PyObject *invalidate = PyList_New(0);
+    if (invalidate == NULL) {
+        goto error;
+    }
+
+    /* Clearing an executor can deallocate others, so we need to make a list of
+     * executors to invalidate first */
+    for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
+        assert(exec->vm_data.valid);
+        _PyExecutorObject *next = exec->vm_data.links.next;
+
+        if (!exec->vm_data.was_run) {
+            /* Append before unlinking: if the append fails, this executor is
+             * still on the interpreter's list, so the invalidate-all fallback
+             * (which is expected to work from that list) does not lose it. */
+            if (PyList_Append(invalidate, (PyObject *)exec) < 0) {
+                goto error;
+            }
+            unlink_executor(exec);
+        }
+        else {
+            exec->vm_data.was_run = false;
+        }
+
+        exec = next;
+    }
+    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
+        _PyExecutorObject *exec = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i);
+        executor_clear(exec);
+    }
+    Py_DECREF(invalidate);
+    return;
+error:
+    PyErr_Clear();
+    if (invalidate != NULL) {
+        /* Executors collected before the failure have already been unlinked
+         * from the interpreter's list, so they must be cleared here; dropping
+         * the list alone would leave them valid but untracked. */
+        for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
+            _PyExecutorObject *exec = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i);
+            executor_clear(exec);
+        }
+        Py_DECREF(invalidate);
+    }
+    // If we're truly out of memory, wiping out everything is a fine fallback:
+    _Py_Executors_InvalidateAll(interp, 0);
 }
 
 #endif /* _Py_TIER2 */
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
@@ -660,6 +660,7 @@ init_interpreter(PyInterpreterState *interp,
 #ifdef _Py_TIER2
     (void)_Py_SetOptimizer(interp, NULL);
     interp->executor_list_head = NULL;
+    interp->executors_created = 0;
 #endif
     if (interp != &runtime->_main_interpreter) {
         /* Fix the self-referential, statically initialized fields. */
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Improved JIT memory consumption by periodically freeing memory used by infrequently-executed code.
		This change is especially likely to improve the memory footprint of long-running programs.