Skip to content

gh-116738: Make _codecs module thread-safe #117530

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions Include/internal/pycore_codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ extern "C" {
# error "this header requires Py_BUILD_CORE define"
#endif

#include "pycore_lock.h" // PyMutex

extern PyObject* _PyCodec_Lookup(const char *encoding);

/* Text codec specific encoding and decoding API.
Expand Down Expand Up @@ -48,6 +50,32 @@ extern PyObject* _PyCodecInfo_GetIncrementalEncoder(
PyObject *codec_info,
const char *errors);

// Per-interpreter state used by codecs.c.
struct codecs_state {
// A list of callable objects used to search for codecs.
PyObject *search_path;

// A dict mapping codec names to codecs returned from a callable in
// search_path.
PyObject *search_cache;

// A dict mapping error handling strategies to functions to implement them.
PyObject *error_registry;

#ifdef Py_GIL_DISABLED
// Used to safely delete a specific item from search_path.
PyMutex search_path_mutex;

// Used to synchronize initialization of the PyObject* members above.
PyMutex init_mutex;
#endif

// If an acquire load of initialized yields 1, all of the PyObject* members
// above will be set, and their values will not change until interpreter
// finalization. This allows common operations to freely read them without
// additional synchronization.
int initialized;
};

#ifdef __cplusplus
}
Expand Down
6 changes: 2 additions & 4 deletions Include/internal/pycore_interp.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ extern "C" {
#include "pycore_atexit.h" // struct atexit_state
#include "pycore_ceval_state.h" // struct _ceval_state
#include "pycore_code.h" // struct callable_cache
#include "pycore_codecs.h" // struct codecs_state
#include "pycore_context.h" // struct _Py_context_state
#include "pycore_crossinterp.h" // struct _xidregistry
#include "pycore_dict_state.h" // struct _Py_dict_state
Expand Down Expand Up @@ -164,10 +165,7 @@ struct _is {
possible to facilitate out-of-process observability
tools. */

PyObject *codec_search_path;
PyObject *codec_search_cache;
PyObject *codec_error_registry;
int codecs_initialized;
struct codecs_state codecs;

PyConfig config;
unsigned long feature_flags;
Expand Down
7 changes: 7 additions & 0 deletions Include/internal/pycore_pyatomic_ft_wrappers.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,25 @@ extern "C" {
#define FT_ATOMIC_LOAD_SSIZE(value) _Py_atomic_load_ssize(&value)
#define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) \
_Py_atomic_load_ssize_relaxed(&value)
#define FT_ATOMIC_LOAD_INT_ACQUIRE(value) _Py_atomic_load_int_acquire(&value)

#define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) \
_Py_atomic_store_ptr_relaxed(&value, new_value)
#define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) \
_Py_atomic_store_ptr_release(&value, new_value)
#define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) \
_Py_atomic_store_ssize_relaxed(&value, new_value)
#define FT_ATOMIC_STORE_INT_RELEASE(value, new_value) \
_Py_atomic_store_int_release(&value, new_value)
#else
#define FT_ATOMIC_LOAD_SSIZE(value) value
#define FT_ATOMIC_LOAD_SSIZE_RELAXED(value) value
#define FT_ATOMIC_LOAD_INT_ACQUIRE(value) value

#define FT_ATOMIC_STORE_PTR_RELAXED(value, new_value) value = new_value
#define FT_ATOMIC_STORE_PTR_RELEASE(value, new_value) value = new_value
#define FT_ATOMIC_STORE_SSIZE_RELAXED(value, new_value) value = new_value
#define FT_ATOMIC_STORE_INT_RELEASE(value, new_value) value = new_value
#endif

#ifdef __cplusplus
Expand Down
158 changes: 106 additions & 52 deletions Python/codecs.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ Copyright (c) Corporation for National Research Initiatives.
#include "Python.h"
#include "pycore_call.h" // _PyObject_CallNoArgs()
#include "pycore_interp.h" // PyInterpreterState.codec_search_path
#include "pycore_lock.h" // PyMutex
#include "pycore_pyatomic_ft_wrappers.h"
#include "pycore_pyerrors.h" // _PyErr_FormatNote()
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
Expand All @@ -30,13 +32,15 @@ const char *Py_hexdigits = "0123456789abcdef";

*/

static int _PyCodecRegistry_Init(void); /* Forward */
static int _PyCodecRegistry_EnsureInit(PyInterpreterState *); /* Forward */

int PyCodec_Register(PyObject *search_function)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
if (_PyCodecRegistry_EnsureInit(interp) < 0) {
goto onError;
}
PyObject *search_path = interp->codecs.search_path;
if (search_function == NULL) {
PyErr_BadArgument();
goto onError;
Expand All @@ -45,7 +49,14 @@ int PyCodec_Register(PyObject *search_function)
PyErr_SetString(PyExc_TypeError, "argument must be callable");
goto onError;
}
return PyList_Append(interp->codec_search_path, search_function);
#ifdef Py_GIL_DISABLED
PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
int ret = PyList_Append(search_path, search_function);
#ifdef Py_GIL_DISABLED
PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
return ret;

onError:
return -1;
Expand All @@ -55,22 +66,33 @@ int
PyCodec_Unregister(PyObject *search_function)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
PyObject *codec_search_path = interp->codec_search_path;
/* Do nothing if codec_search_path is not created yet or was cleared. */
if (codec_search_path == NULL) {
/* Do nothing if codec data structures are not created yet. */
if (FT_ATOMIC_LOAD_INT_ACQUIRE(interp->codecs.initialized) == 0) {
return 0;
}

PyObject *codec_search_path = interp->codecs.search_path;
assert(PyList_CheckExact(codec_search_path));
Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
for (Py_ssize_t i = 0; i < n; i++) {
PyObject *item = PyList_GET_ITEM(codec_search_path, i);
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
#ifdef Py_GIL_DISABLED
PyMutex_Lock(&interp->codecs.search_path_mutex);
#endif
PyObject *item = PyList_GetItemRef(codec_search_path, i);
int ret = 1;
if (item == search_function) {
if (interp->codec_search_cache != NULL) {
assert(PyDict_CheckExact(interp->codec_search_cache));
PyDict_Clear(interp->codec_search_cache);
}
return PyList_SetSlice(codec_search_path, i, i+1, NULL);
// We hold a reference to the item, so its destructor can't run
// while we hold search_path_mutex.
ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
}
#ifdef Py_GIL_DISABLED
PyMutex_Unlock(&interp->codecs.search_path_mutex);
#endif
Py_DECREF(item);
if (ret != 1) {
assert(interp->codecs.search_cache != NULL);
assert(PyDict_CheckExact(interp->codecs.search_cache));
PyDict_Clear(interp->codecs.search_cache);
return ret;
}
}
return 0;
Expand Down Expand Up @@ -132,7 +154,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}

PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
if (_PyCodecRegistry_EnsureInit(interp) < 0) {
return NULL;
}

Expand All @@ -147,7 +169,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)

/* First, try to lookup the name in the registry dictionary */
PyObject *result;
if (PyDict_GetItemRef(interp->codec_search_cache, v, &result) < 0) {
if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
goto onError;
}
if (result != NULL) {
Expand All @@ -156,7 +178,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}

/* Next, scan the search functions in order of registration */
const Py_ssize_t len = PyList_Size(interp->codec_search_path);
const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
if (len < 0)
goto onError;
if (len == 0) {
Expand All @@ -170,14 +192,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
for (i = 0; i < len; i++) {
PyObject *func;

func = PyList_GetItem(interp->codec_search_path, i);
func = PyList_GetItemRef(interp->codecs.search_path, i);
if (func == NULL)
goto onError;
result = PyObject_CallOneArg(func, v);
Py_DECREF(func);
if (result == NULL)
goto onError;
if (result == Py_None) {
Py_DECREF(result);
Py_CLEAR(result);
continue;
}
if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
Expand All @@ -188,15 +211,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}
break;
}
if (i == len) {
if (result == NULL) {
/* XXX Perhaps we should cache misses too ? */
PyErr_Format(PyExc_LookupError,
"unknown encoding: %s", encoding);
goto onError;
}

/* Cache and return the result */
if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
Py_DECREF(result);
goto onError;
}
Expand Down Expand Up @@ -600,13 +623,14 @@ PyObject *_PyCodec_DecodeText(PyObject *object,
int PyCodec_RegisterError(const char *name, PyObject *error)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
if (_PyCodecRegistry_EnsureInit(interp) < 0) {
return -1;
}
if (!PyCallable_Check(error)) {
PyErr_SetString(PyExc_TypeError, "handler must be callable");
return -1;
}
return PyDict_SetItemString(interp->codec_error_registry,
return PyDict_SetItemString(interp->codecs.error_registry,
name, error);
}

Expand All @@ -616,13 +640,14 @@ int PyCodec_RegisterError(const char *name, PyObject *error)
PyObject *PyCodec_LookupError(const char *name)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
if (_PyCodecRegistry_EnsureInit(interp) < 0) {
return NULL;
}

if (name==NULL)
name = "strict";
PyObject *handler;
if (PyDict_GetItemStringRef(interp->codec_error_registry, name, &handler) < 0) {
if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
return NULL;
}
if (handler == NULL) {
Expand Down Expand Up @@ -1375,7 +1400,7 @@ static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
return PyCodec_SurrogateEscapeErrors(exc);
}

static int _PyCodecRegistry_Init(void)
static int _PyCodecRegistry_EnsureInit(PyInterpreterState *interp)
{
static struct {
const char *name;
Expand Down Expand Up @@ -1463,45 +1488,74 @@ static int _PyCodecRegistry_Init(void)
}
};

PyInterpreterState *interp = _PyInterpreterState_GET();
PyObject *mod;

if (interp->codec_search_path != NULL)
if (FT_ATOMIC_LOAD_INT_ACQUIRE(interp->codecs.initialized) == 1) {
return 0;

interp->codec_search_path = PyList_New(0);
if (interp->codec_search_path == NULL) {
return -1;
}

interp->codec_search_cache = PyDict_New();
if (interp->codec_search_cache == NULL) {
return -1;
PyObject *search_path = NULL, *search_cache = NULL, *error_registry = NULL;
search_path = PyList_New(0);
if (search_path == NULL) {
goto error;
}

interp->codec_error_registry = PyDict_New();
if (interp->codec_error_registry == NULL) {
return -1;
search_cache = PyDict_New();
if (search_cache == NULL) {
goto error;
}
error_registry = PyDict_New();
if (error_registry == NULL) {
goto error;
}

for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
if (!func) {
return -1;
if (func == NULL) {
goto error;
}

int res = PyCodec_RegisterError(methods[i].name, func);
int res = PyDict_SetItemString(error_registry, methods[i].name, func);
Py_DECREF(func);
if (res) {
return -1;
if (res < 0) {
goto error;
}
}

mod = PyImport_ImportModule("encodings");
if (mod == NULL) {
return -1;
#ifdef Py_GIL_DISABLED
PyMutex_Lock(&interp->codecs.init_mutex);
#endif
int do_import = 1;
if (interp->codecs.initialized == 0) {
interp->codecs.search_path = search_path;
interp->codecs.search_cache = search_cache;
interp->codecs.error_registry = error_registry;
FT_ATOMIC_STORE_INT_RELEASE(interp->codecs.initialized, 1);
} else {
// Another thread initialized everything while we were preparing.
Py_DECREF(search_path);
Py_DECREF(search_cache);
Py_DECREF(error_registry);
do_import = 0;
}

// Importing `encodings' can execute arbitrary code and will call back into
// this module to register codec search functions. Do it once everything is
// initialized and we hold no locks. Other Python code may register other
// codecs before `encodings' is finished importing; this is true with or
// without the GIL.
#ifdef Py_GIL_DISABLED
PyMutex_Unlock(&interp->codecs.init_mutex);
#endif

if (do_import) {
PyObject *mod = PyImport_ImportModule("encodings");
if (mod == NULL) {
return -1;
}
Py_DECREF(mod);
}
Py_DECREF(mod);
interp->codecs_initialized = 1;
return 0;

error:
Py_XDECREF(search_path);
Py_XDECREF(search_cache);
Py_XDECREF(error_registry);
return -1;
}
6 changes: 3 additions & 3 deletions Python/pystate.c
Original file line number Diff line number Diff line change
Expand Up @@ -821,9 +821,9 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
}

PyConfig_Clear(&interp->config);
Py_CLEAR(interp->codec_search_path);
Py_CLEAR(interp->codec_search_cache);
Py_CLEAR(interp->codec_error_registry);
Py_CLEAR(interp->codecs.search_path);
Py_CLEAR(interp->codecs.search_cache);
Py_CLEAR(interp->codecs.error_registry);

assert(interp->imports.modules == NULL);
assert(interp->imports.modules_by_index == NULL);
Expand Down
Loading