From 9f2f1dd131b912e224cd0269adde8879799686c4 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 30 Sep 2022 14:58:30 +0200 Subject: [PATCH] gh-94526: getpath_dirname() no longer encodes the path (#97645) Fix the Python path configuration used to initialized sys.path at Python startup. Paths are no longer encoded to UTF-8/strict to avoid encoding errors if it contains surrogate characters (bytes paths are decoded with the surrogateescape error handler). getpath_basename() and getpath_dirname() functions no longer encode the path to UTF-8/strict, but work directly on Unicode strings. These functions now use PyUnicode_FindChar() and PyUnicode_Substring() on the Unicode path, rather than strrchr() on the encoded bytes string. --- ...2-09-29-15-19-29.gh-issue-94526.wq5m6T.rst | 4 ++++ Modules/getpath.c | 23 +++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-09-29-15-19-29.gh-issue-94526.wq5m6T.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-29-15-19-29.gh-issue-94526.wq5m6T.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-29-15-19-29.gh-issue-94526.wq5m6T.rst new file mode 100644 index 00000000000000..59e389a64ee07d --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-09-29-15-19-29.gh-issue-94526.wq5m6T.rst @@ -0,0 +1,4 @@ +Fix the Python path configuration used to initialized :data:`sys.path` at +Python startup. Paths are no longer encoded to UTF-8/strict to avoid encoding +errors if it contains surrogate characters (bytes paths are decoded with the +surrogateescape error handler). Patch by Victor Stinner. diff --git a/Modules/getpath.c b/Modules/getpath.c index 94479887cf850a..be704adbde9468 100644 --- a/Modules/getpath.c +++ b/Modules/getpath.c @@ -82,27 +82,32 @@ getpath_abspath(PyObject *Py_UNUSED(self), PyObject *args) static PyObject * getpath_basename(PyObject *Py_UNUSED(self), PyObject *args) { - const char *path; - if (!PyArg_ParseTuple(args, "s", &path)) { + PyObject *path; + if (!PyArg_ParseTuple(args, "U", &path)) { return NULL; } - const char *name = strrchr(path, SEP); - return PyUnicode_FromString(name ? name + 1 : path); + Py_ssize_t end = PyUnicode_GET_LENGTH(path); + Py_ssize_t pos = PyUnicode_FindChar(path, SEP, 0, end, -1); + if (pos < 0) { + return Py_NewRef(path); + } + return PyUnicode_Substring(path, pos + 1, end); } static PyObject * getpath_dirname(PyObject *Py_UNUSED(self), PyObject *args) { - const char *path; - if (!PyArg_ParseTuple(args, "s", &path)) { + PyObject *path; + if (!PyArg_ParseTuple(args, "U", &path)) { return NULL; } - const char *name = strrchr(path, SEP); - if (!name) { + Py_ssize_t end = PyUnicode_GET_LENGTH(path); + Py_ssize_t pos = PyUnicode_FindChar(path, SEP, 0, end, -1); + if (pos < 0) { return PyUnicode_FromStringAndSize(NULL, 0); } - return PyUnicode_FromStringAndSize(path, (name - path)); + return PyUnicode_Substring(path, 0, pos); }