From 9cced225ce1f17efe4f91804a757edee12ff2eb4 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Wed, 1 Jan 2025 20:32:14 +0000 Subject: [PATCH] gh-124363: Treat debug expressions in f-string as raw strings --- Parser/action_helpers.c | 78 ++++++++++++++++++----------------------- Parser/lexer/lexer.c | 6 ++-- 2 files changed, 37 insertions(+), 47 deletions(-) diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c index 5ac1dd7813689cd..d44fedaec535335 100644 --- a/Parser/action_helpers.c +++ b/Parser/action_helpers.c @@ -969,8 +969,6 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv) return result_token_with_metadata(p, conv, conv_token->metadata); } -static asdl_expr_seq * -unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions); ResultTokenWithMetadata * _PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset, int end_lineno, int end_col_offset, PyArena *arena) @@ -1251,7 +1249,6 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq static expr_ty _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) { assert(PyUnicode_CheckExact(constant->v.Constant.value)); - const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value); if (bstr == NULL) { return NULL; @@ -1279,9 +1276,9 @@ _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* tok p->arena); } -static asdl_expr_seq * -unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions) -{ +expr_ty +_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b) { + /* The parser might put multiple f-string values into an individual * JoinedStr node at the top level due to stuff like f-string debugging * expressions. This function flattens those and promotes them to the @@ -1289,44 +1286,14 @@ unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions) * of the regular output, so this is not necessary if you are not going * to expose the output AST to Python level. */ - Py_ssize_t i, req_size, raw_size; - - req_size = raw_size = asdl_seq_LEN(raw_expressions); - expr_ty expr; - for (i = 0; i < raw_size; i++) { - expr = asdl_seq_GET(raw_expressions, i); - if (expr->kind == JoinedStr_kind) { - req_size += asdl_seq_LEN(expr->v.JoinedStr.values) - 1; - } - } - - asdl_expr_seq *expressions = _Py_asdl_expr_seq_new(req_size, p->arena); - if (expressions == NULL) { - return NULL; - } - - Py_ssize_t raw_index, req_index = 0; - for (raw_index = 0; raw_index < raw_size; raw_index++) { - expr = asdl_seq_GET(raw_expressions, raw_index); - if (expr->kind == JoinedStr_kind) { - asdl_expr_seq *values = expr->v.JoinedStr.values; - for (Py_ssize_t n = 0; n < asdl_seq_LEN(values); n++) { - asdl_seq_SET(expressions, req_index, asdl_seq_GET(values, n)); - req_index++; - } - } else { - asdl_seq_SET(expressions, req_index, expr); - req_index++; + Py_ssize_t n_items = asdl_seq_LEN(expr); + Py_ssize_t total_items = n_items; + for (Py_ssize_t i = 0; i < n_items; i++) { + expr_ty item = asdl_seq_GET(expr, i); + if (item->kind == JoinedStr_kind) { + total_items += asdl_seq_LEN(item->v.JoinedStr.values) - 1; } } - return expressions; -} - -expr_ty -_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) { - - asdl_expr_seq *expr = unpack_top_level_joined_strs(p, raw_expressions); - Py_ssize_t n_items = asdl_seq_LEN(expr); const char* quote_str = PyBytes_AsString(a->bytes); if (quote_str == NULL) { @@ -1334,7 +1301,7 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b } int is_raw = strpbrk(quote_str, "rR") != NULL; - asdl_expr_seq *seq = _Py_asdl_expr_seq_new(n_items, p->arena); + asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena); if (seq == NULL) { return NULL; } @@ -1342,6 +1309,29 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b Py_ssize_t index = 0; for (Py_ssize_t i = 0; i < n_items; i++) { expr_ty item = asdl_seq_GET(expr, i); + + // This should correspond to a JoinedStr node of two elements + // created _PyPegen_formatted_value + if (item->kind == JoinedStr_kind) { + asdl_expr_seq *values = item->v.JoinedStr.values; + if (asdl_seq_LEN(values) != 2) { + PyErr_Format(PyExc_SystemError, + "unexpected JoinedStr node without debug data in f-string at line %d", + item->lineno); + return NULL; + } + + expr_ty first = asdl_seq_GET(values, 0); + assert(first->kind == Constant_kind); + asdl_seq_SET(seq, index++, first); + + expr_ty second = asdl_seq_GET(values, 1); + assert(second->kind == FormattedValue_kind); + asdl_seq_SET(seq, index++, second); + + continue; + } + if (item->kind == Constant_kind) { item = _PyPegen_decode_fstring_part(p, is_raw, item, b); if (item == NULL) { @@ -1360,7 +1350,7 @@ _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b } asdl_expr_seq *resized_exprs; - if (index != n_items) { + if (index != total_items) { resized_exprs = _Py_asdl_expr_seq_new(index, p->arena); if (resized_exprs == NULL) { return NULL; diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 8c868593f944c8c..ddb558066f6e31e 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -153,13 +153,13 @@ set_fstring_expr(struct tok_state* tok, struct token *token, char c) { } result[j] = '\0'; // Null-terminate the result string - res = PyUnicode_DecodeUTF8(result, j, NULL); + res = PyUnicode_DecodeUTF8Stateful(result, j, NULL, NULL); PyMem_Free(result); } else { - res = PyUnicode_DecodeUTF8( + res = PyUnicode_DecodeUTF8Stateful( tok_mode->last_expr_buffer, tok_mode->last_expr_size - tok_mode->last_expr_end, - NULL + NULL, NULL ); }