forked from python/cpython
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
pythongh-104169: Refactor tokenizer into lexer and wrappers (python#1…
…10684) * The lexer, which includes the actual lexeme-producing logic, goes into the `lexer` directory. * The wrappers, one wrapper per input mode (file, string, utf-8, and readline), go into the `tokenizer` directory and include logic for creating a lexer instance and managing the buffer for different modes. --------- Co-authored-by: Pablo Galindo <[email protected]> Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Showing
29 changed files
with
3,185 additions
and
2,988 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 4 additions & 0 deletions
4
Misc/NEWS.d/next/Core and Builtins/2023-10-11-12-48-03.gh-issue-104169.bPoX8u.rst
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Split the tokenizer into two separate directories: | ||
- One part includes the actual lexeme producing logic and lives in ``Parser/lexer``. | ||
- The second part wraps the lexer according to the different tokenization modes | ||
we have (string, utf-8, file, interactive, readline) and lives in ``Parser/tokenizer``. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#include "Python.h" | ||
#include "errcode.h" | ||
|
||
#include "state.h" | ||
|
||
/* Traverse and remember all f-string buffers, in order to be able to restore | ||
them after reallocating tok->buf */ | ||
void | ||
_PyLexer_remember_fstring_buffers(struct tok_state *tok) | ||
{ | ||
int index; | ||
tokenizer_mode *mode; | ||
|
||
for (index = tok->tok_mode_stack_index; index >= 0; --index) { | ||
mode = &(tok->tok_mode_stack[index]); | ||
mode->f_string_start_offset = mode->f_string_start - tok->buf; | ||
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf; | ||
} | ||
} | ||
|
||
/* Traverse and restore all f-string buffers after reallocating tok->buf */ | ||
void | ||
_PyLexer_restore_fstring_buffers(struct tok_state *tok) | ||
{ | ||
int index; | ||
tokenizer_mode *mode; | ||
|
||
for (index = tok->tok_mode_stack_index; index >= 0; --index) { | ||
mode = &(tok->tok_mode_stack[index]); | ||
mode->f_string_start = tok->buf + mode->f_string_start_offset; | ||
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset; | ||
} | ||
} | ||
|
||
/* Read a line of text from TOK into S, using the stream in TOK. | ||
Return NULL on failure, else S. | ||
On entry, tok->decoding_buffer will be one of: | ||
1) NULL: need to call tok->decoding_readline to get a new line | ||
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and | ||
stored the result in tok->decoding_buffer | ||
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room | ||
(in the s buffer) to copy entire contents of the line read | ||
by tok->decoding_readline. tok->decoding_buffer has the overflow. | ||
In this case, tok_readline_recode is called in a loop (with an expanded buffer) | ||
until the buffer ends with a '\n' (or until the end of the file is | ||
reached): see tok_nextc and its calls to tok_reserve_buf. | ||
*/ | ||
int | ||
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) | ||
{ | ||
Py_ssize_t cur = tok->cur - tok->buf; | ||
Py_ssize_t oldsize = tok->inp - tok->buf; | ||
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1); | ||
if (newsize > tok->end - tok->buf) { | ||
char *newbuf = tok->buf; | ||
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf; | ||
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf; | ||
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf; | ||
_PyLexer_remember_fstring_buffers(tok); | ||
newbuf = (char *)PyMem_Realloc(newbuf, newsize); | ||
if (newbuf == NULL) { | ||
tok->done = E_NOMEM; | ||
return 0; | ||
} | ||
tok->buf = newbuf; | ||
tok->cur = tok->buf + cur; | ||
tok->inp = tok->buf + oldsize; | ||
tok->end = tok->buf + newsize; | ||
tok->start = start < 0 ? NULL : tok->buf + start; | ||
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start; | ||
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start; | ||
_PyLexer_restore_fstring_buffers(tok); | ||
} | ||
return 1; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

/* Forward declaration so the prototypes below refer to the file-scope
   struct even when this header is included before the one defining it.
   Without it, the struct would be declared at prototype scope only. */
struct tok_state;

/* Store the f-string pointers of every tokenizer mode as offsets from
   tok->buf, so they can be rebuilt after tok->buf is reallocated. */
void _PyLexer_remember_fstring_buffers(struct tok_state *tok);

/* Rebuild the f-string pointers of every tokenizer mode from the stored
   offsets and the current tok->buf. */
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);

/* Grow tok->buf if needed so that `size` more bytes fit after tok->inp.
   Return 1 on success; on allocation failure set tok->done to E_NOMEM and
   return 0. */
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif  /* _LEXER_BUFFER_H_ */
Oops, something went wrong.