Expand and provide docs

astral-sh · Jun 14, 2024 · 90ca067 · 90ca067
1 parent 4e5ccde
commit 90ca067
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 10 deletions.
diff --git a/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py b/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py
@@ -38,7 +38,7 @@ def bar():
 
 
 # The parser tries to recover from an unclosed `]` when the current token is `)`. This
-# test is to make sure it emits a `NonLogicalNewline` token after `c`.
+# test is to make sure it emits a `NonLogicalNewline` token after `b`.
 if call(foo, [a,
     b
 )

diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs
@@ -1309,11 +1309,58 @@ impl<'src> Lexer<'src> {
 
     /// Re-lex the current token in the context of a logical line.
     ///
-    /// Returns a boolean indicating that whether the new current token is different than the
-    /// previous current token. This also means that the current position of the lexer has changed
-    /// and the caller is responsible for updating it's state accordingly.
+    /// Returns a boolean indicating whether the lexer's position has changed. This could result
+    /// into the new current token being different than the previous current token but is not
+    /// necessarily true. If the return value is `true` then the caller is responsible for updating
+    /// it's state accordingly.
     ///
     /// This method is a no-op if the lexer isn't in a parenthesized context.
+    ///
+    /// ## Explanation
+    ///
+    /// The lexer emits two different kinds of newline token based on the context. If it's in a
+    /// parenthesized context, it'll emit a [`NonLogicalNewline`] token otherwise it'll emit a
+    /// regular [`Newline`] token. Based on the type of newline token, the lexer will consume and
+    /// emit the indentation tokens appropriately which affects the structure of the code.
+    ///
+    /// For example:
+    /// ```py
+    /// if call(foo
+    ///     def bar():
+    ///         pass
+    /// ```
+    ///
+    /// Here, the lexer emits a [`NonLogicalNewline`] token after `foo` which means that the lexer
+    /// doesn't emit an `Indent` token before the `def` keyword. This leads to an AST which
+    /// considers the function `bar` as part of the module block and the `if` block remains empty.
+    ///
+    /// This method is to facilitate the parser if it recovers from these kind of scenarios so that
+    /// the lexer can then re-lex a [`NonLogicalNewline`] token to a [`Newline`] token which in
+    /// turn helps the parser to build the correct AST.
+    ///
+    /// In the above snippet, it would mean that this method would move the lexer back to the
+    /// newline character after the `foo` token and emit it as a [`Newline`] token instead of
+    /// [`NonLogicalNewline`]. This means that the next token emitted by the lexer would be an
+    /// `Indent` token.
+    ///
+    /// There are cases where the lexer's position will change but the re-lexed token will remain
+    /// the same. This is to help the parser to add the error message at an appropriate location.
+    /// Consider the following example:
+    ///
+    /// ```py
+    /// if call(foo, [a, b
+    ///     def bar():
+    ///         pass
+    /// ```
+    ///
+    /// Here, the parser recovers from two unclosed parenthesis. The inner unclosed `[` will call
+    /// into the re-lexing logic and reduce the nesting level from 2 to 1. And, the re-lexing logic
+    /// will move the lexer at the newline after `b` but still emit a [`NonLogicalNewline`] token.
+    /// Only after the parser recovers from the outer unclosed `(` does the re-lexing logic emit
+    /// the [`Newline`] token.
+    ///
+    /// [`Newline`]: TokenKind::Newline
+    /// [`NonLogicalNewline`]: TokenKind::NonLogicalNewline
     pub(crate) fn re_lex_logical_token(&mut self) -> bool {
         if self.nesting == 0 {
             return false;
@@ -1344,7 +1391,7 @@ impl<'src> Lexer<'src> {
         if new_position != current_position && has_newline {
             // Earlier we reduced the nesting level unconditionally. Now that we know the lexer's
             // position is going to be moved back, the lexer needs to be put back into a
-            // parenthesized context if the current token was a closing parenthesis.
+            // parenthesized context if the current token is a closing parenthesis.
             //
             // ```py
             // (a, [b,

diff --git a/crates/ruff_python_parser/src/parser/mod.rs b/crates/ruff_python_parser/src/parser/mod.rs
@@ -537,10 +537,11 @@ impl<'src> Parser<'src> {
             if recovery_context_kind.is_list_element(self) {
                 parse_element(self);
 
-                // Only unset this when we've completely parsed a single element.
-                if first_element {
-                    first_element = false;
-                }
+                // Only unset this when we've completely parsed a single element. This is mainly to
+                // raise the correct error in case the first element isn't valid and the current
+                // token isn't a comma. Without this knowledge, the parser would later expect a
+                // comma instead of raising the context error.
+                first_element = false;
 
                 let maybe_comma_range = self.current_token_range();
                 if self.eat(TokenKind::Comma) {

diff --git a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap
@@ -582,7 +582,7 @@ Module(
 
 
    |
-41 | # test is to make sure it emits a `NonLogicalNewline` token after `c`.
+41 | # test is to make sure it emits a `NonLogicalNewline` token after `b`.
 42 | if call(foo, [a,
 43 |     b
    |      ^ Syntax Error: Expected ']', found NonLogicalNewline