Skip to content

Commit

Permalink
Earley: share nodes created by the scanner with the completer
Browse files Browse the repository at this point in the history
  • Loading branch information
chanicpanic committed Aug 17, 2024
1 parent acfe33d commit 12b3a77
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 29 deletions.
28 changes: 11 additions & 17 deletions lark/parsers/earley.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matc
self.term_matcher = term_matcher


def predict_and_complete(self, i, to_scan, columns, transitives):
def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
"""The core Earley Predictor and Completer.
At each stage of the input, we handle any completed items (things
Expand All @@ -84,7 +84,6 @@ def predict_and_complete(self, i, to_scan, columns, transitives):
non-terminals are recursively processed until we reach a set of terminals,
which can be added to the scan list for the next scanner cycle.
# Held Completions (H in E. Scott's paper).
node_cache = {}
held_completions = {}

column = columns[i]
Expand Down Expand Up @@ -203,7 +202,7 @@ def scan(i, token, to_scan):
for item in self.Set(to_scan):
if match(item.expect, token):
new_item = item.advance()
label = (new_item.s, new_item.start, i)
label = (new_item.s, new_item.start, i + 1)
# 'terminals' may not contain token.type when using %declare
# Additionally, token is not always a Token
# For example, it can be a Tree when using TreeMatcher
Expand All @@ -227,7 +226,7 @@ def scan(i, token, to_scan):
expect = {i.expect.name for i in to_scan}
raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

return next_to_scan
return next_to_scan, node_cache


# Define parser functions
Expand All @@ -245,16 +244,17 @@ def scan(i, token, to_scan):
# step.
expects = {i.expect for i in to_scan}
i = 0
node_cache = {}
for token in lexer.lex(expects):
self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

to_scan = scan(i, token, to_scan)
to_scan, node_cache = scan(i, token, to_scan)
i += 1

expects.clear()
expects |= {i.expect for i in to_scan}

self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

## Column is now the final column in the parse.
assert i == len(columns)-1
Expand Down Expand Up @@ -294,24 +294,18 @@ def parse(self, lexer, start):
except ImportError:
logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
else:
for i, s in enumerate(solutions):
debug_walker.visit(s, f"sppf{i}.png")
debug_walker.visit(solutions[0], "sppf.png")

if len(solutions) > 1:
assert False, 'Earley should not generate multiple start symbol items!'

if self.Tree is not None:
# Perform our SPPF -> AST conversion
# Disable the ForestToParseTree cache when ambiguity='resolve'
# to prevent a tree construction bug. See issue #1283
use_cache = not self.resolve_ambiguity
transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
solutions = [transformer.transform(s) for s in solutions]

if len(solutions) > 1 and not self.resolve_ambiguity:
t: Tree = self.Tree('_ambig', solutions)
t.expand_kids_by_data('_ambig') # solutions may themselves be _ambig nodes
return t
return solutions[0]
return transformer.transform(solutions[0])

# return the root of the SPPF
# TODO return a list of solutions, or join them together somehow
return solutions[0]
9 changes: 5 additions & 4 deletions lark/parsers/xearley.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def scan(i, to_scan):
considered_rules=considered_rules
)

return next_to_scan
return next_to_scan, node_cache


delayed_matches = defaultdict(list)
Expand All @@ -146,10 +146,11 @@ def scan(i, to_scan):
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
i = 0
node_cache = {}
for token in stream:
self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

to_scan = scan(i, to_scan)
to_scan, node_cache = scan(i, to_scan)

if token == '\n':
text_line += 1
Expand All @@ -158,7 +159,7 @@ def scan(i, to_scan):
text_column += 1
i += 1

self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

## Column is now the final column in the parse.
assert i == len(columns)-1
Expand Down
13 changes: 5 additions & 8 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,14 +836,14 @@ def test_multiple_start_solutions(self):
tree = l.parse('x')

expected = Tree('_ambig', [
Tree('start', [Tree('a', ['x'])]),
Tree('start', ['x']),
Tree('start', [Tree('a', ['x'])])]
)
])
self.assertEqual(tree, expected)

l = Lark(grammar, ambiguity='resolve', lexer=LEXER)
tree = l.parse('x')
assert tree == Tree('start', ['x'])
assert tree == Tree('start', [Tree('a', ['x'])])


def test_cycle(self):
Expand Down Expand Up @@ -872,10 +872,7 @@ def test_cycle2(self):
tree = l.parse("ab")
expected = (
Tree('start', [
Tree('_ambig', [
Tree('v', [Tree('v', [])]),
Tree('v', [Tree('v', [Tree('v', [])])])
])
Tree('v', [Tree('v', [])]),
])
)
self.assertEqual(tree, expected)
Expand Down Expand Up @@ -986,7 +983,7 @@ def test_consistent_derivation_order1(self):
parser = Lark('''
start: a a
a: "." | b
b: "."
b.1: "."
''', lexer=LEXER)

tree = parser.parse('..')
Expand Down

0 comments on commit 12b3a77

Please sign in to comment.