Skip to content

Commit

Permalink
Earley: share nodes created by the scanner with the completer
Browse files Browse the repository at this point in the history
  • Loading branch information
chanicpanic committed Aug 17, 2024
1 parent acfe33d commit 12b3a77
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 29 deletions.
28 changes: 11 additions & 17 deletions lark/parsers/earley.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matc
self.term_matcher = term_matcher


def predict_and_complete(self, i, to_scan, columns, transitives):
def predict_and_complete(self, i, to_scan, columns, transitives, node_cache):
"""The core Earley Predictor and Completer.
At each stage of the input, we handle any completed items (things
Expand All @@ -84,7 +84,6 @@ def predict_and_complete(self, i, to_scan, columns, transitives):
non-terminals are recursively processed until we reach a set of terminals,
which can be added to the scan list for the next scanner cycle.
# Held Completions (H in E. Scott's paper).
node_cache = {}
held_completions = {}

column = columns[i]
Expand Down Expand Up @@ -203,7 +202,7 @@ def scan(i, token, to_scan):
for item in self.Set(to_scan):
if match(item.expect, token):
new_item = item.advance()
label = (new_item.s, new_item.start, i)
label = (new_item.s, new_item.start, i + 1)
# 'terminals' may not contain token.type when using %declare
# Additionally, token is not always a Token
# For example, it can be a Tree when using TreeMatcher
Expand All @@ -227,7 +226,7 @@ def scan(i, token, to_scan):
expect = {i.expect.name for i in to_scan}
raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan))

return next_to_scan
return next_to_scan, node_cache


# Define parser functions
Expand All @@ -245,16 +244,17 @@ def scan(i, token, to_scan):
# step.
expects = {i.expect for i in to_scan}
i = 0
node_cache = {}
for token in lexer.lex(expects):
self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

to_scan = scan(i, token, to_scan)
to_scan, node_cache = scan(i, token, to_scan)
i += 1

expects.clear()
expects |= {i.expect for i in to_scan}

self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

## Column is now the final column in the parse.
assert i == len(columns)-1
Expand Down Expand Up @@ -294,24 +294,18 @@ def parse(self, lexer, start):
except ImportError:
logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image")
else:
for i, s in enumerate(solutions):
debug_walker.visit(s, f"sppf{i}.png")
debug_walker.visit(solutions[0], "sppf.png")

if len(solutions) > 1:
assert False, 'Earley should not generate multiple start symbol items!'

if self.Tree is not None:
# Perform our SPPF -> AST conversion
# Disable the ForestToParseTree cache when ambiguity='resolve'
# to prevent a tree construction bug. See issue #1283
use_cache = not self.resolve_ambiguity
transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache)
solutions = [transformer.transform(s) for s in solutions]

if len(solutions) > 1 and not self.resolve_ambiguity:
t: Tree = self.Tree('_ambig', solutions)
t.expand_kids_by_data('_ambig') # solutions may themselves be _ambig nodes
return t
return solutions[0]
return transformer.transform(solutions[0])

# return the root of the SPPF
# TODO return a list of solutions, or join them together somehow
return solutions[0]
9 changes: 5 additions & 4 deletions lark/parsers/xearley.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def scan(i, to_scan):
considered_rules=considered_rules
)

return next_to_scan
return next_to_scan, node_cache


delayed_matches = defaultdict(list)
Expand All @@ -146,10 +146,11 @@ def scan(i, to_scan):
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
i = 0
node_cache = {}
for token in stream:
self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

to_scan = scan(i, to_scan)
to_scan, node_cache = scan(i, to_scan)

if token == '\n':
text_line += 1
Expand All @@ -158,7 +159,7 @@ def scan(i, to_scan):
text_column += 1
i += 1

self.predict_and_complete(i, to_scan, columns, transitives)
self.predict_and_complete(i, to_scan, columns, transitives, node_cache)

## Column is now the final column in the parse.
assert i == len(columns)-1
Expand Down
13 changes: 5 additions & 8 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,14 +836,14 @@ def test_multiple_start_solutions(self):
tree = l.parse('x')

expected = Tree('_ambig', [
Tree('start', [Tree('a', ['x'])]),
Tree('start', ['x']),
Tree('start', [Tree('a', ['x'])])]
)
])
self.assertEqual(tree, expected)

l = Lark(grammar, ambiguity='resolve', lexer=LEXER)
tree = l.parse('x')
assert tree == Tree('start', ['x'])
assert tree == Tree('start', [Tree('a', ['x'])])


def test_cycle(self):
Expand Down Expand Up @@ -872,10 +872,7 @@ def test_cycle2(self):
tree = l.parse("ab")
expected = (
Tree('start', [
Tree('_ambig', [
Tree('v', [Tree('v', [])]),
Tree('v', [Tree('v', [Tree('v', [])])])
])
Tree('v', [Tree('v', [])]),
])
)
self.assertEqual(tree, expected)
Expand Down Expand Up @@ -986,7 +983,7 @@ def test_consistent_derivation_order1(self):
parser = Lark('''
start: a a
a: "." | b
b: "."
b.1: "."
''', lexer=LEXER)

tree = parser.parse('..')
Expand Down

0 comments on commit 12b3a77

Please sign in to comment.