diff --git a/Cargo.toml b/Cargo.toml index bd47796..faf4385 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,10 @@ wasm-bindgen = { version = "0.2.83", optional = true } [target.'cfg(not(target_family = "wasm"))'.dependencies] colored = { version = "2.1" } -[dev-dependencies] +[target.'cfg(target_family = "wasm")'.dev-dependencies] +wasm-bindgen-test = { version = "0.3.36" } + +[target.'cfg(not(target_family = "wasm"))'.dev-dependencies] criterion = { version = "0.5", features = ["html_reports"] } [features] diff --git a/README.md b/README.md index 8245c9b..1e02a74 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ * [5) Constructing ACTION and GOTO tables](#5-constructing-action-and-goto-tables) * [6) Tokenizing the input](#6-tokenizing-the-input) * [7) Parsing the tokens](#7-parsing-the-tokens) +* [Can I have symbols that can match to empty string?](#can-i-have-symbols-that-can-match-to-empty-string) * [Can I have an LALR(1) parser instead of an LR(1) parser?](#can-i-have-an-lalr1-parser-instead-of-an-lr1-parser) * [Any benchmarks?](#any-benchmarks) * [Can I modify it?](#can-i-modify-it) @@ -266,7 +267,7 @@ Yes, you can depend on the `dotlr` crate from [crates.io](https://crates.io/crat Paste the following to your `dependencies` section of your `Cargo.toml`: ```toml -dotlr = { version = "0.1", default-features = false } +dotlr = { version = "0.3", default-features = false } ``` ### Example @@ -366,8 +367,7 @@ And then do a step-by-step explanation of the parsing steps for the following in Few notes before starting: - `$` represents the end of input token -- symbols that can expand to empty string are not supported in `dotlr`, - which simplifies the explanations below (i.e., no rules like `S ->`) +- `ε` represents the empty token ### 1) Parsing the grammar @@ -383,6 +383,10 @@ The grammar object will consist of: - The symbol to parse \ (e.g., `E`) +- **empty_symbols (HashSet):** + - The set of symbols that can expand to empty string (i.e., `∀S: S -> '' ∈ rules`) \ + (e.g., `T`) + - **constant_tokens (HashSet):** - The set of constant tokens in the grammar \ (e.g., `{ '+', '*' }`) @@ -404,9 +408,16 @@ Now, we need to compute a set of tokens for each symbol in the grammar according following constraints: - For each `token ∈ FIRST(Symbol)`, at least one of the following conditions must hold: - - `Symbol -> token ... ∈ grammar.rules` - - `Symbol -> AnotherSymbol ... ∈ grammar.rules` **and** \ - `token ∈ FIRST(AnotherSymbol)` + - `Symbol -> '' ∈ grammar.rules` + **and** `token == ε` + + - `Symbol -> (EmptySymbol1 ... EmptySymbolN) ∈ grammar.rules` + **and** `token == ε` + + - `Symbol -> (EmptySymbol1 ... EmptySymbolN) token ... ∈ grammar.rules` + + - `Symbol -> (EmptySymbol1 ... EmptySymbolN) NonEmptySymbol ... ∈ grammar.rules` + **and** `token ∈ FIRST(NonEmptySymbol)` As for the implementation, here is a python-like pseudocode of the algorithm to compute FIRST sets: @@ -418,20 +429,47 @@ first_sets = {} while first_sets.has_changed(): # Iterate over the rules of the grammar for rule in grammar: - # If pattern of the rule starts with a token - if rule.pattern[0].is_token: - # S -> '+' ... <==> S can start with '+' - # -------------------------------------- - # Add the matching token to the FIRST set of the symbol of the rule - first_sets[rule.symbol].add(rule.pattern[0]) - - # If pattern of the rule starts with a symbol - elif rule.pattern[0].is_symbol: - # S -> E ... 
<==> S can start with anything E can start with
-            # ----------------------------------------------------------
-            # Add every token in the FIRST set of the matching symbol
-            # to the FIRST set of the symbol of the rule
-            first_sets[rule.symbol].extend(first_sets[rule.pattern[0]])
+        # If the rule has an empty pattern
+        if rule.is_empty_pattern:
+            # S -> '' <==> S can start with ε
+            # -------------------------------
+            # Add ε to the FIRST set of the symbol of the rule
+            first_sets[rule.symbol].add(ε)
+            # Skip to the next rule, as this rule is fully processed
+            continue
+
+        # Iterate over the atoms of the pattern of the rule
+        for atom in rule.pattern:
+            # If atom is a token
+            if atom.is_token:
+                # S -> (EmptySymbol1 ... EmptySymbolN) '+' ... <==> S can start with '+'
+                # ----------------------------------------------------------------------
+                # Add the matching token to the FIRST set of the symbol of the rule
+                first_sets[rule.symbol].add(atom)
+                # Break the processing of the rule
+                # as the rule cannot provide any other first tokens
+                break
+
+            # If atom is a symbol
+            elif atom.is_symbol:
+                # S -> (EmptySymbol1 ... EmptySymbolN) E ... <==> S can start with anything E can start with
+                # ------------------------------------------------------------------------------------------
+                # Add every token in the FIRST set of the matching symbol
+                # to the FIRST set of the symbol of the rule, except ε,
+                # which is treated in a special way through the loop
+                first_sets[rule.symbol].extend(first_sets[atom].exclude(ε))
+                # If the symbol cannot be empty (E -> '' ∉ grammar.rules)
+                if not atom.can_be_empty:
+                    # Break the processing of the rule
+                    # as the rule cannot provide any other first tokens
+                    break
+
+        # If the loop is not broken manually
+        else:
+            # S -> EmptySymbol1 ... EmptySymbolN <==> S can start with ε
+            # -----------------------------------------------------------
+            # Add ε to the FIRST set of the symbol of the rule
+            first_sets[rule.symbol].add(ε)
```

This is done in [src/tables.rs](https://github.com/umut-sahin/dotlr/blob/main/src/tables.rs).
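To make the ε handling above concrete, here is a self-contained Rust sketch (not part of the patch) of the same fixed-point computation, run on the optional grammar used later in this README. The `Atom` enum and plain string tokens are illustrative stand-ins for the crate's `AtomicPattern` and `Token` types, not its actual API.

```rust
use std::collections::{BTreeMap, BTreeSet};

// Illustrative stand-in for the crate's `AtomicPattern`.
#[derive(Clone, Copy)]
enum Atom {
    Symbol(&'static str),
    Token(&'static str),
}

fn main() {
    // P -> 'x' O 'z' ; O -> 'y' ; O -> '' (represented here as an empty slice)
    let rules: &[(&str, &[Atom])] = &[
        ("P", &[Atom::Token("x"), Atom::Symbol("O"), Atom::Token("z")]),
        ("O", &[Atom::Token("y")]),
        ("O", &[]),
    ];
    let empty_symbols: BTreeSet<&str> = BTreeSet::from(["O"]);

    let mut first: BTreeMap<&str, BTreeSet<&str>> = BTreeMap::new();
    let mut changed = true;
    while changed {
        changed = false;
        for &(symbol, pattern) in rules {
            let mut tokens = first.get(symbol).cloned().unwrap_or_default();
            let before = tokens.len();

            // S -> '' <==> S can start with ε
            if pattern.is_empty() {
                tokens.insert("ε");
            }
            for (index, atom) in pattern.iter().enumerate() {
                match *atom {
                    // A token contributes itself and ends the rule's contribution
                    Atom::Token(token) => {
                        tokens.insert(token);
                        break;
                    },
                    // A symbol contributes its FIRST set without ε
                    Atom::Symbol(next) => {
                        if let Some(next_first) = first.get(next) {
                            tokens.extend(next_first.iter().copied().filter(|t| *t != "ε"));
                        }
                        // A symbol that cannot be empty ends the rule's contribution
                        if !empty_symbols.contains(next) {
                            break;
                        }
                        // Every atom of the pattern can be empty, so add ε
                        if index == pattern.len() - 1 {
                            tokens.insert("ε");
                        }
                    },
                }
            }

            if tokens.len() != before {
                changed = true;
                first.insert(symbol, tokens);
            }
        }
    }

    // Prints FIRST(O) = { "y", "ε" } and FIRST(P) = { "x" }
    for (symbol, tokens) in &first {
        println!("FIRST({symbol}) = {tokens:?}");
    }
}
```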
@@ -456,16 +494,20 @@ Next, we need to compute another set of tokens for each symbol in the grammar according to the
 following constraints:
 
 - For each `token ∈ FOLLOW(Symbol)`, at least one of the following conditions must hold:
-  - `Symbol == grammar.start_symbol` **and** \
-    `token == $`
+  - `Symbol == grammar.start_symbol`
+    **and** `token == $`
+
+  - `Anything -> ... Symbol (EmptySymbol1 ... EmptySymbolN) token ... ∈ grammar.rules`
 
-  - `Anything -> ... Symbol token ... ∈ grammar.rules`
+  - `Anything -> ... Symbol (EmptySymbol1 ... EmptySymbolN) AnotherSymbol ... ∈ grammar.rules`
+    **and** `token ∈ FIRST(AnotherSymbol)`
+    **and** `token != ε`
 
-  - `Anything -> ... Symbol AnotherSymbol ... ∈ grammar.rules` **and** \
-    `token ∈ FIRST(AnotherSymbol)`
+  - `AnotherSymbol -> ... Symbol (EmptySymbol1 ... EmptySymbolN) ∈ grammar.rules`
+    **and** `token ∈ FOLLOW(AnotherSymbol)`
 
-  - `Symbol -> ... AnotherSymbol ∈ grammar.rules` **and** \
-    `token ∈ FOLLOW(AnotherSymbol)`
 
 As for the implementation, here is a python-like pseudocode of the algorithm to compute FOLLOW sets:
 
@@ -477,30 +519,49 @@ follow_sets = { grammar.start_symbol: { $ } }
 while follow_sets.has_changed():
     # Iterate over the rules of the grammar
     for rule in grammar:
-        # Iterate over the 2-windows of the pattern of the rule
+        # Iterate over the atoms of the pattern of the rule
        for i in range(len(rule.pattern) - 1):
-            # If the first atomic pattern is a symbol
+            # If the atom is a symbol
            if rule.pattern[i].is_symbol:
-                # And if the second atomic pattern is a token
-                if rule.pattern[i + 1].is_token:
-                    # S -> ... E '+' ... <==> '+' can follow E
-                    # ----------------------------------------
-                    # Add the matching token to the FOLLOW set of the matching symbol
-                    follow_sets[rule.pattern[i]].add(rule.pattern[i + 1])
-
-                # Or if the second atomic pattern is a symbol
-                elif rule.pattern[i + 1].is_symbol:
-                    # S -> ... E F ... <==> anything F can start with can follow E
-                    # ------------------------------------------------------------
-                    # Add every token in the FIRST set of the second symbol
-                    # to the FOLLOW set of the first symbol
-                    follow_sets[rule.pattern[i]].extend(first_sets[rule.pattern[i + 1]])
+                # Iterate over the remaining atoms
+                for j in range(i + 1, len(rule.pattern)):
+                    # If the next atom is a token
+                    if rule.pattern[j].is_token:
+                        # S -> ... E (EmptySymbol1 ... EmptySymbolN) '+' ... <==> '+' can follow E
+                        # -------------------------------------------------------------------------
+                        # Add the matching token to the FOLLOW set of the matching symbol
+                        follow_sets[rule.pattern[i]].add(rule.pattern[j])
+                        # Break the processing of the rule
+                        # as the rule cannot provide any other follow tokens
+                        break
+
+                    # Or if the next atom is a symbol
+                    elif rule.pattern[j].is_symbol:
+                        # S -> ... E (EmptySymbol1 ... EmptySymbolN) F ... <==> anything F can start with can follow E
+                        # ---------------------------------------------------------------------------------------------
+                        # Add every token in the FIRST set of the second symbol
+                        # to the FOLLOW set of the first symbol,
+                        # except ε, as it shouldn't be in any follow set
+                        follow_sets[rule.pattern[i]].extend(first_sets[rule.pattern[j]].exclude(ε))
+                        # If the symbol cannot be empty (F -> '' ∉ grammar.rules)
+                        if not rule.pattern[j].can_be_empty:
+                            # Break the processing of the rule
+                            # as the rule cannot provide any other follow tokens
+                            break
+
+                # If the loop is not broken manually
+                else:
+                    # A -> ... S (EmptySymbol1 ... EmptySymbolN) <==> anything that can follow A can follow S
+                    # ----------------------------------------------------------------------------------------
+                    # Add every token in the FOLLOW set of the symbol of the rule
+                    # to the FOLLOW set of the symbol of the atom
+                    follow_sets[rule.pattern[i]].extend(follow_sets[rule.symbol])
 
-        # If pattern of ends with a symbol
+        # If pattern of the rule ends with a symbol
        if rule.pattern[-1].is_symbol:
-            # S -> ... E <==> S can follow anything E can follow
-            # --------------------------------------------------
-            # Add every token in the FOLLOW set of the matching symbol
-            # to the FOLLOW set of the symbol of the rule
-            follow_sets[rule.symbol].extend(follow_sets[rule.patten[-1]])
+            # S -> ... E <==> anything that can follow S can follow E
+            # --------------------------------------------------------
+            # Add every token in the FOLLOW set of the symbol of the rule
+            # to the FOLLOW set of the symbol of the last atom
+            follow_sets[rule.pattern[-1]].extend(follow_sets[rule.symbol])
 ```
@@ -648,21 +709,31 @@ while len(states_to_process) > 0:
 
     # Compute transitions from the state to process.
     for item in state_to_process.items:
+        # If the rule of the item has an empty pattern
+        if item.rule.is_empty_pattern:
+            # S -> . 
ε <==> No need to create a transition to item S -> ε . + # -------------------------------------------------------------- + continue + # If dot is not at the end - if item.dot != len(item.rule.pattern): - # S -> ... . E ... <==> Seeing E would cause a transition to another state - # S -> ... . '+' ... <==> Seeing '+' would cause a transition to another state - # ---------------------------------------------------------------------------- - atomic_pattern_after_dot = item.rule.pattern[item.dot] + if item.dot == len(item.rule.pattern): + # S -> ... . <==> Can't create a transition as the dot is already at the end + # -------------------------------------------------------------------------- + continue + + # S -> ... . E ... <==> Seeing E would cause a transition to another state + # S -> ... . '+' ... <==> Seeing '+' would cause a transition to another state + # ---------------------------------------------------------------------------- + atomic_pattern_after_dot = item.rule.pattern[item.dot] - # If state to transition is not created yet, create an empty state for it. - if atomic_pattern_after_dot is not in transitions: - # Create an empty state to transition to - state_to_process.transitions[atomic_pattern_after_dot] = next_empty_state() + # If state to transition is not created yet, create an empty state for it. + if atomic_pattern_after_dot is not in transitions: + # Create an empty state to transition to + state_to_process.transitions[atomic_pattern_after_dot] = next_empty_state() - # Setup the kernel of the state to transition - state_to_transition = state_to_process.transitions[atomic_pattern_after_dot] - state_to_transition.items.push(item.shift_dot_to_right()) + # Setup the kernel of the state to transition + state_to_transition = state_to_process.transitions[atomic_pattern_after_dot] + state_to_transition.items.push(item.shift_dot_to_right()) # Add state to process to processed states, as we're done with it processed_states.push(state_to_process) @@ -720,11 +791,22 @@ Finally, we can compute ACTION and GOTO tables of the parser according the follo `item.rule.symbol == grammar.start_symbol` **and** \ `action == Accept(item.rule)` + - `Anything -> . ε | lookahead ∈ state.items` **and** \ + `token ∈ lookahead` **and** \ + `token == $` **and** \ + `item.rule.symbol == grammar.start_symbol` **and** \ + `action == Accept(item.rule)` + - `Anything -> ... . | lookahead ∈ state.items` **and** \ `token ∈ lookahead` **and** \ (`token != $` **or** `item.rule.symbol != grammar.start_symbol`) **and** \ `action == Reduce(item.rule)` + - `Anything -> . ε | lookahead ∈ state.items` **and** \ + `token ∈ lookahead` **and** \ + (`token != $` **or** `item.rule.symbol != grammar.start_symbol`) **and** \ + `action == Reduce(item.rule)` + - For each `goto ∈ GOTO(state, Symbol)`, at least one of the following conditions must hold: - `Anything -> ... . Symbol ... | lookahead ∈ state.items` **and** \ `goto == state.transitions[Symbol]` @@ -741,9 +823,10 @@ goto_table = {} for state in automaton.states: # Iterate over the items of the state for item in state.items: - # If dot is at the end of the item - if item.dot == len(item.rule.pattern): + # If dot is at the end of the item or rule of the item is empty pattern + if item.dot == len(item.rule.pattern) or item.rule.is_empty_pattern: # S -> ... . <==> We can either reduce the rule or accept if S is a start symbol + # S -> . 
ε <==> We can either reduce the rule or accept if S is a start symbol
+            # ------------------------------------------------------------------------------
 
             # We can only perform actions for the tokens in the follow set of the symbol of the rule
@@ -967,6 +1050,113 @@ E
 └─ 1
 ```
 
+## Can I have symbols that can match to empty string?
+
+Yes, empty symbols are supported!
+
+In the grammar definition, you can have rules like `P -> ''`
+to indicate that `P` can match the empty string.
+
+```
+P -> 'x' O 'z'
+
+O -> 'y'
+O -> ''
+```
+
+When used as the sole pattern, `''` marks the symbol as `can be empty`.
+Used anywhere else, `''` is ignored (e.g., `P -> A '' B` is parsed as `P -> A B`).
+
+The grammar above can parse both `x y z` and `x z`.
+
+```
++----------------------+
+|       Grammar        |
++----------------------+
+|  1) P -> 'x' O 'z'   |
+|  2) O -> 'y'         |
+|  3) O -> ε           |
++----------------------+
++--------+------------+------------+
+| Symbol | First Set  | Follow Set |
++--------+------------+------------+
+| P      | { 'x' }    | { $ }      |
++--------+------------+------------+
+| O      | { 'y', ε } | { 'z' }    |
++--------+------------+------------+
++-------+--------------------+------------+--------------+
+| State | Items              | Lookaheads | Transitions  |
++-------+--------------------+------------+--------------+
+| 0     | P -> . 'x' O 'z'   | { $ }      | 'x' -> 1     |
++-------+--------------------+------------+--------------+
+| 1     | P -> 'x' . O 'z'   | { $ }      | O -> 2       |
+|       | O -> . 'y'         | { 'z' }    | 'y' -> 3     |
+|       | O -> . ε           | { 'z' }    |              |
++-------+--------------------+------------+--------------+
+| 2     | P -> 'x' O . 'z'   | { $ }      | 'z' -> 4     |
++-------+--------------------+------------+--------------+
+| 3     | O -> 'y' .         | { 'z' }    |              |
++-------+--------------------+------------+--------------+
+| 4     | P -> 'x' O 'z' . 
| { $ } | | ++-------+--------------------+------------+--------------+ ++-------+--------------------------------+--------------+ +| | Action | Goto | +| State | ------------------------------ | ------------ | +| | 'x' 'z' 'y' $ | P O | ++-------+--------------------------------+--------------+ +| 0 | s1 - - - | - - | ++-------+--------------------------------+--------------+ +| 1 | - r3 s3 - | - 2 | ++-------+--------------------------------+--------------+ +| 2 | - s4 - - | - - | ++-------+--------------------------------+--------------+ +| 3 | - r2 - - | - - | ++-------+--------------------------------+--------------+ +| 4 | - - - a1 | - - | ++-------+--------------------------------+--------------+ + +> x y z + +P +├─ x +├─ O +│ └─ y +└─ z + ++------+-------------+--------------+-----------------+---------------------------+ +| Step | State Stack | Symbol Stack | Remaining Input | Action Taken | ++------+-------------+--------------+-----------------+---------------------------+ +| 0 | 0 | | 'x' 'y' 'z' $ | Shift 1 | ++------+-------------+--------------+-----------------+---------------------------+ +| 1 | 0 1 | 'x' | 'y' 'z' $ | Shift 3 | ++------+-------------+--------------+-----------------+---------------------------+ +| 2 | 0 1 3 | 'x' 'y' | 'z' $ | Reduce 2 (O -> 'y') | ++------+-------------+--------------+-----------------+---------------------------+ +| 3 | 0 1 2 | 'x' O | 'z' $ | Shift 4 | ++------+-------------+--------------+-----------------+---------------------------+ +| 4 | 0 1 2 4 | 'x' O 'z' | $ | Accept 1 (P -> 'x' O 'z') | ++------+-------------+--------------+-----------------+---------------------------+ + +> x z + +P +├─ x +├─ O +└─ z + ++------+-------------+--------------+-----------------+---------------------------+ +| Step | State Stack | Symbol Stack | Remaining Input | Action Taken | ++------+-------------+--------------+-----------------+---------------------------+ +| 0 | 0 | | 'x' 'z' $ | Shift 1 | ++------+-------------+--------------+-----------------+---------------------------+ +| 1 | 0 1 | 'x' | 'z' $ | Reduce 3 (O -> ε) | ++------+-------------+--------------+-----------------+---------------------------+ +| 2 | 0 1 2 | 'x' O | 'z' $ | Shift 4 | ++------+-------------+--------------+-----------------+---------------------------+ +| 3 | 0 1 2 4 | 'x' O 'z' | $ | Accept 1 (P -> 'x' O 'z') | ++------+-------------+--------------+-----------------+---------------------------+ +``` + ## Can I have an LALR(1) parser instead of an LR(1) parser? Yes, `dotlr` supports both LR(1) and LALR(1) parsers! 
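As an aside, here is a minimal end-to-end sketch (not part of the patch) that drives the optional grammar from the new README section through the public API. Every call used here (`Grammar::parse`, `Parser::lr`, `Parser::lalr`, `tokenize`, `parse`, `Tree::dump`) appears elsewhere in this diff; the `main` scaffolding is illustrative.

```rust
use dotlr::{Grammar, Parser};

fn main() {
    // The optional grammar from the README section above.
    let grammar = Grammar::parse("P -> 'x' O 'z'\n\nO -> 'y'\nO -> ''").unwrap();

    // Grammars with empty (ε) symbols work with both constructors;
    // `Parser::lalr(grammar)` would work just as well here.
    let parser = Parser::lr(grammar).unwrap();

    // `x y z` exercises `O -> 'y'`, while `x z` forces the `O -> ε` reduction.
    for input in ["x y z", "x z"] {
        let tokens = parser.tokenize(input).unwrap();
        let tree = parser.parse(tokens).unwrap();
        tree.dump();
    }
}
```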
diff --git a/assets/grammars/correct/g9.lr b/assets/grammars/correct/g9.lr new file mode 100644 index 0000000..9689445 --- /dev/null +++ b/assets/grammars/correct/g9.lr @@ -0,0 +1,16 @@ +P -> E + +E -> T Ep + +Ep -> '+' T Ep +Ep -> '' + +T -> F Tp + +Tp -> '*' F Tp +Tp -> '' + +F -> '(' E ')' +F -> %int + +%int -> /[0-9]+/ diff --git a/assets/grammars/correct/optional.lr b/assets/grammars/correct/optional.lr new file mode 100644 index 0000000..720c785 --- /dev/null +++ b/assets/grammars/correct/optional.lr @@ -0,0 +1,4 @@ +P -> O 'x' O 'z' + +O -> 'y' +O -> '' diff --git a/bindings/typescript/build.js b/bindings/typescript/build.js index b421af7..959546e 100644 --- a/bindings/typescript/build.js +++ b/bindings/typescript/build.js @@ -3,11 +3,20 @@ import fs from "fs/promises"; async function init() { console.log("Starting build..."); + await fs + .unlink("./src/pkg/dotlr_bg.wasm.d.ts") + .catch(() => console.warn("No dotlr_bg.wasm.d.ts found")); execSync("tsc", { stdio: "inherit" }); await fs.cp("./src/pkg", "./dist/pkg", { recursive: true }); - await fs.unlink("./dist/pkg/package.json"); - await fs.unlink("./dist/pkg/README.md"); - await fs.unlink("./dist/pkg/.gitignore"); + await fs + .unlink("./dist/pkg/package.json") + .catch(() => console.warn("No package.json found")); + await fs + .unlink("./dist/pkg/README.md") + .catch(() => console.warn("No README.md found")); + await fs + .unlink("./dist/pkg/.gitignore") + .catch(() => console.warn("No .gitignore found")); console.log("Build complete"); } diff --git a/bindings/typescript/src/types.ts b/bindings/typescript/src/types.ts index 2ecf769..526b0f5 100644 --- a/bindings/typescript/src/types.ts +++ b/bindings/typescript/src/types.ts @@ -40,6 +40,8 @@ export type Token = { value: R } | { type: 'Eof' +} | { + type: 'Empty' } //prettier-ignore @@ -153,14 +155,14 @@ export type Action = { } export type Span = { offset: number; - len: number; + length: number; column: number; line: number; }; export type Spanned = { span: Span; - value: T; + object: T; }; export type FirstTable = Map; diff --git a/bindings/typescript/src/utils.ts b/bindings/typescript/src/utils.ts index d7f223a..f3c67cf 100644 --- a/bindings/typescript/src/utils.ts +++ b/bindings/typescript/src/utils.ts @@ -12,6 +12,7 @@ import type { export function stringifyToken(token: Token, noApostrophes = false) { if (token.type === "Eof") return "$"; + if (token.type === "Empty") return "ε"; if (token.type === "Regex") return `%${token.value}`; if (token.type === "Constant") return noApostrophes ? 
token.value : `'${token.value}'`; @@ -86,7 +87,7 @@ export function stringifyTree( if (tree.type === "Terminal") { const { token, slice } = tree.value; - if (token.type !== "Eof") { + if (token.type !== "Eof" && token.type !== "Empty") { result += `${indent}${linePrefix}${token.value} [${slice}]\n`; } } else { diff --git a/examples/calculator.rs b/examples/calculator.rs index 90b0ebc..4ed07d0 100644 --- a/examples/calculator.rs +++ b/examples/calculator.rs @@ -92,7 +92,7 @@ fn evaluate(tree: Tree<'_>) -> f64 { _ => unreachable!(), } }, - Token::Constant(_) | Token::Eof => { + Token::Constant(_) | Token::Eof | Token::Empty => { unreachable!(); }, } diff --git a/examples/json.rs b/examples/json.rs index 9fd1ade..c339f71 100644 --- a/examples/json.rs +++ b/examples/json.rs @@ -38,7 +38,7 @@ impl From> for Value { _ => unreachable!(), } }, - Token::Eof => { + Token::Eof | Token::Empty => { unreachable!(); }, } diff --git a/src/automaton.rs b/src/automaton.rs index 3a68824..01aa8d9 100644 --- a/src/automaton.rs +++ b/src/automaton.rs @@ -1,7 +1,7 @@ use crate::prelude::*; -/// Item of a state of an LR(1) automaton. +/// Item of a state of an LR(1) or an LALR(1) automaton. #[cfg_attr(feature = "serde", derive(Serialize))] #[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))] #[derive(Clone, Debug, Eq, PartialEq)] @@ -60,7 +60,7 @@ impl Display for Item { } -/// State of an LR(1) automaton. +/// State of an LR(1) or an LALR(1) automaton. #[derive(Clone, Debug, Default, Eq)] #[cfg_attr(feature = "serde", derive(Serialize))] #[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))] @@ -159,7 +159,7 @@ impl State { fn compute_transitions(&self, state_counter: &mut usize) -> Vec<(AtomicPattern, State)> { let mut transitions = IndexMap::::new(); for item in self.items.iter() { - if item.dot == item.rule.pattern().len() { + if item.dot == item.rule.pattern().len() || item.rule().is_empty_pattern() { continue; } @@ -184,7 +184,7 @@ impl PartialEq for State { } -/// LR(1) automaton of a grammar. +/// LR(1) or LALR(1) automaton of a grammar. #[cfg_attr(feature = "wasm", wasm_bindgen)] #[cfg_attr(feature = "serde", derive(Serialize))] #[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))] @@ -256,7 +256,6 @@ impl Automaton { *transition_target = *transition_map.get(transition_target).unwrap(); } } - Automaton { states: final_states } } } diff --git a/src/errors.rs b/src/errors.rs index 0192bd4..f0edc55 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -84,37 +84,6 @@ pub enum ParserError { Conflict { parser: Box, state: usize, token: Token }, } -#[cfg(feature = "wasm")] -#[cfg_attr(feature = "wasm", wasm_bindgen)] -pub struct WasmParserError { - error: ParserError, -} - -#[cfg(feature = "wasm")] -impl WasmParserError { - pub fn new(error: ParserError) -> Self { - WasmParserError { error } - } -} - -#[cfg(feature = "wasm")] -#[cfg_attr(feature = "wasm", wasm_bindgen)] -impl WasmParserError { - pub fn to_string_wasm(&self) -> String { - format!("{}", self.error) - } - pub fn serialize(&self) -> Result { - serde_wasm_bindgen::to_value(&self.error).map_err(JsValue::from) - } - pub fn into_conflict_parser(self) -> Result { - match self.error { - //&Box - ParserError::Conflict { parser, .. } => Ok(*parser), - _ => Err(JsValue::from("Error is not a conflict")), - } - } -} - /// Parsing error of an input tried to be parsed with a parser. 
#[cfg_attr(feature = "serde", derive(Serialize))] @@ -161,3 +130,38 @@ pub enum ParsingError { )] UnexpectedEof { expected: SmallVec<[Token; 2]>, span: Span }, } + + +/// Parser error of a parser tried to be constructed from a grammar (WASM). +#[cfg(feature = "wasm")] +#[cfg_attr(feature = "wasm", wasm_bindgen)] +pub struct WasmParserError(ParserError); + +#[cfg(feature = "wasm")] +#[cfg_attr(feature = "wasm", wasm_bindgen)] +impl WasmParserError { + /// Prints the parser error to a string. + pub fn to_string_wasm(&self) -> String { + format!("{}", self.0) + } + + /// Serializes the parser error to a JavaScript value. + pub fn serialize(&self) -> Result { + serde_wasm_bindgen::to_value(&self.0).map_err(JsValue::from) + } + + /// Converts the parser error to the conflicted parser if error was a conflict error. + pub fn into_conflict_parser(self) -> Result { + match self.0 { + ParserError::Conflict { parser, .. } => Ok(*parser), + _ => Err(JsValue::from("ParserError is not a `Conflict` error")), + } + } +} + +#[cfg(feature = "wasm")] +impl From for WasmParserError { + fn from(error: ParserError) -> WasmParserError { + WasmParserError(error) + } +} diff --git a/src/grammar.rs b/src/grammar.rs index 5e29506..cfb38bb 100644 --- a/src/grammar.rs +++ b/src/grammar.rs @@ -88,6 +88,8 @@ impl> From for RegexToken { #[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))] #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub enum Token { + /// Empty token. + Empty, /// Constant token. Constant(ConstantToken), /// Regular expression token. @@ -99,6 +101,9 @@ pub enum Token { impl Display for Token { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { + Token::Empty => { + write!(f, "ε") + }, Token::Constant(constant_token) => { write!(f, "{}", constant_token) }, @@ -200,6 +205,11 @@ impl Rule { pub fn pattern(&self) -> &[AtomicPattern] { &self.pattern } + + /// Gets whether the rule is `S -> ''`. + pub fn is_empty_pattern(&self) -> bool { + self.pattern.len() == 1 && self.pattern[0] == AtomicPattern::Token(Token::Empty) + } } impl Display for Rule { @@ -221,13 +231,16 @@ impl Display for Rule { pub struct Grammar { symbols: IndexSet, start_symbol: Symbol, + empty_symbols: IndexSet, constant_tokens: IndexSet, - #[cfg_attr(feature = "serde", serde(serialize_with = "utils::serialize_regex_map"))] + #[cfg_attr( + feature = "serde", + serde(serialize_with = "utils::serialize_regex_token_to_regex_map") + )] regular_expressions: IndexMap, rules: Vec, } - impl Grammar { /// Creates a grammar from a grammar string. pub fn parse(grammar_string: &str) -> Result { @@ -235,18 +248,6 @@ impl Grammar { } } -#[cfg(feature = "wasm")] -#[cfg_attr(feature = "wasm", wasm_bindgen)] -impl Grammar { - pub fn parse_wasm(grammar_string: &str) -> Result { - match Grammar::parse(grammar_string) { - Ok(grammar) => Ok(grammar), - Err(error) => Err(serde_wasm_bindgen::to_value(&error)?), - } - } -} - - impl Grammar { /// Gets the symbols of the grammar. pub fn symbols(&self) -> &IndexSet { @@ -258,6 +259,11 @@ impl Grammar { &self.start_symbol } + /// Gets the empty symbols of the grammar. + pub fn empty_symbols(&self) -> &IndexSet { + &self.empty_symbols + } + /// Gets the constant tokens of the grammar. pub fn constant_tokens(&self) -> &IndexSet { &self.constant_tokens @@ -277,33 +283,59 @@ impl Grammar { #[cfg(feature = "wasm")] #[cfg_attr(feature = "wasm", wasm_bindgen)] impl Grammar { + /// Creates a grammar from a grammar string (WASM). 
+    pub fn parse_wasm(grammar_string: &str) -> Result<Grammar, JsValue> {
+        match Grammar::parse(grammar_string) {
+            Ok(grammar) => Ok(grammar),
+            Err(error) => Err(serde_wasm_bindgen::to_value(&error)?),
+        }
+    }
+}
+
+#[cfg(feature = "wasm")]
+#[cfg_attr(feature = "wasm", wasm_bindgen)]
+impl Grammar {
+    /// Gets the symbols of the grammar (WASM).
     pub fn symbols_wasm(&self) -> Result<JsValue, JsValue> {
         Ok(serde_wasm_bindgen::to_value(&self.symbols)?)
     }
+
+    /// Gets the start symbol of the grammar (WASM).
     pub fn start_symbol_wasm(&self) -> Result<JsValue, JsValue> {
         Ok(serde_wasm_bindgen::to_value(&self.start_symbol)?)
     }
+
+    /// Gets the rules of the grammar (WASM).
     pub fn rules_wasm(&self) -> Result<JsValue, JsValue> {
         Ok(serde_wasm_bindgen::to_value(&self.rules)?)
     }
+
+    /// Prints the grammar to a string (WASM).
     pub fn to_string_wasm(&self) -> String {
         self.to_string()
     }
+
+    /// Gets the constant tokens of the grammar (WASM).
     pub fn constant_tokens_wasm(&self) -> Result<JsValue, JsValue> {
         Ok(serde_wasm_bindgen::to_value(&self.constant_tokens)?)
     }
 
+    /// Gets the regular expressions of the grammar (WASM).
     pub fn regular_expressions_wasm(&self) -> Result<JsValue, JsValue> {
-        let index_map: IndexMap<RegexToken, String> =
-            self.regular_expressions.iter().map(|(k, v)| (k.clone(), v.to_string())).collect();
+        let index_map: IndexMap<RegexToken, String> = self
+            .regular_expressions
+            .iter()
+            .map(|(name, regex)| (name.clone(), regex.to_string()))
+            .collect();
         Ok(serde_wasm_bindgen::to_value(&index_map)?)
     }
+
+    /// Clones the grammar (WASM).
     pub fn clone_wasm(&self) -> Grammar {
         self.clone()
     }
 }
 
-
 impl Display for Grammar {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         for rule in self.rules.iter() {
@@ -423,6 +455,7 @@ mod grammar_parser {
 
         let mut symbols = IndexSet::new();
         let mut start_symbol = None;
+        let mut empty_symbols = IndexSet::new();
         let mut constant_tokens = IndexSet::new();
         let mut regular_expressions = IndexMap::new();
         let mut rules = Vec::new();
@@ -500,8 +533,12 @@ mod grammar_parser {
                             pattern.push(AtomicPattern::Symbol(symbol));
                         },
                         GrammarToken::ConstantToken(constant_token) => {
-                            constant_tokens.insert(constant_token.clone());
-                            pattern.push(AtomicPattern::Token(Token::Constant(constant_token)));
+                            if constant_token.is_empty() {
+                                pattern.push(AtomicPattern::Token(Token::Empty));
+                            } else {
+                                constant_tokens.insert(constant_token.clone());
+                                pattern.push(AtomicPattern::Token(Token::Constant(constant_token)));
+                            }
                         },
                         GrammarToken::RegexToken(regex_token) => {
                             pattern.push(AtomicPattern::Token(Token::Regex(regex_token)));
@@ -570,9 +607,22 @@ mod grammar_parser {
             },
         }
 
+        for rule in rules.iter_mut() {
+            if rule.pattern.as_slice() == [AtomicPattern::Token(Token::Empty)] {
+                empty_symbols.insert(rule.symbol.clone());
+                continue;
+            }
+
+            if rule.pattern.contains(&AtomicPattern::Token(Token::Empty)) {
+                rule.pattern
+                    .retain(|atomic_pattern| *atomic_pattern != AtomicPattern::Token(Token::Empty));
+            }
+        }
+
         Ok(Grammar {
             symbols,
             start_symbol: start_symbol.unwrap_or(Symbol::from("")),
+            empty_symbols,
             constant_tokens,
             regular_expressions,
             rules,
diff --git a/src/lib.rs b/src/lib.rs
index 3804b33..73df703 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -49,19 +49,6 @@ pub use {
     };
 
 mod prelude {
-    #[cfg(feature = "serde")]
-    pub use serde_renamed::Serialize;
-
-    #[cfg(feature = "wasm")]
-    pub use {
-        errors::WasmParserError,
-        wasm_bindgen::prelude::*,
-    };
-
-    #[cfg(not(target_family = "wasm"))]
-    pub use colored::*;
-    #[cfg(target_family = "wasm")]
-    pub use utils::MockColored;
     pub use {
         super::*,
         indexmap::{
@@ -102,13 +89,28 @@ mod prelude {
             Debug,
             Display,
         },
+        io::BufWriter,
         ops::Deref,
     },
thiserror::Error, }; + #[cfg(feature = "serde")] + pub use serde_renamed::{ + Serialize, + Serializer, + ser::SerializeMap, + }; + + #[cfg(feature = "wasm")] pub use { - serde_renamed::Serializer, - serde_renamed::ser::SerializeMap, + errors::WasmParserError, + wasm_bindgen::prelude::*, }; + + #[cfg(not(target_family = "wasm"))] + pub use colored::*; + + #[cfg(target_family = "wasm")] + pub use utils::MockColored; } diff --git a/src/parser.rs b/src/parser.rs index d006f0c..d159b58 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,6 +1,7 @@ use crate::prelude::*; -/// LR(1) parser of a grammar. + +/// LR(1) or LALR(1) parser of a grammar. #[cfg_attr(feature = "wasm", wasm_bindgen)] #[cfg_attr(feature = "serde", derive(Serialize))] #[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))] @@ -13,7 +14,6 @@ pub struct Parser { parsing_tables: ParsingTables, } - impl Parser { /// Crates an LR(1) parser of a grammar. pub fn lr(grammar: Grammar) -> Result { @@ -41,17 +41,6 @@ impl Parser { parser.check_conflicts_internal() } } -#[cfg(feature = "wasm")] -#[cfg_attr(feature = "wasm", wasm_bindgen)] -impl Parser { - pub fn new_wasm(grammar: Grammar) -> Result { - Parser::lr(grammar).map_err(WasmParserError::new) - } - pub fn new_lalr_wasm(grammar: Grammar) -> Result { - Parser::lalr(grammar).map_err(WasmParserError::new) - } -} - impl Parser { /// Gets the grammar of the parser. @@ -90,29 +79,6 @@ impl Parser { } } -#[cfg(feature = "wasm")] -#[cfg_attr(feature = "wasm", wasm_bindgen)] -impl Parser { - pub fn first_table_wasm(&self) -> Result { - Ok(serde_wasm_bindgen::to_value(&self.first_table)?) - } - pub fn follow_table_wasm(&self) -> Result { - Ok(serde_wasm_bindgen::to_value(&self.follow_table)?) - } - pub fn automaton_wasm(&self) -> Result { - Ok(serde_wasm_bindgen::to_value(&self.automaton)?) - } - pub fn parsing_tables_wasm(&self) -> Result { - Ok(serde_wasm_bindgen::to_value(&self.parsing_tables)?) - } - pub fn action_table_wasm(&self) -> Result { - Ok(serde_wasm_bindgen::to_value(&self.parsing_tables.action_table())?) - } - pub fn goto_table_wasm(&self) -> Result { - Ok(serde_wasm_bindgen::to_value(&self.parsing_tables.goto_table())?) - } -} - impl Parser { /// Tokenizes an input into a stream of tokens and their corresponding input slices. 
pub fn tokenize<'i>( @@ -129,7 +95,7 @@ impl Parser { let (initial_new_lines, initial_newline_offset) = utils::count_new_lines(&input[..offset]); let mut line = initial_new_lines + 1; let mut last_newline_offset = initial_newline_offset.unwrap_or(0); - let mut column = utils::count_col_position(&input[last_newline_offset..offset]); + let mut column = input[last_newline_offset..offset].chars().count() + 1; while !remaining_input.is_empty() { let mut matching_token = None; let mut matching_slice = ""; @@ -152,7 +118,7 @@ impl Parser { } if matching_token.is_none() { - let span = Span { offset, len: 1, line, column }; + let span = Span { offset, length: 1, line, column }; return Err(ParsingError::UnknownToken { token: format_smolstr!("{}", remaining_input.chars().next().unwrap()), span, @@ -161,7 +127,7 @@ impl Parser { let token = Spanned::new(matching_token.unwrap(), Span { offset, - len: matching_slice.len(), + length: matching_slice.len(), line, column, }); @@ -187,9 +153,9 @@ impl Parser { last_newline_offset = old_offset + whitespace_newline_offset; } // skip the newline character - column = utils::count_col_position(&input[last_newline_offset..offset]); + column = input[last_newline_offset..offset].chars().count() + 1; } - let eof = Spanned::new(Token::Eof, Span { offset, len: 0, line, column }); + let eof = Spanned::new(Token::Eof, Span { offset, length: 0, line, column }); tokens.push((eof, "\0")); Ok(tokens) @@ -212,44 +178,6 @@ impl Parser { } } -#[cfg(feature = "wasm")] -#[cfg_attr(feature = "wasm", wasm_bindgen)] -impl Parser { - pub fn tokenize_wasm(&self, input: &str) -> Result { - match self.tokenize(input) { - Ok(tokens) => Ok(serde_wasm_bindgen::to_value(&tokens)?), - Err(error) => Err(serde_wasm_bindgen::to_value(&error)?), - } - } - pub fn parse_wasm(&self, input: &str) -> Result { - let tokens = self.tokenize(input); - let tokens = match tokens { - Ok(tokens) => tokens, - Err(error) => return Err(serde_wasm_bindgen::to_value(&error)?), - }; - match self.parse(tokens) { - Ok(tree) => Ok(serde_wasm_bindgen::to_value(&tree)?), - Err(error) => Err(serde_wasm_bindgen::to_value(&error)?), - } - } - pub fn trace_wasm(&self, input: &str) -> Result, JsValue> { - let tokens = self.tokenize(input); - let tokens = match tokens { - Ok(tokens) => tokens, - Err(error) => return Err(serde_wasm_bindgen::to_value(&error)?), - }; - match self.trace(tokens) { - Ok((trace, tree)) => { - let trace = serde_wasm_bindgen::to_value(&trace)?; - let tree = serde_wasm_bindgen::to_value(&tree)?; - Ok(vec![trace, tree]) - }, - Err(error) => Err(serde_wasm_bindgen::to_value(&error)?), - } - } -} - - impl Parser { /// Internal grammar checks. fn check_grammar_internal(grammar: &Grammar) -> Result<(), ParserError> { @@ -315,7 +243,7 @@ impl Parser { let (mut current_token, mut current_slice) = remaining_tokens.pop().unwrap(); loop { let current_state = *state_stack.last().unwrap(); - let action_to_take = match self.action_table()[current_state].get(current_token.value()) + let action_to_take = match self.action_table()[current_state].get(current_token.deref()) { Some(actions) => { assert_eq!(actions.len(), 1); @@ -368,7 +296,8 @@ impl Parser { }, Action::Reduce { rule_index } => { let rule = &self.grammar.rules()[rule_index]; - let pattern_length = rule.pattern().len(); + let pattern_length = + if rule.is_empty_pattern() { 0 } else { rule.pattern().len() }; let symbol = rule.symbol().clone(); let pattern = @@ -394,7 +323,6 @@ impl Parser { } } - impl Parser { /// Dumps the parser to stdout. 
    pub fn dump(&self) {
@@ -659,3 +587,93 @@ impl Parser {
         }
     }
 }
+
+#[cfg(feature = "wasm")]
+#[cfg_attr(feature = "wasm", wasm_bindgen)]
+impl Parser {
+    /// Creates an LR(1) parser of a grammar (WASM).
+    pub fn new_wasm(grammar: Grammar) -> Result<Parser, WasmParserError> {
+        Ok(Parser::lr(grammar)?)
+    }
+
+    /// Creates an LALR(1) parser of a grammar (WASM).
+    pub fn new_lalr_wasm(grammar: Grammar) -> Result<Parser, WasmParserError> {
+        Ok(Parser::lalr(grammar)?)
+    }
+}
+
+#[cfg(feature = "wasm")]
+#[cfg_attr(feature = "wasm", wasm_bindgen)]
+impl Parser {
+    /// Gets the first table of the symbols in the grammar of the parser (WASM).
+    pub fn first_table_wasm(&self) -> Result<JsValue, JsValue> {
+        Ok(serde_wasm_bindgen::to_value(&self.first_table)?)
+    }
+
+    /// Gets the follow table of the symbols in the grammar of the parser (WASM).
+    pub fn follow_table_wasm(&self) -> Result<JsValue, JsValue> {
+        Ok(serde_wasm_bindgen::to_value(&self.follow_table)?)
+    }
+
+    /// Gets the automaton of the grammar of the parser (WASM).
+    pub fn automaton_wasm(&self) -> Result<JsValue, JsValue> {
+        Ok(serde_wasm_bindgen::to_value(&self.automaton)?)
+    }
+
+    /// Gets the parsing tables of the parser (WASM).
+    pub fn parsing_tables_wasm(&self) -> Result<JsValue, JsValue> {
+        Ok(serde_wasm_bindgen::to_value(&self.parsing_tables)?)
+    }
+
+    /// Gets the action table of the parser (WASM).
+    pub fn action_table_wasm(&self) -> Result<JsValue, JsValue> {
+        Ok(serde_wasm_bindgen::to_value(&self.parsing_tables.action_table())?)
+    }
+
+    /// Gets the goto table of the parser (WASM).
+    pub fn goto_table_wasm(&self) -> Result<JsValue, JsValue> {
+        Ok(serde_wasm_bindgen::to_value(&self.parsing_tables.goto_table())?)
+    }
+}
+
+#[cfg(feature = "wasm")]
+#[cfg_attr(feature = "wasm", wasm_bindgen)]
+impl Parser {
+    /// Tokenizes an input into a stream of tokens and their corresponding input slices (WASM).
+    pub fn tokenize_wasm(&self, input: &str) -> Result<JsValue, JsValue> {
+        match self.tokenize(input) {
+            Ok(tokens) => Ok(serde_wasm_bindgen::to_value(&tokens)?),
+            Err(error) => Err(serde_wasm_bindgen::to_value(&error)?),
+        }
+    }
+
+    /// Tokenizes and parses an input (WASM).
+    pub fn parse_wasm(&self, input: &str) -> Result<JsValue, JsValue> {
+        let tokens = self.tokenize(input);
+        let tokens = match tokens {
+            Ok(tokens) => tokens,
+            Err(error) => return Err(serde_wasm_bindgen::to_value(&error)?),
+        };
+        match self.parse(tokens) {
+            Ok(tree) => Ok(serde_wasm_bindgen::to_value(&tree)?),
+            Err(error) => Err(serde_wasm_bindgen::to_value(&error)?),
+        }
+    }
+
+    /// Tokenizes an input and traces its parsing (WASM).
+    pub fn trace_wasm(&self, input: &str) -> Result<Vec<JsValue>, JsValue> {
+        let tokens = self.tokenize(input);
+        let tokens = match tokens {
+            Ok(tokens) => tokens,
+            Err(error) => return Err(serde_wasm_bindgen::to_value(&error)?),
+        };
+        match self.trace(tokens) {
+            Ok((trace, tree)) => {
+                let trace = serde_wasm_bindgen::to_value(&trace)?;
+                let tree = serde_wasm_bindgen::to_value(&tree)?;
+                Ok(vec![trace, tree])
+            },
+            Err(error) => Err(serde_wasm_bindgen::to_value(&error)?),
+        }
+    }
+}
diff --git a/src/span.rs b/src/span.rs
index 87fe028..011add8 100644
--- a/src/span.rs
+++ b/src/span.rs
@@ -1,67 +1,72 @@
 use crate::prelude::*;
 
-/// Position information of a token in the input string.
+
+/// Position of a token in the input string.
 #[cfg_attr(feature = "serde", derive(Serialize))]
 #[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))]
 #[derive(Clone, Debug, PartialEq)]
 pub struct Span {
     /// Byte offset of the span in the input string.
     pub offset: usize,
-    /// Length of the span.
-    pub len: usize,
+    /// Length of the span in bytes.
+    pub length: usize,
     /// Line number of the span in the input string.
     pub line: usize,
     /// Column number of the span in the input string.
     pub column: usize,
 }
 
-/// Wrapper over any type with span information.
+
+/// Wrapper for objects with spans.
 #[cfg(not(feature = "serde"))]
 #[derive(Clone, Debug)]
 pub struct Spanned<T: Debug + Clone> {
-    /// The value of the span.
-    pub value: T,
-    /// The span information.
+    /// Spanned object.
+    object: T,
+    /// Span of the object.
     span: Span,
 }
 
-/// Wrapper over any type with span information.
+
+/// Wrapper for objects with spans.
 #[cfg(feature = "serde")]
 #[cfg_attr(feature = "serde", derive(Serialize))]
 #[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))]
 #[derive(Clone, Debug)]
 pub struct Spanned<T: Serialize + Debug + Clone> {
-    /// Span of the value.
-    value: T,
-    /// Span information.
+    /// Spanned object.
+    object: T,
+    /// Span of the object.
     span: Span,
 }
 
-
 impl<
         #[cfg(not(feature = "serde"))] T: Debug + Clone,
         #[cfg(feature = "serde")] T: Serialize + Debug + Clone,
     > Spanned<T>
 {
-    /// Creates a new Spanned value.
-    pub fn new(value: T, span: Span) -> Self {
-        Self { value, span }
+    /// Creates a new spanned object.
+    pub fn new(object: T, span: Span) -> Spanned<T> {
+        Spanned { object, span }
     }
-    /// Gets the span information.
+
+    /// Gets the spanned object.
+    pub fn object(&self) -> &T {
+        &self.object
+    }
+
+    /// Gets the span of the object.
     pub fn span(&self) -> &Span {
         &self.span
     }
 
-    /// Converts the Spanned value into a tuple of the value and the span.
+    /// Splits the spanned object into the object and the span.
     pub fn into_components(self) -> (T, Span) {
-        (self.value, self.span)
+        (self.object, self.span)
     }
-    /// Gets the value of the span.
-    pub fn value(&self) -> &T {
-        &self.value
-    }
-    /// Converts the Spanned value into the value.
-    pub fn into_value(self) -> T {
-        self.value
+
+    /// Extracts the object and destroys the span.
+ pub fn into_object(self) -> T { + self.object } } @@ -72,7 +77,7 @@ impl< { type Target = T; - fn deref(&self) -> &Self::Target { - &self.value + fn deref(&self) -> &T { + &self.object } } diff --git a/src/tables.rs b/src/tables.rs index c26fd52..04b509e 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -17,46 +17,39 @@ impl FirstTable { while !done { done = true; for rule in grammar.rules().iter() { - let symbol = rule.symbol(); - let first_atomic_pattern = rule.pattern().first().unwrap(); - - if matches!( - first_atomic_pattern, - AtomicPattern::Symbol(atomic_pattern) if atomic_pattern == symbol - ) { - continue; - } - - let mut possible_first_tokens = match first_table.get_mut(symbol) { - Some(entry) => std::mem::take(entry), - None => IndexSet::new(), - }; + let mut possible_first_tokens: IndexSet = + first_table.get(rule.symbol()).cloned().unwrap_or_default(); let old_possible_first_token_count = possible_first_tokens.len(); - match first_atomic_pattern { - AtomicPattern::Symbol(symbol) => { - if let Some(new_possible_first_tokens) = first_table.get(symbol) { - possible_first_tokens.extend(new_possible_first_tokens.iter().cloned()); - } - }, - AtomicPattern::Token(token) => { - possible_first_tokens.insert(token.clone()); - }, + for (index, atomic_pattern) in rule.pattern().iter().enumerate() { + match atomic_pattern { + AtomicPattern::Symbol(symbol) => { + if let Some(new_possible_first_tokens) = first_table.get(symbol) { + possible_first_tokens.extend( + new_possible_first_tokens + .iter() + .filter(|&possible_token| *possible_token != Token::Empty) + .cloned(), + ); + } + if !grammar.empty_symbols().contains(symbol) { + break; + } + }, + AtomicPattern::Token(token) => { + possible_first_tokens.insert(token.clone()); + break; + }, + } + if index == rule.pattern().len() - 1 { + possible_first_tokens.insert(Token::Empty); + } } let new_possible_first_token_count = possible_first_tokens.len(); if new_possible_first_token_count != old_possible_first_token_count { done = false; - } - if new_possible_first_token_count > 0 { - match first_table.get_mut(symbol) { - Some(entry) => { - *entry = possible_first_tokens; - }, - None => { - first_table.insert(symbol.clone(), possible_first_tokens); - }, - } + first_table.insert(rule.symbol().clone(), possible_first_tokens); } } } @@ -90,76 +83,59 @@ impl FollowTable { let mut done = false; while !done { done = true; - for rule in grammar.rules().iter() { - for consecutive_atomic_patterns in rule.pattern().windows(2) { - let ap1 = &consecutive_atomic_patterns[0]; - let ap2 = &consecutive_atomic_patterns[1]; + for rule in grammar.rules() { + let rule_symbol = rule.symbol(); + let rule_pattern = rule.pattern(); - let ap1_symbol = match ap1 { + let last_ap_index = rule_pattern.len() - 1; + for (ap_index, ap) in rule_pattern.iter().enumerate() { + let atomic_pattern_symbol = match ap { AtomicPattern::Symbol(symbol) => symbol, AtomicPattern::Token(_) => continue, }; - let mut possible_follow_tokens = match follow_table.get_mut(ap1_symbol) { - Some(entry) => std::mem::take(entry), - None => IndexSet::new(), - }; + let mut possible_follow_tokens = + follow_table.get(atomic_pattern_symbol).cloned().unwrap_or_default(); let old_possible_follow_token_count = possible_follow_tokens.len(); - match ap2 { - AtomicPattern::Symbol(symbol) => { - if let Some(new_possible_follow_tokens) = first_table.get(symbol) { - possible_follow_tokens - .extend(new_possible_follow_tokens.iter().cloned()); + if ap_index != last_ap_index { + let mut 
rest_of_the_pattern_can_be_empty = true; + for next_atomic_pattern in &rule_pattern[ap_index + 1..] { + match next_atomic_pattern { + AtomicPattern::Symbol(next_symbol) => { + if let Some(first_set) = first_table.get(next_symbol) { + possible_follow_tokens.extend( + first_set + .iter() + .filter(|&t| *t != Token::Empty) + .cloned(), + ); + if !first_set.contains(&Token::Empty) { + rest_of_the_pattern_can_be_empty = false; + break; + } + } + }, + AtomicPattern::Token(token) => { + possible_follow_tokens.insert(token.clone()); + rest_of_the_pattern_can_be_empty = false; + break; + }, } - }, - AtomicPattern::Token(token) => { - possible_follow_tokens.insert(token.clone()); - }, - } - let new_possible_follow_token_count = possible_follow_tokens.len(); - - if new_possible_follow_token_count != old_possible_follow_token_count { - done = false; - } - - if new_possible_follow_token_count > 0 { - match follow_table.get_mut(ap1_symbol) { - Some(entry) => { - *entry = possible_follow_tokens; - }, - None => { - follow_table.insert(ap1_symbol.clone(), possible_follow_tokens); - }, } - } - } - - if let AtomicPattern::Symbol(last_ap) = rule.pattern().last().unwrap() { - let mut possible_follow_tokens = match follow_table.get_mut(last_ap) { - Some(entry) => std::mem::take(entry), - None => IndexSet::new(), - }; - - let old_possible_follow_token_count = possible_follow_tokens.len(); - if let Some(new_possible_follow_tokens) = follow_table.get(rule.symbol()) { - possible_follow_tokens.extend(new_possible_follow_tokens.iter().cloned()); + if rest_of_the_pattern_can_be_empty { + if let Some(rule_symbol_follow) = follow_table.get(rule_symbol) { + possible_follow_tokens.extend(rule_symbol_follow.iter().cloned()); + } + } + } else if let Some(rule_symbol_follow) = follow_table.get(rule_symbol) { + possible_follow_tokens.extend(rule_symbol_follow.iter().cloned()); } let new_possible_follow_token_count = possible_follow_tokens.len(); if new_possible_follow_token_count != old_possible_follow_token_count { done = false; - } - - if new_possible_follow_token_count > 0 { - match follow_table.get_mut(last_ap) { - Some(entry) => { - *entry = possible_follow_tokens; - }, - None => { - follow_table.insert(last_ap.clone(), possible_follow_tokens); - }, - } + follow_table.insert(atomic_pattern_symbol.clone(), possible_follow_tokens); } } } @@ -237,7 +213,8 @@ impl ParsingTables { let mut gotos = IndexMap::::new(); for item in state.items() { - if item.dot() == item.rule().pattern().len() { + let rule = item.rule(); + if item.dot() == rule.pattern().len() || rule.is_empty_pattern() { if let Some(follows) = follow_table.get(item.rule().symbol()) { let rule_index = grammar.rules().iter().position(|rule| rule == item.rule()).unwrap(); diff --git a/src/trace.rs b/src/trace.rs index 4225a20..3d7eb1e 100644 --- a/src/trace.rs +++ b/src/trace.rs @@ -43,9 +43,9 @@ pub struct Trace<'i> { steps: Vec>, } -impl Trace<'_> { +impl<'i> Trace<'i> { /// Creates a new trace. 
-    pub fn new() -> Self {
+    pub fn new() -> Trace<'i> {
         Self { steps: vec![] }
     }
 }
@@ -91,7 +91,7 @@ impl Trace<'_> {
             }
         })
         .join(" ");
-        let remaining_input = step.remaining_tokens.iter().rev().map(|t| t.value()).join(" ");
+        let remaining_input = step.remaining_tokens.iter().rev().map(|t| t.deref()).join(" ");
         let action_taken = match step.action_taken {
             Action::Shift { next_state } => {
                 format!("Shift {}", next_state)
diff --git a/src/tree.rs b/src/tree.rs
index f18fa44..f8ae5e8 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -1,7 +1,5 @@
-use {
-    crate::prelude::*,
-    std::io::BufWriter,
-};
+use crate::prelude::*;
+
 
 /// Parse tree of a parsed input.
 #[cfg_attr(feature = "serde", derive(Serialize))]
@@ -26,6 +24,7 @@ pub enum Tree<'i> {
         pattern: Vec<Spanned<Tree<'i>>>,
     },
 }
+
 impl Tree<'_> {
     /// Dumps the parse tree to stdout.
     pub fn dump(&self) {
diff --git a/src/utils.rs b/src/utils.rs
index 7413653..e5cda17 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -2,12 +2,15 @@ use crate::prelude::*;
 
-/// The `colored` crate uses OS specific features to colorize the output, which are not available in
-/// the WASM target. This trait provides a mock implementation of the `colored` crate for the WASM target.
+/// We use the `colored` crate, which relies on OS-specific features to colorize the output.
+/// Since those features are not available in WebAssembly, we provide a mock implementation
+/// of the `colored` methods we use.
 #[cfg(target_family = "wasm")]
 pub trait MockColored {
     fn green(&self) -> String;
+
     fn cyan(&self) -> String;
+
     fn bold(&self) -> String;
 }
 
@@ -24,26 +27,28 @@ impl<T: AsRef<str>> MockColored for T {
     }
 }
 
-/// Serializes a map of regex objects to a map of regex strings.
+
+/// Serializes a map of regex tokens to compiled regexes (serialized as strings).
 #[cfg(feature = "serde")]
-pub fn serialize_regex_map<S>(
-    map: &IndexMap<RegexToken, Regex>,
+pub fn serialize_regex_token_to_regex_map<S>(
+    regex_token_to_regex: &IndexMap<RegexToken, Regex>,
     serializer: S,
 ) -> Result<S::Ok, S::Error>
 where
     S: Serializer,
 {
-    let mut map_serializer = serializer.serialize_map(Some(map.len()))?;
-    for (key, value) in map {
+    let mut map_serializer = serializer.serialize_map(Some(regex_token_to_regex.len()))?;
+    for (key, value) in regex_token_to_regex {
         map_serializer.serialize_entry(key, &value.to_string())?;
     }
     map_serializer.end()
 }
 
-/// Counts the number of new lines in a slice and returns the offset after the last new line.
+/// Counts the number of new lines in a slice and computes the offset after the last new line.
 pub fn count_new_lines(slice: &str) -> (usize, Option<usize>) {
     let mut offset_after_newline = None;
+    let mut count = 0;
 
     for (offset, byte) in slice.bytes().enumerate() {
         if byte == b'\n' {
             offset_after_newline = Some(offset + 1);
             count += 1;
         }
     }
-    (count, offset_after_newline)
-}
 
-/// Counts column position of a char.
-///
-/// The resulting column position is the 1 indexed utf-8 charater in the slice.
-pub fn count_col_position(slice: &str) -> usize { - slice.chars().count() + 1 + (count, offset_after_newline) } diff --git a/tests/common.rs b/tests/common.rs index 8a6c024..d7cc9cd 100644 --- a/tests/common.rs +++ b/tests/common.rs @@ -4,15 +4,17 @@ pub mod grammars { // Correct grammars // ---------------- pub const CORRECT: &[&str] = - &[BINARY_ADDITION, CALCULATOR, CONDITIONAL, G10, G11, JSON, NOT_LALR]; + &[BINARY_ADDITION, CALCULATOR, CONDITIONAL, G9, G10, G11, JSON, NOT_LALR, OPTIONAL]; pub const BINARY_ADDITION: &str = include_str!("../assets/grammars/correct/binary-addition.lr"); pub const CALCULATOR: &str = include_str!("../assets/grammars/correct/calculator.lr"); pub const CONDITIONAL: &str = include_str!("../assets/grammars/correct/conditional.lr"); + pub const G9: &str = include_str!("../assets/grammars/correct/g9.lr"); pub const G10: &str = include_str!("../assets/grammars/correct/g10.lr"); pub const G11: &str = include_str!("../assets/grammars/correct/g11.lr"); pub const JSON: &str = include_str!("../assets/grammars/correct/json.lr"); pub const NOT_LALR: &str = include_str!("../assets/grammars/correct/not-lalr.lr"); + pub const OPTIONAL: &str = include_str!("../assets/grammars/correct/optional.lr"); // -------------------------------- // Syntactically incorrect grammars diff --git a/tests/grammar.rs b/tests/grammar.rs index b0a065b..042bbd1 100644 --- a/tests/grammar.rs +++ b/tests/grammar.rs @@ -3,13 +3,18 @@ mod common; use dotlr::{ ConstantToken, Grammar, + GrammarError, RegexToken, Rule, Symbol, }; +#[cfg(target_family = "wasm")] +use wasm_bindgen_test::*; + #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn parsing_syntactically_correct_grammars() { for grammar in common::grammars::CORRECT { assert!(Grammar::parse(grammar).is_ok()) @@ -20,6 +25,7 @@ fn parsing_syntactically_correct_grammars() { } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn failing_to_parse_syntactically_incorrect_grammars() { for grammar in common::grammars::SYNTACTICALLY_INCORRECT { assert!(Grammar::parse(grammar).is_err()) @@ -28,10 +34,11 @@ fn failing_to_parse_syntactically_incorrect_grammars() { #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_parsing_unexpected_token_grammar() { let error = Grammar::parse(common::grammars::UNEXPECTED_TOKEN).unwrap_err(); match error { - dotlr::GrammarError::UnexpectedToken { line, column, token, expected } => { + GrammarError::UnexpectedToken { line, column, token, expected } => { assert_eq!(line, 1); assert_eq!(column, 6); assert_eq!(token.as_str(), "->"); @@ -41,25 +48,27 @@ fn raising_correct_error_when_parsing_unexpected_token_grammar() { "regular expression token" ]); }, - _ => unreachable!(), + error => panic!("unexpected grammar error {:?}", error), } } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_parsing_invalid_regex_grammar() { let error = Grammar::parse(common::grammars::INVALID_REGEX).unwrap_err(); match error { - dotlr::GrammarError::InvalidRegex { line, column, regex } => { + GrammarError::InvalidRegex { line, column, regex } => { assert_eq!(line, 3); assert_eq!(column, 8); assert_eq!(regex.as_str(), "/[1-9][0-9+/"); }, - _ => unreachable!(), + error => panic!("unexpected grammar error {:?}", error), } } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn correctly_parsing_calculator_grammar() { let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap(); diff --git a/tests/parser.rs 
b/tests/parser.rs index dbd60a8..5bdc2c7 100644 --- a/tests/parser.rs +++ b/tests/parser.rs @@ -21,8 +21,12 @@ use { std::ops::Deref, }; +#[cfg(target_family = "wasm")] +use wasm_bindgen_test::*; + #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn creating_parser_for_semantically_correct_grammars() { for grammar in common::grammars::CORRECT { let grammar = Grammar::parse(grammar).unwrap(); @@ -31,6 +35,7 @@ fn creating_parser_for_semantically_correct_grammars() { } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn failing_to_create_parser_for_semantically_incorrect_grammars() { for grammar in common::grammars::SEMANTICALLY_INCORRECT { let grammar = Grammar::parse(grammar).unwrap(); @@ -40,6 +45,7 @@ fn failing_to_create_parser_for_semantically_incorrect_grammars() { #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_creating_parser_for_empty_grammar() { let grammar = Grammar::parse(common::grammars::EMPTY).unwrap(); let error = Parser::lr(grammar).unwrap_err(); @@ -47,6 +53,7 @@ fn raising_correct_error_when_creating_parser_for_empty_grammar() { } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_creating_parser_for_undefined_symbol_grammar() { let grammar = Grammar::parse(common::grammars::UNDEFINED_SYMBOL).unwrap(); let error = Parser::lr(grammar).unwrap_err(); @@ -54,6 +61,7 @@ fn raising_correct_error_when_creating_parser_for_undefined_symbol_grammar() { } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_creating_parser_for_undefined_regex_token_grammar() { let grammar = Grammar::parse(common::grammars::UNDEFINED_REGEX_TOKEN).unwrap(); let error = Parser::lr(grammar).unwrap_err(); @@ -61,6 +69,7 @@ fn raising_correct_error_when_creating_parser_for_undefined_regex_token_grammar( } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_creating_parser_for_shift_reduce_conflict_grammar() { let grammar = Grammar::parse(common::grammars::SHIFT_REDUCE_CONFLICT).unwrap(); let error = Parser::lr(grammar).unwrap_err(); @@ -84,10 +93,13 @@ fn raising_correct_error_when_creating_parser_for_shift_reduce_conflict_grammar( } assert!(has_shift_action && has_reduce_action); + } else { + panic!("unexpected parser error {:?}", error); } } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_creating_parser_for_reduce_reduce_conflict_grammar() { let grammar = Grammar::parse(common::grammars::REDUCE_REDUCE_CONFLICT).unwrap(); let error = Parser::lr(grammar).unwrap_err(); @@ -107,10 +119,13 @@ fn raising_correct_error_when_creating_parser_for_reduce_reduce_conflict_grammar } assert!(reduce_action_count >= 2); + } else { + panic!("unexpected parser error {:?}", error); } } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn raising_correct_error_when_creating_lalr_parser_for_non_lalr_grammar() { let grammar = Grammar::parse(common::grammars::NOT_LALR).unwrap(); let error = Parser::lalr(grammar).unwrap_err(); @@ -123,11 +138,14 @@ fn raising_correct_error_when_creating_lalr_parser_for_non_lalr_grammar() { assert!(possible_actions.is_some()); assert!(possible_actions.unwrap().len() >= 2); + } else { + panic!("unexpected parser error {:?}", error); } } #[test] +#[cfg_attr(target_family = "wasm", wasm_bindgen_test)] fn correctly_creating_lr_parser_for_binary_addition_grammar() { let grammar = Grammar::parse(common::grammars::BINARY_ADDITION).unwrap(); let 
     let parser = Parser::lr(grammar).unwrap();
@@ -655,6 +673,7 @@ B -> '1'
 }
 
 #[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
 fn correctly_creating_lalr_parser_for_g10_grammar() {
     let grammar = Grammar::parse(common::grammars::G10).unwrap();
     let parser = Parser::lalr(grammar).unwrap();
@@ -1469,3 +1488,500 @@ T -> %id
         );
     }
 }
+
+#[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
+fn correctly_creating_lr_parser_for_optional_grammar() {
+    let grammar = Grammar::parse(common::grammars::OPTIONAL).unwrap();
+    let parser = Parser::lr(grammar).unwrap();
+
+    assert_eq!(
+        parser.grammar().to_string().trim(),
+        r#"
+
+P -> O 'x' O 'z'
+O -> 'y'
+O -> ε
+
+        "#
+        .trim()
+    );
+
+    let first_table = parser.first_table();
+    {
+        // +--------+--------------+
+        // | Symbol | First Set    |
+        // +--------+--------------+
+        // | P      | { 'x', 'y' } |
+        // +--------+--------------+
+        // | O      | { 'y', ε }   |
+        // +--------+--------------+
+
+        #[rustfmt::skip]
+        assert_eq!(
+            *first_table.deref(),
+            [
+                (
+                    Symbol::from("P"),
+                    [
+                        ConstantToken::from("x").into(),
+                        ConstantToken::from("y").into(),
+                    ]
+                    .into(),
+                ),
+                (
+                    Symbol::from("O"),
+                    [
+                        ConstantToken::from("y").into(),
+                        Token::Empty,
+                    ]
+                    .into(),
+                ),
+            ]
+            .into_iter()
+            .collect::<IndexMap<_, _>>()
+        );
+    }
+
+    let follow_table = parser.follow_table();
+    {
+        // +--------+--------------+
+        // | Symbol | Follow Set   |
+        // +--------+--------------+
+        // | P      | { $ }        |
+        // +--------+--------------+
+        // | O      | { 'x', 'z' } |
+        // +--------+--------------+
+
+        #[rustfmt::skip]
+        assert_eq!(
+            *follow_table.deref(),
+            [
+                (
+                    Symbol::from("P"),
+                    [
+                        Token::Eof,
+                    ]
+                    .into(),
+                ),
+                (
+                    Symbol::from("O"),
+                    [
+                        ConstantToken::from("x").into(),
+                        ConstantToken::from("z").into(),
+                    ]
+                    .into(),
+                ),
+            ]
+            .into_iter()
+            .collect::<IndexMap<_, _>>()
+        );
+    }
+
+    let automaton = parser.automaton();
+    {
+        // +-------+----------------------+------------+-------------+
+        // | State | Items                | Lookaheads | Transitions |
+        // +-------+----------------------+------------+-------------+
+        // | 0     | P -> . O 'x' O 'z'   | { $ }      | O -> 1      |
+        // |       | O -> . 'y'           | { 'x' }    | 'y' -> 2    |
+        // |       | O -> . ε             | { 'x' }    |             |
+        // +-------+----------------------+------------+-------------+
+        // | 1     | P -> O . 'x' O 'z'   | { $ }      | 'x' -> 3    |
+        // +-------+----------------------+------------+-------------+
+        // | 2     | O -> 'y' .           | { 'x' }    |             |
+        // +-------+----------------------+------------+-------------+
+        // | 3     | P -> O 'x' . O 'z'   | { $ }      | O -> 4      |
+        // |       | O -> . 'y'           | { 'z' }    | 'y' -> 5    |
+        // |       | O -> . ε             | { 'z' }    |             |
+        // +-------+----------------------+------------+-------------+
+        // | 4     | P -> O 'x' O . 'z'   | { $ }      | 'z' -> 6    |
+        // +-------+----------------------+------------+-------------+
+        // | 5     | O -> 'y' .           | { 'z' }    |             |
+        // +-------+----------------------+------------+-------------+
+        // | 6     | P -> O 'x' O 'z' .   | { $ }      |             |
+        // +-------+----------------------+------------+-------------+
+
+        #[rustfmt::skip]
+        assert_eq!(
+            automaton.states(),
+            [
+                // State 0
+                State::new(
+                    0,
+                    [
+                        // P -> . O 'x' O 'z' | { $ }
+                        Item::new(
+                            Rule::new(
+                                "P",
+                                [
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("x").into(),
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("z").into(),
+                                ]
+                            ),
+                            0,
+                            [Token::Eof],
+                        ),
+                        // O -> . 'y' | { 'x' }
+                        Item::new(
+                            Rule::new(
+                                "O",
+                                [
+                                    ConstantToken::from("y").into(),
+                                ]
+                            ),
+                            0,
+                            [ConstantToken::from("x").into()],
+                        ),
+                        // O -> . ε | { 'x' }
+                        Item::new(
+                            Rule::new(
+                                "O",
+                                [
+                                    Token::Empty.into(),
+                                ]
+                            ),
+                            0,
+                            [ConstantToken::from("x").into()],
+                        ),
+                    ],
+                    [
+                        // O -> 1
+                        (Symbol::from("O").into(), 1),
+                        // 'y' -> 2
+                        (ConstantToken::from("y").into(), 2),
+                    ],
+                ),
+
+                // State 1
+                State::new(
+                    1,
+                    [
+                        // P -> O . 'x' O 'z' | { $ }
+                        Item::new(
+                            Rule::new(
+                                "P",
+                                [
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("x").into(),
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("z").into(),
+                                ]
+                            ),
+                            1,
+                            [Token::Eof],
+                        ),
+                    ],
+                    [
+                        // 'x' -> 3
+                        (ConstantToken::from("x").into(), 3),
+                    ],
+                ),
+
+                // State 2
+                State::new(
+                    2,
+                    [
+                        // O -> 'y' . | { 'x' }
+                        Item::new(
+                            Rule::new(
+                                "O",
+                                [
+                                    ConstantToken::from("y").into(),
+                                ]
+                            ),
+                            1,
+                            [ConstantToken::from("x").into()],
+                        ),
+                    ],
+                    [],
+                ),
+
+                // State 3
+                State::new(
+                    3,
+                    [
+                        // P -> O 'x' . O 'z' | { $ }
+                        Item::new(
+                            Rule::new(
+                                "P",
+                                [
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("x").into(),
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("z").into(),
+                                ]
+                            ),
+                            2,
+                            [Token::Eof],
+                        ),
+                        // O -> . 'y' | { 'z' }
+                        Item::new(
+                            Rule::new(
+                                "O",
+                                [
+                                    ConstantToken::from("y").into(),
+                                ]
+                            ),
+                            0,
+                            [ConstantToken::from("z").into()],
+                        ),
+                        // O -> . ε | { 'z' }
+                        Item::new(
+                            Rule::new(
+                                "O",
+                                [
+                                    Token::Empty.into(),
+                                ]
+                            ),
+                            0,
+                            [ConstantToken::from("z").into()],
+                        ),
+                    ],
+                    [
+                        // O -> 4
+                        (Symbol::from("O").into(), 4),
+                        // 'y' -> 5
+                        (ConstantToken::from("y").into(), 5),
+                    ],
+                ),
+
+                // State 4
+                State::new(
+                    4,
+                    [
+                        // P -> O 'x' O . 'z' | { $ }
+                        Item::new(
+                            Rule::new(
+                                "P",
+                                [
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("x").into(),
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("z").into(),
+                                ]
+                            ),
+                            3,
+                            [Token::Eof],
+                        ),
+                    ],
+                    [
+                        // 'z' -> 6
+                        (ConstantToken::from("z").into(), 6),
+                    ],
+                ),
+
+                // State 5
+                State::new(
+                    5,
+                    [
+                        // O -> 'y' . | { 'z' }
+                        Item::new(
+                            Rule::new(
+                                "O",
+                                [
+                                    ConstantToken::from("y").into(),
+                                ]
+                            ),
+                            1,
+                            [ConstantToken::from("z").into()],
+                        ),
+                    ],
+                    [],
+                ),
+
+                // State 6
+                State::new(
+                    6,
+                    [
+                        // P -> O 'x' O 'z' . | { $ }
+                        Item::new(
+                            Rule::new(
+                                "P",
+                                [
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("x").into(),
+                                    Symbol::from("O").into(),
+                                    ConstantToken::from("z").into(),
+                                ]
+                            ),
+                            4,
+                            [Token::Eof],
+                        ),
+                    ],
+                    [],
+                ),
+            ]
+        );
+    }
+
+    let action_table = parser.action_table();
+    {
+        // +-------+--------------------------------+
+        // |       |             Action             |
+        // | State | ------------------------------ |
+        // |       |  'x'     'z'     'y'      $    |
+        // +-------+--------------------------------+
+        // |   0   |  r3      -       s2       -    |
+        // +-------+--------------------------------+
+        // |   1   |  s3      -       -        -    |
+        // +-------+--------------------------------+
+        // |   2   |  r2      -       -        -    |
+        // +-------+--------------------------------+
+        // |   3   |  -       r3      s5       -    |
+        // +-------+--------------------------------+
+        // |   4   |  -       s6      -        -    |
+        // +-------+--------------------------------+
+        // |   5   |  -       r2      -        -    |
+        // +-------+--------------------------------+
+        // |   6   |  -       -       -        a1   |
+        // +-------+--------------------------------+
+
+        #[rustfmt::skip]
+        assert_eq!(
+            action_table,
+            [
+                // State 0
+                IndexMap::<Token, IndexSet<Action>>::from_iter(
+                    [
+                        (
+                            ConstantToken::from("x").into(),
+                            IndexSet::from([Action::Reduce { rule_index: 2 }]),
+                        ),
+                        (
+                            ConstantToken::from("y").into(),
+                            IndexSet::from([Action::Shift { next_state: 2 }]),
+                        ),
+                    ],
+                ),
+                // State 1
+                IndexMap::<Token, IndexSet<Action>>::from_iter(
+                    [
+                        (
+                            ConstantToken::from("x").into(),
+                            IndexSet::from([Action::Shift { next_state: 3 }]),
+                        ),
+                    ],
+                ),
+                // State 2
+                IndexMap::<Token, IndexSet<Action>>::from_iter(
+                    [
+                        (
+                            ConstantToken::from("x").into(),
+                            IndexSet::from([Action::Reduce { rule_index: 1 }]),
+                        ),
+                    ],
+                ),
+                // State 3
+                IndexMap::<Token, IndexSet<Action>>::from_iter(
+                    [
+                        (
+                            ConstantToken::from("z").into(),
+                            IndexSet::from([Action::Reduce { rule_index: 2 }]),
+                        ),
+                        (
+                            ConstantToken::from("y").into(),
+                            IndexSet::from([Action::Shift { next_state: 5 }]),
+                        ),
+                    ],
+                ),
+                // State 4
+                IndexMap::<Token, IndexSet<Action>>::from_iter(
+                    [
+                        (
+                            ConstantToken::from("z").into(),
+                            IndexSet::from([Action::Shift { next_state: 6 }]),
+                        ),
+                    ],
+                ),
+                // State 5
+                IndexMap::<Token, IndexSet<Action>>::from_iter(
+                    [
+                        (
+                            ConstantToken::from("z").into(),
+                            IndexSet::from([Action::Reduce { rule_index: 1 }]),
+                        ),
+                    ],
+                ),
+                // State 6
+                IndexMap::<Token, IndexSet<Action>>::from_iter(
+                    [
+                        (
+                            Token::Eof,
+                            IndexSet::from([Action::Accept { rule_index: 0 }]),
+                        )
+                    ],
+                ),
+            ]
+        );
+    }
+
+    let goto_table = parser.goto_table();
+    {
+        // +-------+--------------+
+        // |       |     Goto     |
+        // | State | ------------ |
+        // |       |   P      O   |
+        // +-------+--------------+
+        // |   0   |   -      1   |
+        // +-------+--------------+
+        // |   1   |   -      -   |
+        // +-------+--------------+
+        // |   2   |   -      -   |
+        // +-------+--------------+
+        // |   3   |   -      4   |
+        // +-------+--------------+
+        // |   4   |   -      -   |
+        // +-------+--------------+
+        // |   5   |   -      -   |
+        // +-------+--------------+
+        // |   6   |   -      -   |
+        // +-------+--------------+
+
+        #[rustfmt::skip]
+        assert_eq!(
+            goto_table,
+            [
+                // State 0
+                IndexMap::<Symbol, usize>::from_iter(
+                    [
+                        (Symbol::from("O"), 1),
+                    ],
+                ),
+                // State 1
+                IndexMap::<Symbol, usize>::from_iter(
+                    [
+                    ],
+                ),
+                // State 2
+                IndexMap::<Symbol, usize>::from_iter(
+                    [
+                    ],
+                ),
+                // State 3
+                IndexMap::<Symbol, usize>::from_iter(
+                    [
+                        (Symbol::from("O"), 4),
+                    ],
+                ),
+                // State 4
+                IndexMap::<Symbol, usize>::from_iter(
+                    [
+                    ],
+                ),
+                // State 5
+                IndexMap::<Symbol, usize>::from_iter(
+                    [
+                    ],
+                ),
+                // State 6
+                IndexMap::<Symbol, usize>::from_iter(
+                    [
+                    ],
+                ),
+            ]
+        );
+    }
+}
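
For a sense of what the table assertions above check from the user's side, here is a minimal standalone sketch that builds the same optional grammar and inspects its FIRST and FOLLOW tables. The inline grammar string is an assumption: the real fixture is whatever `common::grammars::OPTIONAL` contains, and the empty alternative is assumed to be spelled `O -> ''`, following the README's empty-rule notation.

```rust
use dotlr::{Grammar, Parser};
use std::ops::Deref;

fn main() {
    // Hypothetical inline version of the OPTIONAL test grammar.
    let grammar = Grammar::parse(
        "
        P -> O 'x' O 'z'
        O -> 'y'
        O -> ''
        ",
    )
    .unwrap();
    let parser = Parser::lr(grammar).unwrap();

    // FIRST(O) = { 'y', ε } and FOLLOW(O) = { 'x', 'z' },
    // matching the tables asserted in the test above.
    println!("{:?}", parser.first_table().deref());
    println!("{:?}", parser.follow_table().deref());
}
```
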
diff --git a/tests/parsing.rs b/tests/parsing.rs
index a46b566..41904c0 100644
--- a/tests/parsing.rs
+++ b/tests/parsing.rs
@@ -6,8 +6,12 @@ use dotlr::{
     Grammar,
     Parser,
 };
 
+#[cfg(target_family = "wasm")]
+use wasm_bindgen_test::*;
+
 #[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
 fn raising_correct_error_when_encountering_unknown_token_during_parsing_calculator_grammar() {
     let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap();
     let parser = Parser::lr(grammar).unwrap();
@@ -17,6 +21,7 @@
 }
 
 #[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
 fn raising_correct_error_when_encountering_unexpected_token_during_parsing_calculator_grammar() {
     let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap();
     let parser = Parser::lr(grammar).unwrap();
@@ -27,6 +32,7 @@
 }
 
 #[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
 fn raising_correct_error_when_encountering_unexpected_eof_during_parsing_calculator_grammar() {
     let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap();
     let parser = Parser::lr(grammar).unwrap();
@@ -41,6 +47,7 @@
 
 #[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
 fn correctly_trace_parsing_of_calculator_grammar() {
     let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap();
     let parser = Parser::lr(grammar).unwrap();
@@ -166,6 +173,7 @@ Expr
 }
 
 #[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
 fn correctly_parsing_conditional_grammar() {
     let grammar = Grammar::parse(common::grammars::CONDITIONAL).unwrap();
     let parser = Parser::lr(grammar).unwrap();
@@ -200,6 +208,7 @@ Conditional
 }
 
 #[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
 fn correctly_parsing_json_grammar_with_lalr() {
     let grammar = Grammar::parse(common::grammars::JSON).unwrap();
     let parser = Parser::lalr(grammar).unwrap();
@@ -300,3 +309,164 @@ Json
         .trim(),
     );
 }
+
+#[test]
+#[cfg_attr(target_family = "wasm", wasm_bindgen_test)]
+fn correctly_trace_parsing_of_optional_grammar() {
+    let grammar = Grammar::parse(common::grammars::OPTIONAL).unwrap();
+    let parser = Parser::lr(grammar).unwrap();
+
+    {
+        let expression = "x y z";
+        let tokens = parser.tokenize(expression).unwrap();
+
+        let (parse_trace, parse_tree) = parser.trace(tokens).unwrap();
+        {
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | Step | State Stack | Symbol Stack | Remaining Input | Action Taken                |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 0    | 0           |              | 'x' 'y' 'z' $   | Reduce 3 (O -> ε)           |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 1    | 0 1         | O            | 'x' 'y' 'z' $   | Shift 3                     |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 2    | 0 1 3       | O 'x'        | 'y' 'z' $       | Shift 5                     |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 3    | 0 1 3 5     | O 'x' 'y'    | 'z' $           | Reduce 2 (O -> 'y')         |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 4    | 0 1 3 4     | O 'x' O      | 'z' $           | Shift 6                     |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 5    | 0 1 3 4 6   | O 'x' O 'z'  | $               | Accept 1 (P -> O 'x' O 'z') |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+
+            assert_eq!(
+                parse_trace.steps().iter().map(|step| *step.action_taken()).collect::<Vec<_>>(),
+                [
+                    Action::Reduce { rule_index: 2 },
+                    Action::Shift { next_state: 3 },
+                    Action::Shift { next_state: 5 },
+                    Action::Reduce { rule_index: 1 },
+                    Action::Shift { next_state: 6 },
+                    Action::Accept { rule_index: 0 }
+                ],
+            );
+        }
+        {
+            assert_eq!(
+                parse_tree.to_string().trim(),
+                r#"
+
+P
+├─ O
+├─ x
+├─ O
+│  └─ y
+└─ z
+
+                "#
+                .trim(),
+            );
+        }
+    }
+    {
+        let expression = "x z";
+        let tokens = parser.tokenize(expression).unwrap();
+
+        let (parse_trace, parse_tree) = parser.trace(tokens).unwrap();
+        {
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | Step | State Stack | Symbol Stack | Remaining Input | Action Taken                |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 0    | 0           |              | 'x' 'z' $       | Reduce 3 (O -> ε)           |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 1    | 0 1         | O            | 'x' 'z' $       | Shift 3                     |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 2    | 0 1 3       | O 'x'        | 'z' $           | Reduce 3 (O -> ε)           |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 3    | 0 1 3 4     | O 'x' O      | 'z' $           | Shift 6                     |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+            // | 4    | 0 1 3 4 6   | O 'x' O 'z'  | $               | Accept 1 (P -> O 'x' O 'z') |
+            // +------+-------------+--------------+-----------------+-----------------------------+
+
+            assert_eq!(
+                parse_trace.steps().iter().map(|step| *step.action_taken()).collect::<Vec<_>>(),
+                [
+                    Action::Reduce { rule_index: 2 },
+                    Action::Shift { next_state: 3 },
+                    Action::Reduce { rule_index: 2 },
+                    Action::Shift { next_state: 6 },
+                    Action::Accept { rule_index: 0 }
+                ],
+            );
+        }
+        {
+            assert_eq!(
+                parse_tree.to_string().trim(),
+                r#"
+
+P
+├─ O
+├─ x
+├─ O
+└─ z
+
+                "#
+                .trim(),
+            );
+        }
+    }
+    {
+        let expression = "y x y z";
+        let tokens = parser.tokenize(expression).unwrap();
+
+        let (parse_trace, parse_tree) = parser.trace(tokens).unwrap();
+        {
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | Step | State Stack | Symbol Stack | Remaining Input   | Action Taken                |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | 0    | 0           |              | 'y' 'x' 'y' 'z' $ | Shift 2                     |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | 1    | 0 2         | 'y'          | 'x' 'y' 'z' $     | Reduce 2 (O -> 'y')         |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | 2    | 0 1         | O            | 'x' 'y' 'z' $     | Shift 3                     |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | 3    | 0 1 3       | O 'x'        | 'y' 'z' $         | Shift 5                     |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | 4    | 0 1 3 5     | O 'x' 'y'    | 'z' $             | Reduce 2 (O -> 'y')         |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | 5    | 0 1 3 4     | O 'x' O      | 'z' $             | Shift 6                     |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+            // | 6    | 0 1 3 4 6   | O 'x' O 'z'  | $                 | Accept 1 (P -> O 'x' O 'z') |
+            // +------+-------------+--------------+-------------------+-----------------------------+
+
+            assert_eq!(
+                parse_trace.steps().iter().map(|step| *step.action_taken()).collect::<Vec<_>>(),
+                [
+                    Action::Shift { next_state: 2 },
+                    Action::Reduce { rule_index: 1 },
+                    Action::Shift { next_state: 3 },
+                    Action::Shift { next_state: 5 },
+                    Action::Reduce { rule_index: 1 },
+                    Action::Shift { next_state: 6 },
+                    Action::Accept { rule_index: 0 }
+                ],
+            );
+        }
+        {
+            assert_eq!(
+                parse_tree.to_string().trim(),
+                r#"
+
+P
+├─ O
+│  └─ y
+├─ x
+├─ O
+│  └─ y
+└─ z
+
+                "#
+                .trim(),
+            );
+        }
+    }
+}
diff --git a/tests/span.rs b/tests/span.rs
index 1982466..a8ff60a 100644
--- a/tests/span.rs
+++ b/tests/span.rs
@@ -4,87 +4,36 @@ use dotlr::{
     Grammar,
     Parser,
     Span,
-    Spanned,
-    Token,
 };
 
-/// Formats the expected and got tokens and spans into a more readable format.
-#[allow(unused)]
-fn fmt_expected<'i>(tokens: &[(Spanned<Token>, &'i str)], spans: &[Span]) -> String {
-    if tokens.len() != spans.len() {
-        panic!(
-            "Mismatch in the number of tokens and spans. Expected {} got {}",
-            spans.len(),
-            tokens.len()
-        );
-    }
-    format!(
-        "[Expected -> Got] [Offset expected -> Offset Got] {{length}} \n{}",
-        tokens
-            .iter()
-            .zip(spans)
-            .map(|((expected_token, slice), got)| {
-                let span = expected_token.span();
-                format!(
-                    "{}:{} -> {}:{} ({} -> {}) [{}] {{{} -> {}}}",
-                    span.line,
-                    span.column,
-                    got.line,
-                    got.column,
-                    span.offset,
-                    got.offset,
-                    slice,
-                    span.len,
-                    got.len
-                )
-            })
-            .collect::<Vec<_>>()
-            .join("\n")
-    )
-}
-
-/// Checks if the spans of the tokens are equal to the expected spans.
-#[allow(unused)]
-fn check_spans<'i>(tokens: Vec<(Spanned<Token>, &'i str)>, spans: &[Span]) {
-    if tokens.len() != spans.len() {
-        panic!(
-            "Mismatch in the number of tokens and spans. Expected {} got {}",
-            spans.len(),
-            tokens.len()
-        );
-    }
-    for (token, expected_span) in tokens.iter().zip(spans) {
-        let span = token.0.span();
-        if *span != *expected_span {
-            panic!("{}", fmt_expected(&tokens, spans));
-        }
-    }
-}
-
 #[test]
-fn correctly_calculate_spans_multi_line() {
+fn correctly_calculating_spans_on_multiline_input() {
     let grammar = Grammar::parse(common::grammars::CALCULATOR).unwrap();
     let parser = Parser::lalr(grammar).unwrap();
 
-    // do not remove the spaces in the string
-    let str = "  11 +  221+3
+    let input = "  11 +  221+3
 +20
 +44
 +5";
-    let tokens = parser.tokenize(str).unwrap();
+    let tokens = parser.tokenize(input).unwrap();
 
-    check_spans(tokens, &[
-        Span { line: 1, column: 3, offset: 2, len: 2 },
-        Span { line: 1, column: 6, offset: 5, len: 1 },
-        Span { line: 1, column: 9, offset: 8, len: 3 },
-        Span { line: 1, column: 12, offset: 11, len: 1 },
-        Span { line: 1, column: 13, offset: 12, len: 1 },
-        Span { line: 2, column: 1, offset: 14, len: 1 },
-        Span { line: 2, column: 2, offset: 15, len: 2 },
-        Span { line: 4, column: 5, offset: 23, len: 1 },
-        Span { line: 4, column: 6, offset: 24, len: 2 },
-        Span { line: 4, column: 9, offset: 27, len: 1 },
-        Span { line: 4, column: 10, offset: 28, len: 1 },
-        Span { line: 4, column: 11, offset: 29, len: 0 },
-    ]);
+    #[rustfmt::skip]
+    assert_eq!(
+        tokens.iter().map(|(token, _)| token.span().clone()).collect::<Vec<_>>(),
+        [
+            Span { line: 1, column: 3, offset: 2, length: 2 },
+            Span { line: 1, column: 6, offset: 5, length: 1 },
+            Span { line: 1, column: 9, offset: 8, length: 3 },
+            Span { line: 1, column: 12, offset: 11, length: 1 },
+            Span { line: 1, column: 13, offset: 12, length: 1 },
+            Span { line: 2, column: 1, offset: 14, length: 1 },
+            Span { line: 2, column: 2, offset: 15, length: 2 },
+            Span { line: 4, column: 5, offset: 23, length: 1 },
+            Span { line: 4, column: 6, offset: 24, length: 2 },
+            Span { line: 4, column: 9, offset: 27, length: 1 },
+            Span { line: 4, column: 10, offset: 28, length: 1 },
+            Span { line: 4, column: 11, offset: 29, length: 0 },
+        ]
+    );
 }
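
Finally, an end-to-end sketch of the flow the parsing and span tests above exercise, under the same assumptions as before: the inline grammar stands in for `common::grammars::OPTIONAL` with the ε-alternative assumed to be spelled `O -> ''`, and `Span`'s public fields (`line`, `column`, `offset`, `length`) are the ones asserted in tests/span.rs.

```rust
use dotlr::{Grammar, Parser};

fn main() {
    // Hypothetical inline version of the OPTIONAL test grammar.
    let grammar = Grammar::parse(
        "
        P -> O 'x' O 'z'
        O -> 'y'
        O -> ''
        ",
    )
    .unwrap();
    let parser = Parser::lr(grammar).unwrap();

    // Both O's derive the empty string in "x z", so only two tokens come back.
    let tokens = parser.tokenize("x z").unwrap();
    for (token, slice) in &tokens {
        let span = token.span();
        println!("{:?} at {}:{} (length {})", slice, span.line, span.column, span.length);
    }

    // Trace the parse and print the tree; the two O nodes will be childless.
    let (_parse_trace, parse_tree) = parser.trace(tokens).unwrap();
    println!("{}", parse_tree);
}
```
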