Add token span info, fixes umut-sahin#1 and umut-sahin#13

Specy committed Oct 1, 2024
1 parent 479451b commit 61e2064

Showing 11 changed files with 169 additions and 40 deletions.
17 changes: 10 additions & 7 deletions bindings/typescript/src/index.ts
@@ -9,16 +9,16 @@ import {
     ParserError,
     ParsingError,
     ParsingTables,
-    Rule,
+    Rule, Spanned,
     Token,
     Trace,
     Tree
 } from './types'
 import {Err, Ok} from "ts-results";
 
 export class Grammar<
-    T extends string = string,
-    NT extends string = string,
+    T extends string = string,
+    NT extends string = string,
     R extends string = string
 > {
     grammar: _Grammar
@@ -75,8 +75,8 @@ export class Grammar<
 
 
 class Parser<
-    T extends string = string,
-    NT extends string = string,
+    T extends string = string,
+    NT extends string = string,
     R extends string = string
 > {
 
@@ -128,7 +128,7 @@
 
     tokenize(input: string) {
         try {
-            const tokens = this.parser.tokenize_wasm(input) as [Token<T, R>, string][]
+            const tokens = this.parser.tokenize_wasm(input) as [Spanned<Token<T, R>>, string][]
             return Ok(tokens.map(([token, slice]) => ({
                 token, slice
             })))
@@ -139,7 +139,10 @@
 
     trace(input: string) {
         try {
-            const [trace, tree] = this.parser.trace_wasm(input) as [Trace<Tree<NT, Token<T, R>>>, Tree<NT, Token<T, R>>]
+            const [trace, tree] = this.parser.trace_wasm(input) as [
+                Trace<Tree<NT, Token<T, R>>>,
+                Tree<NT, Token<T, R>>
+            ]
             return Ok({
                 trace,
                 tree
14 changes: 14 additions & 0 deletions bindings/typescript/src/types.ts
@@ -20,6 +20,7 @@ export type Tree<NT extends string = string, T extends Token = Token> = {
     value: {
         token: T,
         slice: string
+        span: Span
     }
 } | {
     type: 'NonTerminal'
@@ -94,16 +95,19 @@ export type ParsingError<T extends Token = Token> = {
     type: "UnknownToken",
     value: {
         token: string
+        span: Span
     }
 } | {
     type: "UnexpectedToken"
     value: {
         token: string
+        span: Span
         expected: T[]
     }
 } | {
     type: "UnexpectedEof"
     value: {
+        span: Span
         expected: T[]
     }
 }
@@ -148,6 +152,16 @@ export type Action = {
         rule_index: number
     }
 }
+export type Span = {
+    offset: number,
+    len: number,
+    column: number,
+    line: number
+}
+export type Spanned<T> = {
+    span: Span,
+    value: T
+}
 
 
 export type FirstTable<T extends Token = Token> = Map<string, T[]>
2 changes: 1 addition & 1 deletion examples/calculator.rs
@@ -84,7 +84,7 @@ fn calculate(parser: &Parser, input: &str) -> ExitCode {
 
 fn evaluate(tree: Tree<'_>) -> f64 {
     match tree {
-        Tree::Terminal { token, slice } => {
+        Tree::Terminal { token, slice, .. } => {
             match token {
                 Token::Regex(regex_token) => {
                     match regex_token.as_str() {
2 changes: 1 addition & 1 deletion examples/json.rs
@@ -21,7 +21,7 @@ pub enum Value {
 impl From<Tree<'_>> for Value {
     fn from(tree: Tree) -> Value {
         match tree {
-            Tree::Terminal { token, slice } => {
+            Tree::Terminal { token, slice, .. } => {
                 match token {
                     Token::Constant(constant_token) => {
                         match constant_token.as_str() {
12 changes: 7 additions & 5 deletions src/errors.rs
@@ -96,12 +96,13 @@ pub enum ParsingError {
         "unknown token {}",
         format_smolstr!("{}", token).green(),
     )]
-    UnknownToken { token: SmolStr },
+    UnknownToken { token: SmolStr, span: Span },
 
     /// An unexpected token has been encountered.
     #[error(
-        "unexpected token {} (expected {})",
+        "unexpected token {} at [{}] (expected {})",
         format_smolstr!("{}", token).green(),
+        format_smolstr!("{}:{}", span.line, span.column).cyan(),
         if expected.len() == 1 {
             format!("{}", format_smolstr!("{}", expected[0]).green())
         } else {
@@ -111,11 +112,12 @@ pub enum ParsingError {
             )
         },
     )]
-    UnexpectedToken { token: SmolStr, expected: SmallVec<[Token; 2]> },
+    UnexpectedToken { token: SmolStr, expected: SmallVec<[Token; 2]>, span: Span },
 
     /// An unexpected end of input has been encountered.
     #[error(
-        "unexpected end of input (expected {})",
+        "unexpected end of input at [{}] (expected {})",
+        format_smolstr!("{}:{}", span.line, span.column).cyan(),
        if expected.len() == 1 {
             format!("{}", format_smolstr!("{}", expected[0]).green())
         } else {
@@ -125,5 +127,5 @@ pub enum ParsingError {
             )
         },
     )]
-    UnexpectedEof { expected: SmallVec<[Token; 2]> },
+    UnexpectedEof { expected: SmallVec<[Token; 2]>, span: Span },
 }
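
Aside, not part of the diff: with span now carried on every ParsingError variant, a caller can turn a parsing failure into a caret diagnostic. A minimal sketch, assuming ParsingError and Span are re-exported at the crate root (the src/lib.rs hunk below re-exports Span), that UnknownToken, UnexpectedToken, and UnexpectedEof are the only variants, and that line is 1-based while column is 0-based, as src/parser.rs below computes them:

    use dotlr::{ParsingError, Span};

    fn report(input: &str, error: &ParsingError) {
        // Pull the new span out of whichever variant occurred.
        let span: &Span = match error {
            ParsingError::UnknownToken { span, .. } => span,
            ParsingError::UnexpectedToken { span, .. } => span,
            ParsingError::UnexpectedEof { span, .. } => span,
        };
        // line is 1-based, column is 0-based.
        let source_line = input.lines().nth(span.line.saturating_sub(1)).unwrap_or("");
        eprintln!("error: {}", error);
        eprintln!("  | {}", source_line);
        eprintln!("  | {}^", " ".repeat(span.column));
    }
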
1 change: 0 additions & 1 deletion src/grammar.rs
@@ -81,7 +81,6 @@ impl<T: Into<SmolStr>> From<T> for RegexToken {
     }
 }
 
-
 /// Token (e.g., `'+'`, `%f`, `$`) in a grammar.
 #[cfg_attr(feature = "serde", derive(Serialize))]
 #[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))]
4 changes: 4 additions & 0 deletions src/lib.rs
@@ -41,6 +41,10 @@ pub use {
         Trace,
     },
     tree::Tree,
+    utils::{
+        Span,
+        Spanned,
+    },
 };
 
 mod prelude {
78 changes: 59 additions & 19 deletions src/parser.rs
@@ -119,15 +119,27 @@ impl Parser {
     }
 }
 
+fn count_new_lines(slice: &str) -> usize {
+    slice.chars().filter(|&c| c == '\n').count()
+}
+
+
 impl Parser {
     /// Tokenizes an input into a stream of tokens and their corresponding input slices.
-    pub fn tokenize<'i>(&self, input: &'i str) -> Result<Vec<(Token, &'i str)>, ParsingError> {
-        let mut tokens = Vec::new();
+    pub fn tokenize<'i>(
+        &self,
+        input: &'i str,
+    ) -> Result<Vec<(Spanned<Token>, &'i str)>, ParsingError> {
+        let mut tokens: Vec<(Spanned<Token>, &'i str)> = Vec::new();
 
         let mut ordered_constant_tokens = self.grammar.constant_tokens().iter().collect::<Vec<_>>();
         ordered_constant_tokens.sort_by_key(|token| token.len());
 
+
         let mut remaining_input = input.trim_start();
+        let mut offset = input.len() - remaining_input.len();
+        let mut line = count_new_lines(&input[..offset]) + 1;
+        let mut column = offset - input[..offset].rfind('\n').unwrap_or(1) - 1;
         while !remaining_input.is_empty() {
             let mut matching_token = None;
             let mut matching_slice = "";
@@ -148,29 +160,49 @@
                 }
             }
 
+
             if matching_token.is_none() {
+                let span = Span { offset, len: 1, line, column };
                 return Err(ParsingError::UnknownToken {
                     token: format_smolstr!("{}", remaining_input.chars().next().unwrap()),
+                    span,
                 });
             }
 
-            tokens.push((matching_token.unwrap(), matching_slice));
-            remaining_input = remaining_input[matching_slice.len()..].trim();
+            line += count_new_lines(matching_slice);
+            let token = Spanned::new(matching_token.unwrap(), Span {
+                offset,
+                len: matching_slice.len(),
+                line,
+                column,
+            });
+            tokens.push((token, matching_slice));
+            remaining_input = remaining_input[matching_slice.len()..].trim_start();
+            // add back the whitespace that was trimmed
+            let old_offset = offset;
+            offset = input.len() - remaining_input.len();
+            let whitespace = &input[old_offset..offset];
+            line += count_new_lines(whitespace);
+            column = offset - input[..offset].rfind('\n').unwrap_or(1) - 1;
         }
-        tokens.push((Token::Eof, "\0"));
+        let eof = Spanned::new(Token::Eof, Span { offset, len: 0, line, column });
+        tokens.push((eof, "\0"));
 
         Ok(tokens)
     }
 
     /// Parses a tokenized input.
-    pub fn parse<'i>(&self, tokens: Vec<(Token, &'i str)>) -> Result<Tree<'i>, ParsingError> {
+    pub fn parse<'i>(
+        &self,
+        tokens: Vec<(Spanned<Token>, &'i str)>,
+    ) -> Result<Tree<'i>, ParsingError> {
         self.parse_and_trace_internal(tokens, false).map(|(_, tree)| tree)
     }
 
     /// Traces the parsing of a tokenized input.
     pub fn trace<'i>(
         &self,
-        tokens: Vec<(Token, &'i str)>,
+        tokens: Vec<(Spanned<Token>, &'i str)>,
     ) -> Result<(Trace<'i>, Tree<'i>), ParsingError> {
         self.parse_and_trace_internal(tokens, true)
     }
@@ -264,7 +296,7 @@
     /// Internal parsing logic.
     fn parse_and_trace_internal<'i>(
         &self,
-        mut tokens: Vec<(Token, &'i str)>,
+        mut tokens: Vec<(Spanned<Token>, &'i str)>,
         traced: bool,
     ) -> Result<(Trace<'i>, Tree<'i>), ParsingError> {
         let mut state_stack = vec![0];
@@ -279,7 +311,8 @@
         let (mut current_token, mut current_slice) = remaining_tokens.pop().unwrap();
         loop {
             let current_state = *state_stack.last().unwrap();
-            let action_to_take = match self.action_table()[current_state].get(&current_token) {
+            let action_to_take = match self.action_table()[current_state].get(&current_token.value)
+            {
                 Some(actions) => {
                     assert_eq!(actions.len(), 1);
                     *actions.iter().next().unwrap()
@@ -290,18 +323,27 @@
                         expected.push(token.clone());
                     }
 
-                    return Err(if current_token == Token::Eof {
-                        ParsingError::UnexpectedEof { expected }
+                    return Err(if current_token.value == Token::Eof {
+                        ParsingError::UnexpectedEof {
+                            expected,
+                            span: current_token.get_span().clone(),
+                        }
                     } else {
-                        ParsingError::UnexpectedToken { token: current_slice.into(), expected }
+                        ParsingError::UnexpectedToken {
+                            token: current_slice.into(),
+                            expected,
+                            span: current_token.get_span().clone(),
+                        }
                     });
                 },
             };
 
             if traced {
-                let mut remaining_tokens_without_slices =
-                    remaining_tokens.iter().map(|(token, _)| token.clone()).collect::<Vec<_>>();
-                remaining_tokens_without_slices.push(current_token.clone());
+                let mut remaining_tokens_without_slices = remaining_tokens
+                    .iter()
+                    .map(|(token, _)| token.value.clone())
+                    .collect::<Vec<_>>();
+                remaining_tokens_without_slices.push(current_token.value.clone());
 
                 trace.step(Step {
                     state_stack: state_stack.clone(),
@@ -320,11 +362,9 @@
                 return Ok((trace, parse_tree));
             },
             Action::Shift { next_state } => {
+                let (token, span) = current_token.clone().into_tuple();
                 state_stack.push(next_state);
-                tree_stack.push(Tree::Terminal {
-                    token: current_token.clone(),
-                    slice: current_slice,
-                });
+                tree_stack.push(Tree::Terminal { token, span, slice: current_slice });
                 (current_token, current_slice) = remaining_tokens.pop().unwrap();
             },
             Action::Reduce { rule_index } => {
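
Aside, not part of the diff: a self-contained sketch of the line/column arithmetic that tokenize now performs. The committed code computes first-line columns through the rfind('\n').unwrap_or(1) - 1 fallback; the hypothetical helper below uses the offset itself for the no-newline case, which is the intent the later-line arithmetic suggests:

    /// Mirrors the line/column bookkeeping in tokenize above.
    fn line_and_column(input: &str, offset: usize) -> (usize, usize) {
        // line is 1-based: one plus the number of newlines before offset.
        let line = input[..offset].chars().filter(|&c| c == '\n').count() + 1;
        // column is 0-based: bytes since the last newline before offset.
        let column = match input[..offset].rfind('\n') {
            Some(newline) => offset - newline - 1,
            None => offset, // first line; the commit uses unwrap_or(1) - 1 here
        };
        (line, column)
    }

    fn main() {
        // In "a\n  b", token "b" starts at byte offset 4: after the newline
        // at offset 1 and two spaces, i.e. line 2, column 2.
        assert_eq!(line_and_column("a\n  b", 4), (2, 2));
    }
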
1 change: 1 addition & 0 deletions src/tree.rs
@@ -13,6 +13,7 @@ pub enum Tree<'i> {
     Terminal {
         /// Matching token.
         token: Token,
+        span: Span,
         /// Matching slice.
         slice: &'i str,
     },
51 changes: 51 additions & 0 deletions src/utils.rs
@@ -39,3 +39,54 @@ where
     }
     map_serializer.end()
 }
+
+
+#[cfg_attr(feature = "serde", derive(Serialize))]
+#[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))]
+#[derive(Clone, Debug)]
+pub struct Span {
+    pub offset: usize,
+    pub len: usize,
+    pub column: usize,
+    pub line: usize,
+}
+
+
+#[cfg(not(feature = "serde"))]
+#[derive(Clone, Debug)]
+pub struct Spanned<T: Debug + Clone> {
+    pub value: T,
+    span: Span,
+}
+#[cfg(feature = "serde")]
+#[cfg_attr(feature = "serde", derive(Serialize))]
+#[cfg_attr(feature = "serde", serde(crate = "serde_renamed"))]
+#[derive(Clone, Debug)]
+pub struct Spanned<T: Serialize + Debug + Clone> {
+    pub value: T,
+    span: Span,
+}
+
+
+impl<
+        #[cfg(not(feature = "serde"))] T: Debug + Clone,
+        #[cfg(feature = "serde")] T: Serialize + Debug + Clone,
+    > Spanned<T>
+{
+    pub fn new(value: T, span: Span) -> Self {
+        Self { value, span }
+    }
+    pub fn get_span(&self) -> &Span {
+        &self.span
+    }
+
+    pub fn into_tuple(self) -> (T, Span) {
+        (self.value, self.span)
+    }
+    pub fn get_span_value(&self) -> &T {
+        &self.value
+    }
+    pub fn into_span_value(self) -> T {
+        self.value
+    }
+}
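
Aside, not part of the diff: a minimal usage sketch for the new types, assuming the dotlr crate name and the Span/Spanned re-exports added in the src/lib.rs hunk above:

    use dotlr::{Span, Spanned};

    fn main() {
        let span = Span { offset: 0, len: 3, line: 1, column: 0 };
        let spanned = Spanned::new("let", span);

        // Borrowing accessors.
        assert_eq!(*spanned.get_span_value(), "let");
        assert_eq!(spanned.get_span().len, 3);

        // Consuming accessor, as the Action::Shift arm in src/parser.rs uses.
        let (value, span) = spanned.into_tuple();
        assert_eq!(value, "let");
        assert_eq!(span.line, 1);
    }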