From 6c124693c10dd3e932013fdfa0e1690af8b53af7 Mon Sep 17 00:00:00 2001 From: cenodis <57576911+cenodis@users.noreply.github.com> Date: Sun, 8 Dec 2024 17:09:15 +0100 Subject: [PATCH] Precedence parsing (#1362) * Initial prototype * Update docs Remove unused code * More doc updates * Add feature flags for Vec * Add basic tests * Fix formatting * Add precedence to choosing_a_combinator.md * Fix typo * Minor refractoring * Update docs * Change parameter order * Add alloc feature to the entire precedence module The parser really cant work without it and the helpers dont make much sense without the parser. * Use fail parser to express "no operators of this type" * Document evaluation order * Better documentation for parameters * Fix precedence in documentation * Fix doc formatting * Fix typos * Use map_res when parsing integers * Example test for expressions with function calls and AST generation * Typo * Make evaluation a bit easier to read * Update expression_ast * Update expression_ast doc * Implement ternary operator in expression_ast * Shorten ast nodes * Implement some tests for parser failures * Update feature flags for docs * Properly append errors * Properly bubble up non Error errors * Split operators into 3 distinct types to help with exhaustiveness checks. 
--------- Co-authored-by: Geoffroy Couprie --- Cargo.toml | 4 + doc/choosing_a_combinator.md | 1 + src/error.rs | 3 + src/lib.rs | 2 + src/precedence/mod.rs | 379 +++++++++++++++++++++++++++++++++++ src/precedence/tests.rs | 75 +++++++ tests/expression_ast.rs | 158 +++++++++++++++ 7 files changed, 622 insertions(+) create mode 100644 src/precedence/mod.rs create mode 100644 src/precedence/tests.rs create mode 100644 tests/expression_ast.rs diff --git a/Cargo.toml b/Cargo.toml index ec1933b31..d88e70a44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,6 +66,10 @@ name = "css" [[test]] name = "custom_errors" +[[test]] +name = "expression_ast" +required-features = ["alloc"] + [[test]] name = "float" diff --git a/doc/choosing_a_combinator.md b/doc/choosing_a_combinator.md index 3363f63c5..dfdee0940 100644 --- a/doc/choosing_a_combinator.md +++ b/doc/choosing_a_combinator.md @@ -106,6 +106,7 @@ The following parsers could be found on [docs.rs number section](https://docs.rs - [`escaped`](https://docs.rs/nom/latest/nom/bytes/complete/fn.escaped.html): Matches a byte string with escaped characters - [`escaped_transform`](https://docs.rs/nom/latest/nom/bytes/complete/fn.escaped_transform.html): Matches a byte string with escaped characters, and returns a new string with the escaped characters replaced +- [`precedence`](https://docs.rs/nom/latest/nom/precedence/fn.precedence.html): Parses an expression with regards to operator precedence ## Binary format parsing diff --git a/src/error.rs b/src/error.rs index bec263b53..dac6a4007 100644 --- a/src/error.rs +++ b/src/error.rs @@ -310,6 +310,7 @@ pub enum ErrorKind { Fail, Many, Fold, + Precedence, } #[rustfmt::skip] @@ -373,6 +374,7 @@ pub fn error_to_u32(e: &ErrorKind) -> u32 { ErrorKind::Many => 76, ErrorKind::Fold => 77, ErrorKind::BinDigit => 78, + ErrorKind::Precedence => 79, } } @@ -438,6 +440,7 @@ impl ErrorKind { ErrorKind::Fail => "Fail", ErrorKind::Many => "Many", ErrorKind::Fold => "Fold", + ErrorKind::Precedence => 
"Precedence", } } } diff --git a/src/lib.rs b/src/lib.rs index c82715a3b..db4c8703b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -451,6 +451,8 @@ pub mod bytes; pub mod character; +pub mod precedence; + mod str; pub mod number; diff --git a/src/precedence/mod.rs b/src/precedence/mod.rs new file mode 100644 index 000000000..c518fdff3 --- /dev/null +++ b/src/precedence/mod.rs @@ -0,0 +1,379 @@ +//! Combinators to parse expressions with operator precedence. +#![cfg(feature="alloc")] +#![cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] + +#[cfg(test)] +mod tests; + +use crate::error::{ErrorKind, FromExternalError, ParseError}; +use crate::lib::std::vec::Vec; +use crate::{Err, IResult, Parser}; + +/// An unary operator. +pub struct Unary { + value: V, + precedence: Q, +} + +/// A binary operator. +pub struct Binary { + value: V, + precedence: Q, + assoc: Assoc, +} + +/// A single evaluation step. +pub enum Operation { + /// A prefix operation. + Prefix(P1, O), + /// A postfix operation. + Postfix(O, P2), + /// A binary operation. + Binary(O, P3, O), +} + +/// Associativity for binary operators. +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum Assoc { + /// Left associative. + Left, + /// Right associative. + Right, +} + +/// Element for operator stack. +enum Operator { + Prefix(P1, Q), + Postfix(P2, Q), + Binary(P3, Q, Assoc), +} + +impl Operator +where + Q: Ord + Copy, +{ + fn precedence(&self) -> Q { + match self { + Operator::Prefix(_, p) => *p, + Operator::Postfix(_, p) => *p, + Operator::Binary(_, p, _) => *p, + } + } + + fn is_postfix(&self) -> bool { + match self { + Operator::Postfix(_, _) => true, + _ => false, + } + } +} + +/// Runs the inner parser and transforms the result into an unary operator with the given precedence. +/// +/// Intended for use with [precedence]. +/// # Arguments +/// * `precedence` The precedence of the operator. +/// * `parser` The parser to apply. 
+pub fn unary_op( + precedence: Q, + mut parser: P, +) -> impl FnMut(I) -> IResult, E> +where + P: Parser, + Q: Ord + Copy, +{ + move |input| match parser.parse(input) { + Ok((i, value)) => Ok(( + i, + Unary { + value, + precedence, + }, + )), + Err(e) => Err(e), + } +} + +/// Runs the inner parser and transforms the result into a binary operator with the given precedence and associativity. +/// +/// Intended for use with [precedence]. +/// # Arguments +/// * `precedence` The precedence of the operator. +/// * `assoc` The associativity of the operator. +/// * `parser` The parser to apply. +pub fn binary_op( + precedence: Q, + assoc: Assoc, + mut parser: P, +) -> impl FnMut(I) -> IResult, E> +where + P: Parser, + Q: Ord + Copy, +{ + move |input| match parser.parse(input) { + Ok((i, value)) => Ok(( + i, + Binary { + value, + precedence, + assoc, + }, + )), + Err(e) => Err(e), + } +} + +/// Parses an expression with operator precedence. +/// +/// Supports prefix, postfix and binary operators. Operators are applied in ascending precedence. +/// +/// The parser will track its current position inside the expression and call the respective +/// operand/operator parsers. The prefix and postfix parsers are called repeatedly until they fail before +/// execution moves on to the operand or binary parser. +/// +/// Expressions are folded as soon as possible. The result will be reused as another operand. After the +/// expression has been read completely any remaining operations are folded and the resulting, single +/// operand is returned as the result. +/// +/// It will return `Err(Err::Error((_, ErrorKind::Precedence)))` if: +/// * the `fold` function returns an `Err`. +/// * more than one or no operands remain after the expression has been evaluated completely. +/// * the input does not match the pattern: `prefix* operand postfix* (binary prefix* operand postfix*)*` +/// +/// # Arguments +/// * `prefix` Parser for prefix unary operators. 
+/// * `postfix` Parser for postfix unary operators. +/// * `binary` Parser for binary operators. +/// * `operand` Parser for operands. +/// * `fold` Function that evaluates a single operation and returns the result. +/// +/// # Example +/// ```rust +/// # use nom::{Err, error::{Error, ErrorKind}, IResult}; +/// use nom::precedence::{precedence, unary_op, binary_op, Assoc, Operation}; +/// use nom::character::complete::digit1; +/// use nom::combinator::{map_res, fail}; +/// use nom::sequence::delimited; +/// use nom::bytes::complete::tag; +/// use nom::branch::alt; +/// +/// fn parser(i: &str) -> IResult<&str, i64> { +/// precedence( +/// unary_op(1, tag("-")), +/// fail, +/// alt(( +/// binary_op(2, Assoc::Left, tag("*")), +/// binary_op(2, Assoc::Left, tag("/")), +/// binary_op(3, Assoc::Left, tag("+")), +/// binary_op(3, Assoc::Left, tag("-")), +/// )), +/// alt(( +/// map_res(digit1, |s: &str| s.parse::<i64>()), +/// delimited(tag("("), parser, tag(")")), +/// )), +/// |op: Operation<&str, &str, &str, i64>| { +/// use nom::precedence::Operation::*; +/// match op { +/// Prefix("-", o) => Ok(-o), +/// Binary(lhs, "*", rhs) => Ok(lhs * rhs), +/// Binary(lhs, "/", rhs) => Ok(lhs / rhs), +/// Binary(lhs, "+", rhs) => Ok(lhs + rhs), +/// Binary(lhs, "-", rhs) => Ok(lhs - rhs), +/// _ => Err("Invalid combination"), +/// } +/// } +/// )(i) +/// } +/// +/// assert_eq!(parser("8-2*2"), Ok(("", 4))); +/// assert_eq!(parser("4-(2+2)"), Ok(("", 0))); +/// assert_eq!(parser("3-(2*3)+7+2*2-(2*(2+4))"), Ok(("", -4))); +/// ``` +/// +/// # Evaluation order +/// This parser reads expressions from left to right and folds operations as soon as possible. This +/// behaviour is only important when using an operator grammar that allows for ambiguous expressions. +/// +/// For example, the expression `-a++**b` is ambiguous with the following precedence. 
+/// +/// | Operator | Position | Precedence | Associativity | +/// |----------|----------|------------|---------------| +/// | ** | Binary | 1 | Right | +/// | - | Prefix | 2 | N/A | +/// | ++ | Postfix | 3 | N/A | +/// +/// The expression can be parsed in two ways: `-((a++)**b)` or `((-a)++)**b`. This parser will always +/// parse it as the latter because of how it evaluates expressions: +/// * It reads, left-to-right, the first two operators `-a++`. +/// * Because the minus takes precedence over the increment it is evaluated immediately `(-a)++`. +/// * It then reads the remaining input and evaluates the increment next in order to preserve its +/// position in the expression \ +/// `((-a)++)**b`. +#[cfg_attr(feature = "docsrs", doc(cfg(feature = "alloc")))] +pub fn precedence( + mut prefix: H1, + mut postfix: H2, + mut binary: H3, + mut operand: F, + mut fold: G, +) -> impl FnMut(I) -> IResult +where + I: Clone + PartialEq, + E: ParseError + FromExternalError, + F: Parser, + G: FnMut(Operation) -> Result, + H1: Parser, E>, + H2: Parser, E>, + H3: Parser, E>, + Q: Ord + Copy, +{ + move |mut i| { + let mut operands = Vec::new(); + let mut operators = Vec::new(); + let mut i1 = i.clone(); + + 'main: loop { + 'prefix: loop { + match prefix.parse(i1.clone()) { + Err(Err::Error(_)) => break 'prefix, + Err(e) => return Err(e), + Ok((i2, o)) => { + // infinite loop check: the parser must always consume + if i2 == i1 { + return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))); + } + i1 = i2; + operators.push(Operator::Prefix(o.value, o.precedence)); + } + } + } + + let (i2, o) = match operand.parse(i1.clone()) { + Ok((i, o)) => (i, o), + Err(Err::Error(e)) => return Err(Err::Error(E::append(i, ErrorKind::Precedence, e))), + Err(e) => return Err(e), + }; + i1 = i2; + operands.push(o); + + 'postfix: loop { + match postfix.parse(i1.clone()) { + Err(Err::Error(_)) => break 'postfix, + Err(e) => return Err(e), + Ok((i2, o)) => { + // infinite loop check: the 
parser must always consume + if i2 == i1 { + return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))); + } + + while operators + .last() + .map(|op| op.precedence() <= o.precedence) + .unwrap_or(false) + { + let value = operands.pop().unwrap(); + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))), + }, + }; + let result = match fold(operation) { + Err(e) => { + return Err(Err::Error(E::from_external_error( + i, + ErrorKind::Precedence, + e, + ))) + } + Ok(r) => r, + }; + operands.push(result); + } + i1 = i2; + operators.push(Operator::Postfix(o.value, o.precedence)); + } + } + } + + match binary.parse(i1.clone()) { + Err(Err::Error(_)) => break 'main, + Err(e) => return Err(e), + Ok((i2, o)) => { + while operators + .last() + .map(|op| { + op.precedence() < o.precedence + || (o.assoc == Assoc::Left && op.precedence() == o.precedence) + || (op.is_postfix()) + }) + .unwrap_or(false) + { + let value = operands.pop().unwrap(); + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => return Err(Err::Error(E::from_error_kind(i1, ErrorKind::Precedence))), + }, + }; + let result = match fold(operation) { + Err(e) => { + return Err(Err::Error(E::from_external_error( + i, + ErrorKind::Precedence, + e, + ))) + } + Ok(r) => r, + }; + operands.push(result); + } + operators.push(Operator::Binary(o.value, o.precedence, o.assoc)); + i1 = i2; + } + } + + // infinite loop check: either operand or operator must consume input + if i == i1 { + return 
Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))); + } + i = i1.clone(); + } + + while operators.len() > 0 { + let value = match operands.pop() { + Some(o) => o, + None => return Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))), + }; + let operation = match operators.pop().unwrap() { + Operator::Prefix(op, _) => Operation::Prefix(op, value), + Operator::Postfix(op, _) => Operation::Postfix(value, op), + Operator::Binary(op, _, _) => match operands.pop() { + Some(lhs) => Operation::Binary(lhs, op, value), + None => return Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))), + }, + }; + let result = match fold(operation) { + Ok(r) => r, + Err(e) => { + return Err(Err::Error(E::from_external_error( + i, + ErrorKind::Precedence, + e, + ))) + } + }; + operands.push(result); + } + + if operands.len() == 1 { + return Ok((i1, operands.pop().unwrap())); + } else { + return Err(Err::Error(E::from_error_kind(i, ErrorKind::Precedence))); + } + } +} diff --git a/src/precedence/tests.rs b/src/precedence/tests.rs new file mode 100644 index 000000000..f697730cf --- /dev/null +++ b/src/precedence/tests.rs @@ -0,0 +1,75 @@ +use crate::precedence::{binary_op, unary_op, Assoc, Operation}; +use crate::{ + branch::alt, + bytes::complete::tag, + character::complete::digit1, + combinator::{map_res, fail}, + internal::{Err, IResult}, + sequence::delimited, + error::ErrorKind, +}; + +#[cfg(feature = "alloc")] +use crate::precedence::precedence; + +#[cfg(feature = "alloc")] +fn parser(i: &str) -> IResult<&str, i64> { + precedence( + unary_op(1, tag("-")), + fail, + alt(( + binary_op(2, Assoc::Left, tag("*")), + binary_op(2, Assoc::Left, tag("/")), + binary_op(3, Assoc::Left, tag("+")), + binary_op(3, Assoc::Left, tag("-")), + )), + alt(( + map_res(digit1, |s: &str| s.parse::()), + delimited(tag("("), parser, tag(")")), + )), + |op: Operation<&str, (), &str, i64>| { + use crate::precedence::Operation::*; + match op { + Prefix("-", o) => Ok(-o), + Binary(lhs, 
"*", rhs) => Ok(lhs * rhs), + Binary(lhs, "/", rhs) => Ok(lhs / rhs), + Binary(lhs, "+", rhs) => Ok(lhs + rhs), + Binary(lhs, "-", rhs) => Ok(lhs - rhs), + _ => Err("Invalid combination"), + } + }, + )(i) +} + +#[test] +#[cfg(feature = "alloc")] +fn precedence_test() { + assert_eq!(parser("3"), Ok(("", 3))); + assert_eq!(parser("-3"), Ok(("", -3))); + assert_eq!(parser("4-(2*2)"), Ok(("", 0))); + assert_eq!(parser("4-2*2"), Ok(("", 0))); + assert_eq!(parser("(4-2)*2"), Ok(("", 4))); + assert_eq!(parser("2*2/1"), Ok(("", 4))); + + let a = "a"; + + assert_eq!( + parser(a), + Err(Err::Error(error_node_position!( + &a[..], + ErrorKind::Precedence, + error_position!(&a[..], ErrorKind::Tag) + ))) + ); + + let b = "3+b"; + + assert_eq!( + parser(b), + Err(Err::Error(error_node_position!( + &b[2..], + ErrorKind::Precedence, + error_position!(&b[2..], ErrorKind::Tag) + ))) + ); +} diff --git a/tests/expression_ast.rs b/tests/expression_ast.rs new file mode 100644 index 000000000..19ced17e0 --- /dev/null +++ b/tests/expression_ast.rs @@ -0,0 +1,158 @@ +use nom::{ + branch::alt, + bytes::complete::tag, + character::complete::{digit1 as digit, alphanumeric1 as alphanumeric}, + combinator::{map_res, map}, + multi::separated_list0, + sequence::delimited, + IResult, + precedence::{precedence, Assoc, binary_op, unary_op, Operation}, +}; + +// Elements of the abstract syntax tree (ast) that represents an expression. +#[derive(Debug)] +pub enum Expr { + // A number literal. + Num(i64), + // An identifier. + Iden(String), + // Arithmetic operations. Each have a left hand side (lhs) and a right hand side (rhs). + Add(Box, Box), + Sub(Box, Box), + Mul(Box, Box), + Div(Box, Box), + // The function call operation. Left is the expression the function is called on, right is the list of parameters. + Call(Box, Vec), + // The ternary operator, the expressions from left to right are: The condition, the true case, the false case. + Tern(Box, Box, Box), +} + +// Prefix operators. 
+enum PrefixOp { + Identity, // + + Negate, // - +} + +// Postfix operators. +enum PostfixOp { + // The function call operator. In addition to its own representation "()" it carries additional information that we need to keep here. + // Specifically the vector of expressions that make up the parameters. + Call(Vec), // () +} + +// Binary operators. +enum BinaryOp { + Addition, // + + Subtraction, // - + Multiplication, // * + Division, // / + // The ternary operator can contain a single expression. + Ternary(Expr), // ?: +} + +// Parser for function calls. +fn function_call(i: &str) -> IResult<&str, PostfixOp> { + map( + delimited( + tag("("), + // Subexpressions are evaluated by recursing back into the expression parser. + separated_list0(tag(","), expression), + tag(")") + ), + |v: Vec| PostfixOp::Call(v) + )(i) +} + +// The ternary operator is actually just a binary operator that contains another expression. So it can be +// handled similarly to the function call operator except it's in a binary position and can only contain +// a single expression. +// +// For example the expression "a?b:c" is read as the operand "a", the binary operator "?b:" (which carries +// the subexpression "b") and the operand "c". +fn ternary_operator(i: &str) -> IResult<&str, BinaryOp> { + map( + delimited( + tag("?"), + expression, + tag(":") + ), + |e: Expr| BinaryOp::Ternary(e) + )(i) +} + +// The actual expression parser. +fn expression(i: &str) -> IResult<&str, Expr> { + precedence( + alt(( + unary_op(2, map(tag("+"), |_| PrefixOp::Identity)), + unary_op(2, map(tag("-"), |_| PrefixOp::Negate)), + )), + // Function calls are implemented as postfix unary operators. + unary_op(1, function_call), + alt(( + binary_op(3, Assoc::Left, alt(( + map(tag("*"), |_| BinaryOp::Multiplication), + map(tag("/"), |_| BinaryOp::Division), + ))), + binary_op(4, Assoc::Left, alt(( + map(tag("+"), |_| BinaryOp::Addition), + map(tag("-"), |_| BinaryOp::Subtraction), + ))), + // Ternary operators are just binary operators with a subexpression. 
+ binary_op(5, Assoc::Right, ternary_operator), + )), + alt(( + map_res(digit, + |s: &str| match s.parse::() { + Ok(s) => Ok(Expr::Num(s)), + Err(e) => Err(e), + } + ), + map(alphanumeric, |s: &str| Expr::Iden(s.to_string())), + delimited(tag("("), expression, tag(")")), + )), + |op: Operation| -> Result { + use nom::precedence::Operation::*; + use PrefixOp::*; + use PostfixOp::*; + use BinaryOp::*; + match op { + // The identity operator (prefix +) is ignored. + Prefix(Identity, e) => Ok(e), + + // Unary minus gets evaluated to the same representation as a multiplication with -1. + Prefix(Negate, e) => Ok(Expr::Mul(Expr::Num(-1).into(), e.into())), + + // The list of parameters are taken from the operator and placed into the ast. + Postfix(e, Call(p)) => Ok(Expr::Call(e.into(), p)), + + // Meaning is assigned to the expressions of the ternary operator during evaluation. + // The lhs becomes the condition, the contained expression is the true case, rhs the false case. + Binary(lhs, Ternary(e), rhs) => Ok(Expr::Tern(lhs.into(), e.into(), rhs.into())), + + // Raw operators get turned into their respective ast nodes. + Binary(lhs, Multiplication, rhs) => Ok(Expr::Mul(lhs.into(), rhs.into())), + Binary(lhs, Division, rhs) => Ok(Expr::Div(lhs.into(), rhs.into())), + Binary(lhs, Addition, rhs) => Ok(Expr::Add(lhs.into(), rhs.into())), + Binary(lhs, Subtraction, rhs) => Ok(Expr::Sub(lhs.into(), rhs.into())), + } + } + )(i) +} + +#[test] +fn expression_test() { + assert_eq!( + expression("-2*max(2,3)-2").map(|(i, x)| (i, format!("{:?}", x))), + Ok(("", String::from("Sub(Mul(Mul(Num(-1), Num(2)), Call(Iden(\"max\"), [Num(2), Num(3)])), Num(2))"))) + ); + + assert_eq!( + expression("a?2+c:-2*2").map(|(i, x)| (i, format!("{:?}", x))), + Ok(("", String::from("Tern(Iden(\"a\"), Add(Num(2), Iden(\"c\")), Mul(Mul(Num(-1), Num(2)), Num(2)))"))) + ); +}