Use both char and byte positions (#458)

NomicFoundation · May 12, 2023 · c0fc7e9 · c0fc7e9
1 parent 6a05423
commit c0fc7e9
Show file tree

Hide file tree

Showing 22 changed files with 331 additions and 138 deletions.
diff --git a/.changeset/famous-falcons-lie.md b/.changeset/famous-falcons-lie.md
@@ -0,0 +1,5 @@
+---
+"changelog": minor
+---
+
+Record both character and byte offsets for input positions
diff --git a/crates/codegen/syntax/src/rust_lib_code_generator.rs b/crates/codegen/syntax/src/rust_lib_code_generator.rs
@@ -93,7 +93,7 @@ impl CodeGenerator {
                             let message = format!(\"ProductionKind {{production_kind}} is not valid in this version of {grammar_title}\");
                             ParseOutput {{
                                 parse_tree: None,
-                                errors: vec![ParseError::new(0, message)]
+                                errors: vec![ParseError::new(Default::default(), message)]
                             }}
                         }})
                     }}

diff --git a/crates/codegen/syntax_templates/src/rust/cst_visitor.rs b/crates/codegen/syntax_templates/src/rust/cst_visitor.rs
@@ -1,14 +1,15 @@
-use std::{ops::Range, rc::Rc};
+use std::rc::Rc;
 
 use super::cst::*;
 use super::kinds::*;
+use super::language::TextRange;
 
 #[allow(unused_variables)]
 pub trait Visitor<E> {
     fn enter_rule(
         &mut self,
         kind: RuleKind,
-        range: &Range<usize>,
+        range: &TextRange,
         children: &Vec<Rc<Node>>,
         node: &Rc<Node>,
         path: &Vec<Rc<Node>>,
@@ -19,7 +20,7 @@ pub trait Visitor<E> {
     fn exit_rule(
         &mut self,
         kind: RuleKind,
-        range: &Range<usize>,
+        range: &TextRange,
         children: &Vec<Rc<Node>>,
         node: &Rc<Node>,
         path: &Vec<Rc<Node>>,
@@ -30,7 +31,7 @@ pub trait Visitor<E> {
     fn enter_token(
         &mut self,
         kind: TokenKind,
-        range: &Range<usize>,
+        range: &TextRange,
         trivia: &Vec<Rc<Node>>,
         node: &Rc<Node>,
         path: &Vec<Rc<Node>>,
@@ -41,7 +42,7 @@ pub trait Visitor<E> {
     fn exit_token(
         &mut self,
         kind: TokenKind,
-        range: &Range<usize>,
+        range: &TextRange,
         trivia: &Vec<Rc<Node>>,
         node: &Rc<Node>,
         path: &Vec<Rc<Node>>,

diff --git a/crates/codegen/syntax_templates/src/rust/parser_output.rs b/crates/codegen/syntax_templates/src/rust/parser_output.rs
@@ -1,6 +1,9 @@
 use std::{collections::BTreeSet, rc::Rc};
 
-use super::{cst, language::render_error_report};
+use super::{
+    cst,
+    language::{render_error_report, TextPosition},
+};
 
 #[derive(PartialEq)]
 pub struct ParseOutput {
@@ -24,13 +27,13 @@ impl ParseOutput {
 
 #[derive(PartialEq)]
 pub struct ParseError {
-    pub(crate) position: usize,
+    pub(crate) position: TextPosition,
     pub(crate) expected: BTreeSet<String>,
 }
 
 impl ParseError {
-    pub fn position(&self) -> usize {
-        return self.position;
+    pub fn position(&self) -> &TextPosition {
+        return &self.position;
     }
 
     pub fn expected(&self) -> &BTreeSet<String> {

diff --git a/crates/codegen/syntax_templates/src/shared/cst.rs b/crates/codegen/syntax_templates/src/shared/cst.rs
@@ -5,31 +5,32 @@ use std::rc::Rc;
 use serde::Serialize;
 
 use super::kinds::*;
+use super::language::TextRange;
 
 #[derive(Clone, Debug, PartialEq, Eq, Serialize)]
 pub enum Node {
     Rule {
         kind: RuleKind,
-        range: Range<usize>,
+        range: TextRange,
         children: Vec<Rc<Node>>,
     },
     Token {
         kind: TokenKind,
-        range: Range<usize>,
+        range: TextRange,
         #[serde(skip_serializing_if = "Vec::is_empty")]
         trivia: Vec<Rc<Node>>,
     },
 }
 
 impl Node {
-    pub fn range(&self) -> Range<usize> {
+    pub fn range(&self) -> TextRange {
         match self {
             Self::Rule { range, .. } => range.clone(),
             Self::Token { range, .. } => range.clone(),
         }
     }
 
-    pub fn range_including_trivia(&self) -> Range<usize> {
+    pub fn range_including_trivia(&self) -> TextRange {
         match self {
             Self::Rule { range, .. } => range.clone(),
             Self::Token { range, trivia, .. } => {
@@ -76,7 +77,7 @@ impl Node {
             }
         }
         let range = if flattened_children.is_empty() {
-            Range { start: 0, end: 0 }
+            Default::default()
         } else {
             Range {
                 start: flattened_children
@@ -101,7 +102,7 @@ impl Node {
     #[allow(dead_code)]
     pub(crate) fn token(
         kind: TokenKind,
-        range: Range<usize>,
+        range: TextRange,
         leading_trivia: Option<Rc<Self>>,
         trailing_trivia: Option<Rc<Self>>,
     ) -> Rc<Self> {

diff --git a/crates/codegen/syntax_templates/src/shared/language.rs b/crates/codegen/syntax_templates/src/shared/language.rs
@@ -1,7 +1,9 @@
+use std::fmt::Display;
 pub use std::{collections::BTreeSet, ops::Range, rc::Rc};
 
 #[allow(deprecated, unused_imports)]
 use semver::Version;
+use serde::Serialize;
 
 pub use super::{
     cst,
@@ -12,7 +14,7 @@ pub use super::{
 const DEBUG_ERROR_MERGING: bool = false;
 
 impl ParseError {
-    pub(crate) fn new<T: Into<String>>(position: usize, expected: T) -> Self {
+    pub(crate) fn new<T: Into<String>>(position: TextPosition, expected: T) -> Self {
         Self {
             position,
             expected: BTreeSet::from([expected.into()]),
@@ -71,41 +73,68 @@ pub enum ParserResult {
     },
 }
 
+/// A location in the source text, tracked in two units at once.
+///
+/// `Stream::next` advances `byte` by the UTF-8 length of each consumed
+/// character and `char` by one, so the two values diverge on non-ASCII input.
+#[derive(Default, Copy, Clone, PartialEq, Eq, Debug, Serialize)]
+pub struct TextPosition {
+    /// Offset in UTF-8 bytes; this is the value used to slice the source `&str`.
+    pub byte: usize,
+    /// Offset counted in Rust `char`s (Unicode scalar values).
+    pub char: usize,
+}
+
+/// A span of source text expressed as a `start..end` pair of [`TextPosition`]s.
+pub type TextRange = Range<TextPosition>;
+
+// Delegates to `Ord` so that the comparison operators (`<`, `>`, ...) can never
+// disagree with `cmp`/`max`/`min`. Ordering here by `char` while `Ord::cmp`
+// orders by `byte` would violate the `partial_cmp(a, b) == Some(cmp(a, b))`
+// contract that the standard library expects of types implementing both traits.
+impl PartialOrd for TextPosition {
+    /// Always returns `Some`: the ordering is total and is exactly `Ord::cmp`.
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// Total order over positions, decided solely by the `byte` offset.
+impl Ord for TextPosition {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        let (lhs, rhs) = (self.byte, other.byte);
+        usize::cmp(&lhs, &rhs)
+    }
+}
+
+/// Renders only the character offset; the byte offset is not printed.
+impl Display for TextPosition {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Hand the caller's formatter straight through so width/fill flags still apply.
+        Display::fmt(&self.char, f)
+    }
+}
+
 pub struct Stream<'s> {
     source: &'s str,
-    position: usize,
-    undo_position: usize,
+    position: TextPosition,
+    undo_position: TextPosition,
     has_undo: bool,
 }
 
 impl<'s> Stream<'s> {
     pub fn new(source: &'s str) -> Self {
         Self {
             source,
-            position: 0,
-            undo_position: 0,
+            position: Default::default(),
+            undo_position: Default::default(),
             has_undo: false,
         }
     }
 
-    pub fn position(&self) -> usize {
+    pub fn position(&self) -> TextPosition {
         self.position
     }
 
-    pub fn set_position(&mut self, position: usize) {
+    pub fn set_position(&mut self, position: TextPosition) {
         self.position = position;
     }
 
     pub fn peek(&self) -> Option<char> {
-        self.source[self.position..].chars().next()
+        self.source[self.position.byte..].chars().next()
     }
 
     pub fn next(&mut self) -> Option<char> {
         self.has_undo = true;
         self.undo_position = self.position;
-        let mut chars = self.source[self.position..].chars();
+        let mut chars = self.source[self.position.byte..].chars();
         if let Some(c) = chars.next() {
-            self.position += c.len_utf8();
+            self.position.byte += c.len_utf8();
+            self.position.char += 1;
             Some(c)
         } else {
             None
@@ -146,22 +175,26 @@ pub(crate) fn render_error_report(
         );
 
         if DEBUG_ERROR_MERGING {
-            format!("{position}: {message}", position = error.position)
+            format!("{position}: {message}", position = source_start.char)
         } else {
             message
         }
     };
 
     if source.is_empty() {
-        return format!("{kind}: {message}\n   ─[{source_id}:{source_start}:{source_end}]");
+        return format!(
+            "{kind}: {message}\n   ─[{source_id}:{source_start}:{source_end}]",
+            source_start = source_start.char,
+            source_end = source_end.char
+        );
     }
 
-    let mut builder = Report::build(kind, source_id, source_start)
+    let mut builder = Report::build(kind, source_id, source_start.byte)
         .with_config(Config::default().with_color(with_color))
         .with_message(message);
 
     builder.add_label(
-        Label::new((source_id, source_start..source_end))
+        Label::new((source_id, source_start.char..source_end.char))
             .with_color(color)
             .with_message("Error occurred here.".to_string()),
     );
@@ -196,7 +229,7 @@ where
                 parse_tree: Some(cst::Node::token(
                     kind,
                     Range {
-                        start: 0,
+                        start: Default::default(),
                         end: stream.position(),
                     },
                     None,
@@ -231,7 +264,7 @@ where
                 parse_tree: Some(cst::Node::token(
                     kind,
                     Range {
-                        start: 0,
+                        start: Default::default(),
                         end: stream.position(),
                     },
                     None,

diff --git a/crates/codegen/syntax_templates/src/typescript/cst_types.rs b/crates/codegen/syntax_templates/src/typescript/cst_types.rs
@@ -33,16 +33,28 @@ impl RuleNode {
         }
     }
 
-    #[napi(getter)]
-    pub fn range(&self) -> [usize; 2] {
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn byte_range(&self) -> [usize; 2] {
         let range = self.0.range();
-        [range.start, range.end]
+        [range.start.byte, range.end.byte]
     }
 
-    #[napi(getter)]
-    pub fn range_including_trivia(&self) -> [usize; 2] {
+    // Character-offset bounds of the range reported by the wrapped node's `range()`.
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn char_range(&self) -> [usize; 2] {
+        let std::ops::Range { start, end } = self.0.range();
+        [start.char, end.char]
+    }
+
+    // Byte-offset bounds of the range reported by `range_including_trivia()`.
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn byte_range_including_trivia(&self) -> [usize; 2] {
+        let std::ops::Range { start, end } = self.0.range_including_trivia();
+        [start.byte, end.byte]
+    }
+
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn char_range_including_trivia(&self) -> [usize; 2] {
         let range = self.0.range_including_trivia();
-        [range.start, range.end]
+        [range.start.char, range.end.char]
     }
 
     #[napi(ts_return_type = "(RuleNode | TokenNode)[]")]
@@ -69,16 +81,28 @@ impl TokenNode {
         }
     }
 
-    #[napi(getter)]
-    pub fn range(&self) -> [usize; 2] {
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn byte_range(&self) -> [usize; 2] {
         let range = self.0.range();
-        [range.start, range.end]
+        [range.start.byte, range.end.byte]
     }
 
-    #[napi(getter)]
-    pub fn range_including_trivia(&self) -> [usize; 2] {
+    // Character-offset bounds of the range reported by the wrapped node's `range()`.
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn char_range(&self) -> [usize; 2] {
+        let std::ops::Range { start, end } = self.0.range();
+        [start.char, end.char]
+    }
+
+    // Byte-offset bounds of the range reported by `range_including_trivia()`.
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn byte_range_including_trivia(&self) -> [usize; 2] {
+        let std::ops::Range { start, end } = self.0.range_including_trivia();
+        [start.byte, end.byte]
+    }
+
+    #[napi(getter, ts_return_type = "[ start: number, end: number ]")]
+    pub fn char_range_including_trivia(&self) -> [usize; 2] {
         let range = self.0.range_including_trivia();
-        [range.start, range.end]
+        [range.start.char, range.end.char]
     }
 
     #[napi(ts_return_type = "(RuleNode | TokenNode)[]")]

diff --git a/crates/codegen/syntax_templates/src/typescript/parser_output.rs b/crates/codegen/syntax_templates/src/typescript/parser_output.rs
@@ -1,7 +1,9 @@
 use std::{collections::BTreeSet, rc::Rc};
 
 use super::{
-    cst, cst_types::RcNodeExtensions as CSTRcNodeExtensions, language::render_error_report,
+    cst,
+    cst_types::RcNodeExtensions as CSTRcNodeExtensions,
+    language::{render_error_report, TextPosition},
 };
 use napi::bindgen_prelude::*;
 
@@ -32,15 +34,20 @@ impl ParseOutput {
 #[napi]
 #[derive(PartialEq, Clone)]
 pub struct ParseError {
-    pub(crate) position: usize,
+    pub(crate) position: TextPosition,
     pub(crate) expected: BTreeSet<String>,
 }
 
 #[napi]
 impl ParseError {
     #[napi(getter)]
-    pub fn position(&self) -> usize {
-        return self.position;
+    pub fn byte_position(&self) -> usize {
+        return self.position.byte;
+    }
+
+    #[napi(getter)]
+    pub fn char_position(&self) -> usize {
+        return self.position.char;
     }
 
     #[napi]