rotty · rotty · Aug 24, 2024 · Aug 24, 2024
diff --git a/lexpr/NEWS.md b/lexpr/NEWS.md
@@ -7,6 +7,8 @@ New features:
 - New parser option `leading_digit_symbols` (PR #106). This should now
   allow parsing files produced by recent KiCad versions, thus closing
   #64.
+- Accept symbols starting with an alphabetic Unicode code point,
+  fixing #112.
 
 Fixes:
 

diff --git a/lexpr/src/parse/mod.rs b/lexpr/src/parse/mod.rs
@@ -11,7 +11,7 @@ use std::io;
 use std::str;
 
 use error::ErrorCode;
-use read::{ElispStr, Reference};
+use read::{decode_utf8_sequence, ElispStr, Reference};
 
 use crate::{datum::SpanInfo, Cons, Number, Value};
 
@@ -636,6 +636,14 @@ impl<'de, R: Read<'de>> Parser<R> {
                     _ => Token::Quotation("unquote"),
                 }
             }
+            c if c > 127 => {
+                self.eat_char();
+                let c = decode_utf8_sequence(&mut self.read, &mut self.scratch, c)?;
+                if !c.is_alphabetic() {
+                    return Err(self.peek_error(ErrorCode::ExpectedSomeValue));
+                }
+                Token::Symbol(self.parse_symbol_scratch_suffix()?.into())
+            }
             _ => {
                 if SYMBOL_EXTENDED.contains(&peek) {
                     Token::Symbol(self.parse_symbol()?.into())
@@ -804,15 +812,16 @@ impl<'de, R: Read<'de>> Parser<R> {
 
     fn parse_symbol(&mut self) -> Result<String> {
         self.scratch.clear();
-        match self.read.parse_symbol(&mut self.scratch)? {
-            Reference::Borrowed(s) => Ok(s.into()),
-            Reference::Copied(s) => Ok(s.into()),
-        }
+        self.parse_symbol_scratch_suffix()
     }
 
     fn parse_symbol_suffix(&mut self, prefix: &str) -> Result<String> {
         self.scratch.clear();
         self.scratch.extend(prefix.as_bytes());
+        self.parse_symbol_scratch_suffix()
+    }
+
+    fn parse_symbol_scratch_suffix(&mut self) -> Result<String> {
         match self.read.parse_symbol(&mut self.scratch)? {
             Reference::Borrowed(s) => Ok(s.into()),
             Reference::Copied(s) => Ok(s.into()),

diff --git a/lexpr/src/parse/read.rs b/lexpr/src/parse/read.rs
@@ -1185,7 +1185,7 @@ static DELIMITER: [u8; 12] = [
 
 /// Decode a UTF8 multibyte sequence starting with `initial` and return the
 /// decoded codepoint.
-fn decode_utf8_sequence<'de, R: Read<'de> + ?Sized>(
+pub(crate) fn decode_utf8_sequence<'de, R: Read<'de> + ?Sized>(
     read: &mut R,
     scratch: &mut Vec<u8>,
     initial: u8,

diff --git a/lexpr/src/tests.rs b/lexpr/src/tests.rs
@@ -54,12 +54,12 @@ fn gen_value(g: &mut Gen, depth: usize) -> Value {
         }
         Symbol => {
             let choices = [
-                "foo", "a-symbol", "$?:!", "+", "+foo", "-", "-foo", "..", ".foo",
+                "foo", "a-symbol", "$?:!", "+", "+foo", "-", "-foo", "..", ".foo", "λ-1",
             ];
             Value::symbol(*g.choose(&choices).unwrap())
         }
         Keyword => {
-            let choices = ["foo", "a-keyword", "$?:!"];
+            let choices = ["foo", "a-keyword", "$?:!", "λ-2"];
             Value::keyword(*g.choose(&choices).unwrap())
         }
         Bytes => {

diff --git a/lexpr/tests/print-parse.rs b/lexpr/tests/print-parse.rs
@@ -69,6 +69,11 @@ fn test_unicode_chars() {
     }
 }
 
+#[test]
+fn test_leading_unicode_symbols() {
+    check_roundtrip_default(Value::Symbol("λ-test-ω".into()), "λ-test-ω");
+}
+
 #[test]
 fn test_chars_elisp() {
     for (value, printed) in [