Preprocessor: add support for UCN identifiers
Closes #823
ehaas committed Feb 9, 2025
1 parent 8017436 commit 285b80d
Showing 4 changed files with 313 additions and 4 deletions.
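Context for the change: in C99 and later, a universal character name such as \u00E9 may appear in an identifier and names the same character as writing é directly, so caf\u00E9 and café denote the same identifier. This commit teaches the tokenizer to classify such escaped spellings as extended identifiers and the preprocessor to decode the escapes to UTF-8, with diagnostics for incomplete, out-of-range, control-character, or basic-character UCNs, before the tokens are handed on.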
88 changes: 85 additions & 3 deletions src/aro/Preprocessor.zig
@@ -16,6 +16,7 @@ const RawToken = Tokenizer.Token;
const Tree = @import("Tree.zig");
const Token = Tree.Token;
const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs;
const ucn = @import("ucn.zig");

const DefineMap = std.StringHashMapUnmanaged(Macro);
const RawTokenList = std.ArrayList(RawToken);
@@ -991,7 +992,7 @@ fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool {
}
},
}
pp.addTokenAssumeCapacity(tok);
pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok));
}
try pp.addToken(.{
.id = .eof,
@@ -2398,6 +2399,86 @@ fn expandMacroExhaustive(
buf.items.len = moving_end_idx;
}

fn writeUnescapedChar(pp: *Preprocessor, decoded: ucn.DecodedUniversalChar, loc: Source.Location, offset: u32) !void {
pp.comp.generated_buf.appendSliceAssumeCapacity(decoded.buf);
switch (decoded.kind) {
.normal => {},
.ucn => {
// TODO: UCN not allowed before C99
},
.control => {
try pp.comp.addDiagnostic(.{
.tag = .ucn_control_char_error,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.not_allowed => {
try pp.comp.addDiagnostic(.{
.tag = .ucn_basic_char_error,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
.extra = .{ .ascii = @intCast(decoded.buf[0]) },
}, &.{});
},
.incomplete_ucn => {
try pp.comp.addDiagnostic(.{
.tag = .incomplete_universal_character,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.invalid_utf_8 => {
try pp.comp.addDiagnostic(.{
.tag = .invalid_utf8,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.invalid_codepoint => {
try pp.comp.addDiagnostic(.{
.tag = .invalid_universal_character,
.loc = loc,
.extra = .{ .offset = offset },
}, &.{});
},
}
}

fn unescapeUcn(pp: *Preprocessor, tok: TokenWithExpansionLocs) !TokenWithExpansionLocs {
if (tok.id == .extended_identifier) {
@branchHint(.cold);
const identifier = pp.expandedSlice(tok);
if (mem.indexOfScalar(u8, identifier, '\\') != null) {
@branchHint(.cold);
const start = pp.comp.generated_buf.items.len;
try pp.comp.generated_buf.ensureUnusedCapacity(pp.gpa, identifier.len + 1);
var it = ucn.CharIterator.init(pp.expandedSlice(tok)); // re-expand since previous line may have re-allocated and invalidated `identifier`
var offset: u32 = 0;
while (it.next()) |decoded| {
try pp.writeUnescapedChar(decoded, tok.loc, offset);
std.debug.assert(decoded.consumed <= 10);
offset += @truncate(decoded.consumed);
}
pp.comp.generated_buf.appendAssumeCapacity('\n');
return pp.makeGeneratedToken(start, .extended_identifier, tok);
}
}
return tok;
}

/// Try to expand a macro after a possible candidate has been read from the `tokenizer`
/// into the `raw` token passed as argument
fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void {
@@ -2427,7 +2508,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void {
continue;
}
tok.id.simplifyMacroKeywordExtra(true);
pp.addTokenAssumeCapacity(tok.*);
pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok.*));
}
if (pp.preserve_whitespace) {
try pp.ensureUnusedTokenCapacity(pp.add_expansion_nl);
@@ -3100,7 +3181,8 @@ fn makePragmaToken(pp: *Preprocessor, raw: RawToken, operator_loc: ?Source.Locat
return tok;
}

pub fn addToken(pp: *Preprocessor, tok: TokenWithExpansionLocs) !void {
pub fn addToken(pp: *Preprocessor, tok_arg: TokenWithExpansionLocs) !void {
const tok = try pp.unescapeUcn(tok_arg);
if (tok.expansion_locs) |expansion_locs| {
try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs });
}
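The preprocessor side is a thin wrapper: unescapeUcn only does extra work for an extended_identifier token whose spelling contains a backslash, decoding each \uXXXX or \UXXXXXXXX escape into UTF-8 in the generated buffer and emitting a generated token in its place. The core transform is simply hex-decode followed by UTF-8 encode; the standalone sketch below (illustrative only, not aro code) performs it for a single escape using the Zig standard library:

const std = @import("std");

// Illustrative sketch: decode one \uXXXX escape the same way the commit does
// via ucn.CharIterator, i.e. parse the four hex digits and re-encode the
// resulting codepoint as UTF-8.
pub fn main() !void {
    const escape = "\\u00E9"; // the character é spelled as a UCN
    const codepoint = try std.fmt.parseInt(u21, escape[2..], 16);
    var utf8: [4]u8 = undefined;
    const len = try std.unicode.utf8Encode(codepoint, &utf8);
    std.debug.print("{s} -> \"{s}\" ({d} UTF-8 bytes)\n", .{ escape, utf8[0..len], len });
}

The real code additionally classifies the result: control characters, members of the basic character set, and incomplete or out-of-range escapes are reported via writeUnescapedChar rather than silently accepted.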
106 changes: 105 additions & 1 deletion src/aro/Tokenizer.zig
@@ -19,7 +19,7 @@ pub const Token = struct {
eof,
/// identifier containing solely basic character set characters
identifier,
/// identifier with at least one extended character
/// identifier with at least one extended character or UCN escape sequence
extended_identifier,

// string literals with prefixes
@@ -1074,14 +1074,55 @@ pub fn next(self: *Tokenizer) Token {
pp_num,
pp_num_exponent,
pp_num_digit_separator,
ucn_backslash,
ucn_start,
} = .start;

var start = self.index;
var id: Token.Id = .eof;
var ucn_from: enum {
start,
within,
} = undefined;

while (self.index < self.buf.len) : (self.index += 1) {
const c = self.buf[self.index];
switch (state) {
.ucn_backslash => switch (c) {
'u', 'U' => {
state = .ucn_start;
},
else => switch (ucn_from) {
.start => {
id = .invalid;
break;
},
.within => {
id = .extended_identifier;
self.index -= 1;
break;
},
},
},
.ucn_start => switch (c) {
'a'...'f', 'A'...'F', '0'...'9' => {
state = .extended_identifier;
},
else => {
switch (ucn_from) {
.start => {
id = .invalid;
self.index -= 1;
break;
},
.within => {
id = .extended_identifier;
self.index -= 2;
break;
},
}
},
},
.start => switch (c) {
'\n' => {
id = .nl;
@@ -1100,6 +1141,10 @@
'u' => state = .u,
'U' => state = .U,
'L' => state = .L,
'\\' => {
ucn_from = .start;
state = .ucn_backslash;
},
'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
'=' => state = .equal,
'!' => state = .bang,
@@ -1325,6 +1370,10 @@
break;
},
0x80...0xFF => state = .extended_identifier,
'\\' => {
ucn_from = .within;
state = .ucn_backslash;
},
else => {
id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
break;
@@ -1732,6 +1781,27 @@
}
} else if (self.index == self.buf.len) {
switch (state) {
.ucn_backslash => {
switch (ucn_from) {
.start => id = .invalid,
.within => {
id = .extended_identifier;
self.index -= 1;
},
}
},
.ucn_start => {
switch (ucn_from) {
.start => {
id = .invalid;
self.index -= 1;
},
.within => {
id = .extended_identifier;
self.index -= 2;
},
}
},
.start, .line_comment => {},
.u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
.extended_identifier => id = .extended_identifier,
@@ -2151,6 +2221,40 @@ test "C23 keywords" {
}, .c23);
}

test "Universal character names" {
try expectTokens("\\", &.{.invalid});
try expectTokens("\\g", &.{ .invalid, .identifier });
try expectTokens("\\u", &.{ .invalid, .identifier });
try expectTokens("\\ua", &.{.extended_identifier});
try expectTokens("\\U9", &.{.extended_identifier});
try expectTokens("\\ug", &.{ .invalid, .identifier });
try expectTokens("\\uag", &.{.extended_identifier});

try expectTokens("\\ ", &.{ .invalid, .eof });
try expectTokens("\\g ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\u ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\ua ", &.{ .extended_identifier, .eof });
try expectTokens("\\U9 ", &.{ .extended_identifier, .eof });
try expectTokens("\\ug ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\uag ", &.{ .extended_identifier, .eof });

try expectTokens("a\\", &.{ .extended_identifier, .invalid });
try expectTokens("a\\g", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\u", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\ua", &.{.extended_identifier});
try expectTokens("a\\U9", &.{.extended_identifier});
try expectTokens("a\\ug", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\uag", &.{.extended_identifier});

try expectTokens("a\\ ", &.{ .extended_identifier, .invalid, .eof });
try expectTokens("a\\g ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\u ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\ua ", &.{ .extended_identifier, .eof });
try expectTokens("a\\U9 ", &.{ .extended_identifier, .eof });
try expectTokens("a\\ug ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\uag ", &.{ .extended_identifier, .eof });
}

test "Tokenizer fuzz test" {
return std.testing.fuzz(testTokenizerFuzz, .{});
}
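The new tokenizer states implement a simple acceptance rule: a backslash begins (or continues) an extended identifier only if it is followed by u or U and at least one hex digit; otherwise the tokenizer rewinds (ucn_from records whether the backslash started the token or appeared inside an identifier, which determines how far to back up) and the stray backslash comes out as an .invalid token, which is exactly what the tests above check. Only one hex digit is required at this stage, so a short escape such as \U9 still tokenizes as an extended identifier and is diagnosed later by the preprocessor as an incomplete UCN. A standalone sketch of the acceptance rule, independent of the Tokenizer API (the function name is made up for illustration):

const std = @import("std");

/// Illustrative only: does `s` start with the prefix the new ucn_backslash /
/// ucn_start states accept, i.e. `\u` or `\U` followed by at least one hex digit?
fn startsUcn(s: []const u8) bool {
    if (s.len < 3 or s[0] != '\\') return false;
    if (s[1] != 'u' and s[1] != 'U') return false;
    return switch (s[2]) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}

test "UCN acceptance rule" {
    try std.testing.expect(startsUcn("\\u00E9"));
    try std.testing.expect(startsUcn("\\U9")); // short, but accepted by the tokenizer
    try std.testing.expect(!startsUcn("\\ug")); // no hex digit after \u
    try std.testing.expect(!startsUcn("\\x41")); // not a UCN escape
}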
97 changes: 97 additions & 0 deletions src/aro/ucn.zig
@@ -0,0 +1,97 @@
//! Universal Character Name support

const std = @import("std");

const Kind = enum {
    /// Valid not escaped char
    normal,
    /// Not escaped, but not valid UTF-8
    invalid_utf_8,
    /// Valid UCN char
    ucn,
    /// Incomplete UCN escape sequence
    incomplete_ucn,
    /// UCN escape sequence does not specify a unicode code point
    invalid_codepoint,
    /// UCN names a control character
    control,
    /// UCN names a basic character set character
    not_allowed,
};

pub const DecodedUniversalChar = struct {
    codepoint: u32,
    consumed: usize,
    buf: []const u8,
    kind: Kind,
};

/// Decodes a C99-style universal character name (e.g., \uXXXX or \UXXXXXXXX)
/// into a unicode codepoint. Returns the decoded character and the number of
/// bytes consumed from the input string.
fn decodeUniversalChar(input: []const u8, output: []u8) DecodedUniversalChar {
    std.debug.assert(input.len >= 2 and input[0] == '\\' and (input[1] == 'u' or input[1] == 'U'));
    const is_long = input[1] == 'U';
    const required: usize = if (is_long) 10 else 6;

    if (input.len < required) {
        return .{
            .codepoint = 0,
            .consumed = input.len,
            .buf = input,
            .kind = .incomplete_ucn,
        };
    }

    const hex_part = input[2..required];
    var codepoint: u32 = 0;
    for (hex_part, 0..) |c, i| {
        codepoint *= 16;
        const value = switch (c) {
            '0'...'9' => c - '0',
            'a'...'f' => 10 + (c - 'a'),
            'A'...'F' => 10 + (c - 'A'),
            else => return .{ .codepoint = 0, .consumed = i, .buf = input[0..i], .kind = .incomplete_ucn },
        };
        codepoint += value;
    }
    if (codepoint > std.math.maxInt(u21)) {
        return .{ .codepoint = 0, .consumed = required, .buf = input, .kind = .invalid_codepoint };
    }

    const len = std.unicode.utf8Encode(@as(u21, @intCast(codepoint)), output) catch {
        return .{ .codepoint = codepoint, .consumed = required, .buf = input, .kind = .invalid_codepoint };
    };
    const kind: Kind = switch (codepoint) {
        0...0x1F, 0x7F...0x9F => .control,
        0x20...0x7E => .not_allowed,
        else => .ucn,
    };
    return .{ .codepoint = codepoint, .consumed = required, .buf = output[0..len], .kind = kind };
}

pub const CharIterator = struct {
    str: []const u8,
    i: usize,
    buf: [10]u8,

    pub fn init(str: []const u8) CharIterator {
        return .{ .str = str, .i = 0, .buf = undefined };
    }

    pub fn next(self: *@This()) ?DecodedUniversalChar {
        if (self.i >= self.str.len) return null;
        if (self.str[self.i] == '\\' and self.i + 1 < self.str.len and (self.str[self.i + 1] == 'u' or self.str[self.i + 1] == 'U')) {
            const decoded = decodeUniversalChar(self.str[self.i..], self.buf[0..]);
            self.i += decoded.consumed;
            return decoded;
        } else {
            const len = std.unicode.utf8ByteSequenceLength(self.str[self.i]) catch {
                defer self.i += 1;
                return .{ .codepoint = self.str[self.i], .consumed = 1, .buf = self.str[self.i..][0..1], .kind = .invalid_utf_8 };
            };
            defer self.i += len;
            return .{ .codepoint = 0, .consumed = len, .buf = self.str[self.i..][0..len], .kind = .normal };
        }
    }
};
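
A minimal usage sketch of the new module (a hypothetical test, assuming ucn.zig is imported from a sibling file under src/aro the same way Preprocessor.zig imports it):

const std = @import("std");
const ucn = @import("ucn.zig");

test "decode UCN escapes in an identifier" {
    var it = ucn.CharIterator.init("caf\\u00E9");
    var out: [32]u8 = undefined;
    var len: usize = 0;
    while (it.next()) |decoded| {
        // For .normal characters, buf aliases the input; for .ucn escapes it
        // holds the UTF-8 encoding of the decoded codepoint.
        @memcpy(out[len..][0..decoded.buf.len], decoded.buf);
        len += decoded.buf.len;
    }
    // \u00E9 names U+00E9, which re-encodes as the two UTF-8 bytes 0xC3 0xA9.
    try std.testing.expectEqualStrings("café", out[0..len]);
}

decoded.kind distinguishes well-formed escapes (.ucn) from the error cases that the preprocessor's writeUnescapedChar turns into diagnostics.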