Preprocessor: add support for UCN identifiers
Closes #823
ehaas committed Feb 9, 2025
1 parent 8017436 commit 285b80d
Showing 4 changed files with 313 additions and 4 deletions.
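Context for the change: in C99 and later, a universal character name such as \u00E9 may appear in an identifier and names the same character as writing é directly, so caf\u00E9 and café denote the same identifier. This commit teaches the tokenizer to classify such escaped spellings as extended identifiers and the preprocessor to decode the escapes to UTF-8, with diagnostics for incomplete, out-of-range, control-character, or basic-character UCNs, before the tokens are handed on.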
88 changes: 85 additions & 3 deletions src/aro/Preprocessor.zig
@@ -16,6 +16,7 @@ const RawToken = Tokenizer.Token;
const Tree = @import("Tree.zig");
const Token = Tree.Token;
const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs;
const ucn = @import("ucn.zig");

const DefineMap = std.StringHashMapUnmanaged(Macro);
const RawTokenList = std.ArrayList(RawToken);
@@ -991,7 +992,7 @@ fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool {
}
},
}
pp.addTokenAssumeCapacity(tok);
pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok));
}
try pp.addToken(.{
.id = .eof,
@@ -2398,6 +2399,86 @@ fn expandMacroExhaustive(
buf.items.len = moving_end_idx;
}

fn writeUnescapedChar(pp: *Preprocessor, decoded: ucn.DecodedUniversalChar, loc: Source.Location, offset: u32) !void {
pp.comp.generated_buf.appendSliceAssumeCapacity(decoded.buf);
switch (decoded.kind) {
.normal => {},
.ucn => {
// TODO: UCN not allowed before C99
},
.control => {
try pp.comp.addDiagnostic(.{
.tag = .ucn_control_char_error,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.not_allowed => {
try pp.comp.addDiagnostic(.{
.tag = .ucn_basic_char_error,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
.extra = .{ .ascii = @intCast(decoded.buf[0]) },
}, &.{});
},
.incomplete_ucn => {
try pp.comp.addDiagnostic(.{
.tag = .incomplete_universal_character,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.invalid_utf_8 => {
try pp.comp.addDiagnostic(.{
.tag = .invalid_utf8,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.invalid_codepoint => {
try pp.comp.addDiagnostic(.{
.tag = .invalid_universal_character,
.loc = loc,
.extra = .{ .offset = offset },
}, &.{});
},
}
}

fn unescapeUcn(pp: *Preprocessor, tok: TokenWithExpansionLocs) !TokenWithExpansionLocs {
if (tok.id == .extended_identifier) {
@branchHint(.cold);
const identifier = pp.expandedSlice(tok);
if (mem.indexOfScalar(u8, identifier, '\\') != null) {
@branchHint(.cold);
const start = pp.comp.generated_buf.items.len;
try pp.comp.generated_buf.ensureUnusedCapacity(pp.gpa, identifier.len + 1);
var it = ucn.CharIterator.init(pp.expandedSlice(tok)); // re-expand since previous line may have re-allocated and invalidated `identifier`
var offset: u32 = 0;
while (it.next()) |decoded| {
try pp.writeUnescapedChar(decoded, tok.loc, offset);
std.debug.assert(decoded.consumed <= 10);
offset += @truncate(decoded.consumed);
}
pp.comp.generated_buf.appendAssumeCapacity('\n');
return pp.makeGeneratedToken(start, .extended_identifier, tok);
}
}
return tok;
}

/// Try to expand a macro after a possible candidate has been read from the `tokenizer`
/// into the `raw` token passed as argument
fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void {
@@ -2427,7 +2508,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void {
continue;
}
tok.id.simplifyMacroKeywordExtra(true);
pp.addTokenAssumeCapacity(tok.*);
pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok.*));
}
if (pp.preserve_whitespace) {
try pp.ensureUnusedTokenCapacity(pp.add_expansion_nl);
@@ -3100,7 +3181,8 @@ fn makePragmaToken(pp: *Preprocessor, raw: RawToken, operator_loc: ?Source.Locat
return tok;
}

pub fn addToken(pp: *Preprocessor, tok: TokenWithExpansionLocs) !void {
pub fn addToken(pp: *Preprocessor, tok_arg: TokenWithExpansionLocs) !void {
const tok = try pp.unescapeUcn(tok_arg);
if (tok.expansion_locs) |expansion_locs| {
try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs });
}
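The preprocessor side is a thin wrapper: unescapeUcn only does extra work for an extended_identifier token whose spelling contains a backslash, decoding each \uXXXX or \UXXXXXXXX escape into UTF-8 in the generated buffer and emitting a generated token in its place. The core transform is simply hex-decode followed by UTF-8 encode; the standalone sketch below (illustrative only, not aro code) performs it for a single escape using the Zig standard library:

const std = @import("std");

// Illustrative sketch: decode one \uXXXX escape the same way the commit does
// via ucn.CharIterator, i.e. parse the four hex digits and re-encode the
// resulting codepoint as UTF-8.
pub fn main() !void {
    const escape = "\\u00E9"; // the character é spelled as a UCN
    const codepoint = try std.fmt.parseInt(u21, escape[2..], 16);
    var utf8: [4]u8 = undefined;
    const len = try std.unicode.utf8Encode(codepoint, &utf8);
    std.debug.print("{s} -> \"{s}\" ({d} UTF-8 bytes)\n", .{ escape, utf8[0..len], len });
}

The real code additionally classifies the result: control characters, members of the basic character set, and incomplete or out-of-range escapes are reported via writeUnescapedChar rather than silently accepted.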
106 changes: 105 additions & 1 deletion src/aro/Tokenizer.zig
@@ -19,7 +19,7 @@ pub const Token = struct {
eof,
/// identifier containing solely basic character set characters
identifier,
/// identifier with at least one extended character
/// identifier with at least one extended character or UCN escape sequence
extended_identifier,

// string literals with prefixes
@@ -1074,14 +1074,55 @@ pub fn next(self: *Tokenizer) Token {
pp_num,
pp_num_exponent,
pp_num_digit_separator,
ucn_backslash,
ucn_start,
} = .start;

var start = self.index;
var id: Token.Id = .eof;
var ucn_from: enum {
start,
within,
} = undefined;

while (self.index < self.buf.len) : (self.index += 1) {
const c = self.buf[self.index];
switch (state) {
.ucn_backslash => switch (c) {
'u', 'U' => {
state = .ucn_start;
},
else => switch (ucn_from) {
.start => {
id = .invalid;
break;
},
.within => {
id = .extended_identifier;
self.index -= 1;
break;
},
},
},
.ucn_start => switch (c) {
'a'...'f', 'A'...'F', '0'...'9' => {
state = .extended_identifier;
},
else => {
switch (ucn_from) {
.start => {
id = .invalid;
self.index -= 1;
break;
},
.within => {
id = .extended_identifier;
self.index -= 2;
break;
},
}
},
},
.start => switch (c) {
'\n' => {
id = .nl;
@@ -1100,6 +1141,10 @@
'u' => state = .u,
'U' => state = .U,
'L' => state = .L,
'\\' => {
ucn_from = .start;
state = .ucn_backslash;
},
'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
'=' => state = .equal,
'!' => state = .bang,
@@ -1325,6 +1370,10 @@
break;
},
0x80...0xFF => state = .extended_identifier,
'\\' => {
ucn_from = .within;
state = .ucn_backslash;
},
else => {
id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
break;
@@ -1732,6 +1781,27 @@
}
} else if (self.index == self.buf.len) {
switch (state) {
.ucn_backslash => {
switch (ucn_from) {
.start => id = .invalid,
.within => {
id = .extended_identifier;
self.index -= 1;
},
}
},
.ucn_start => {
switch (ucn_from) {
.start => {
id = .invalid;
self.index -= 1;
},
.within => {
id = .extended_identifier;
self.index -= 2;
},
}
},
.start, .line_comment => {},
.u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
.extended_identifier => id = .extended_identifier,
@@ -2151,6 +2221,40 @@ test "C23 keywords" {
}, .c23);
}

test "Universal character names" {
try expectTokens("\\", &.{.invalid});
try expectTokens("\\g", &.{ .invalid, .identifier });
try expectTokens("\\u", &.{ .invalid, .identifier });
try expectTokens("\\ua", &.{.extended_identifier});
try expectTokens("\\U9", &.{.extended_identifier});
try expectTokens("\\ug", &.{ .invalid, .identifier });
try expectTokens("\\uag", &.{.extended_identifier});

try expectTokens("\\ ", &.{ .invalid, .eof });
try expectTokens("\\g ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\u ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\ua ", &.{ .extended_identifier, .eof });
try expectTokens("\\U9 ", &.{ .extended_identifier, .eof });
try expectTokens("\\ug ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\uag ", &.{ .extended_identifier, .eof });

try expectTokens("a\\", &.{ .extended_identifier, .invalid });
try expectTokens("a\\g", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\u", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\ua", &.{.extended_identifier});
try expectTokens("a\\U9", &.{.extended_identifier});
try expectTokens("a\\ug", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\uag", &.{.extended_identifier});

try expectTokens("a\\ ", &.{ .extended_identifier, .invalid, .eof });
try expectTokens("a\\g ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\u ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\ua ", &.{ .extended_identifier, .eof });
try expectTokens("a\\U9 ", &.{ .extended_identifier, .eof });
try expectTokens("a\\ug ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\uag ", &.{ .extended_identifier, .eof });
}

test "Tokenizer fuzz test" {
return std.testing.fuzz(testTokenizerFuzz, .{});
}
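The new tokenizer states implement a simple acceptance rule: a backslash begins (or continues) an extended identifier only if it is followed by u or U and at least one hex digit; otherwise the tokenizer rewinds (ucn_from records whether the backslash started the token or appeared inside an identifier, which determines how far to back up) and the stray backslash comes out as an .invalid token, which is exactly what the tests above check. Only one hex digit is required at this stage, so a short escape such as \U9 still tokenizes as an extended identifier and is diagnosed later by the preprocessor as an incomplete UCN. A standalone sketch of the acceptance rule, independent of the Tokenizer API (the function name is made up for illustration):

const std = @import("std");

/// Illustrative only: does `s` start with the prefix the new ucn_backslash /
/// ucn_start states accept, i.e. `\u` or `\U` followed by at least one hex digit?
fn startsUcn(s: []const u8) bool {
    if (s.len < 3 or s[0] != '\\') return false;
    if (s[1] != 'u' and s[1] != 'U') return false;
    return switch (s[2]) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}

test "UCN acceptance rule" {
    try std.testing.expect(startsUcn("\\u00E9"));
    try std.testing.expect(startsUcn("\\U9")); // short, but accepted by the tokenizer
    try std.testing.expect(!startsUcn("\\ug")); // no hex digit after \u
    try std.testing.expect(!startsUcn("\\x41")); // not a UCN escape
}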
97 changes: 97 additions & 0 deletions src/aro/ucn.zig
@@ -0,0 +1,97 @@
//! Universal Character Name support

const std = @import("std");

const Kind = enum {
    /// Valid not escaped char
    normal,
    /// Not escaped, but not valid UTF-8
    invalid_utf_8,
    /// Valid UCN char
    ucn,
    /// Incomplete UCN escape sequence
    incomplete_ucn,
    /// UCN escape sequence does not specify a unicode code point
    invalid_codepoint,
    /// UCN names a control character
    control,
    /// UCN names a basic character set character
    not_allowed,
};

pub const DecodedUniversalChar = struct {
    codepoint: u32,
    consumed: usize,
    buf: []const u8,
    kind: Kind,
};

/// Decodes a C99-style universal character name (e.g., \uXXXX or \UXXXXXXXX)
/// into a unicode codepoint. Returns the decoded character and the number of
/// bytes consumed from the input string.
fn decodeUniversalChar(input: []const u8, output: []u8) DecodedUniversalChar {
    std.debug.assert(input.len >= 2 and input[0] == '\\' and (input[1] == 'u' or input[1] == 'U'));
    const is_long = input[1] == 'U';
    const required: usize = if (is_long) 10 else 6;

    if (input.len < required) {
        return .{
            .codepoint = 0,
            .consumed = input.len,
            .buf = input,
            .kind = .incomplete_ucn,
        };
    }

    const hex_part = input[2..required];
    var codepoint: u32 = 0;
    for (hex_part, 0..) |c, i| {
        codepoint *= 16;
        const value = switch (c) {
            '0'...'9' => c - '0',
            'a'...'f' => 10 + (c - 'a'),
            'A'...'F' => 10 + (c - 'A'),
            else => return .{ .codepoint = 0, .consumed = i, .buf = input[0..i], .kind = .incomplete_ucn },
        };
        codepoint += value;
    }
    if (codepoint > std.math.maxInt(u21)) {
        return .{ .codepoint = 0, .consumed = required, .buf = input, .kind = .invalid_codepoint };
    }

    const len = std.unicode.utf8Encode(@as(u21, @intCast(codepoint)), output) catch {
        return .{ .codepoint = codepoint, .consumed = required, .buf = input, .kind = .invalid_codepoint };
    };
    const kind: Kind = switch (codepoint) {
        0...0x1F, 0x7F...0x9F => .control,
        0x20...0x7E => .not_allowed,
        else => .ucn,
    };
    return .{ .codepoint = codepoint, .consumed = required, .buf = output[0..len], .kind = kind };
}

pub const CharIterator = struct {
    str: []const u8,
    i: usize,
    buf: [10]u8,

    pub fn init(str: []const u8) CharIterator {
        return .{ .str = str, .i = 0, .buf = undefined };
    }

    pub fn next(self: *@This()) ?DecodedUniversalChar {
        if (self.i >= self.str.len) return null;
        if (self.str[self.i] == '\\' and self.i + 1 < self.str.len and (self.str[self.i + 1] == 'u' or self.str[self.i + 1] == 'U')) {
            const decoded = decodeUniversalChar(self.str[self.i..], self.buf[0..]);
            self.i += decoded.consumed;
            return decoded;
        } else {
            const len = std.unicode.utf8ByteSequenceLength(self.str[self.i]) catch {
                defer self.i += 1;
                return .{ .codepoint = self.str[self.i], .consumed = 1, .buf = self.str[self.i..][0..1], .kind = .invalid_utf_8 };
            };
            defer self.i += len;
            return .{ .codepoint = 0, .consumed = len, .buf = self.str[self.i..][0..len], .kind = .normal };
        }
    }
};
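
A minimal usage sketch of the new module (a hypothetical test, assuming ucn.zig is imported from a sibling file under src/aro the same way Preprocessor.zig imports it):

const std = @import("std");
const ucn = @import("ucn.zig");

test "decode UCN escapes in an identifier" {
    var it = ucn.CharIterator.init("caf\\u00E9");
    var out: [32]u8 = undefined;
    var len: usize = 0;
    while (it.next()) |decoded| {
        // For .normal characters, buf aliases the input; for .ucn escapes it
        // holds the UTF-8 encoding of the decoded codepoint.
        @memcpy(out[len..][0..decoded.buf.len], decoded.buf);
        len += decoded.buf.len;
    }
    // \u00E9 names U+00E9, which re-encodes as the two UTF-8 bytes 0xC3 0xA9.
    try std.testing.expectEqualStrings("café", out[0..len]);
}

decoded.kind distinguishes well-formed escapes (.ucn) from the error cases that the preprocessor's writeUnescapedChar turns into diagnostics.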