StringInterner: add support for UCN identifiers #838

Open · wants to merge 1 commit into base: master
89 changes: 86 additions & 3 deletions src/aro/Preprocessor.zig
@@ -16,6 +16,7 @@ const RawToken = Tokenizer.Token;
const Tree = @import("Tree.zig");
const Token = Tree.Token;
const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs;
const ucn = @import("ucn.zig");

const DefineMap = std.StringHashMapUnmanaged(Macro);
const RawTokenList = std.ArrayList(RawToken);
@@ -991,7 +992,7 @@ fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool {
}
},
}
pp.addTokenAssumeCapacity(tok);
pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok));
}
try pp.addToken(.{
.id = .eof,
@@ -2398,6 +2399,87 @@ fn expandMacroExhaustive(
buf.items.len = moving_end_idx;
}

fn writeUnescapedChar(pp: *Preprocessor, decoded: ucn.DecodedUniversalChar, loc: Source.Location, offset: u32) !void {
pp.comp.generated_buf.appendSliceAssumeCapacity(decoded.buf);
switch (decoded.kind) {
.normal => {},
.ucn => {
// TODO: UCN not allowed before C99
},
.control => {
try pp.comp.addDiagnostic(.{
.tag = .ucn_control_char_error,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.not_allowed => {
try pp.comp.addDiagnostic(.{
.tag = .ucn_basic_char_error,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
.extra = .{ .ascii = @intCast(decoded.buf[0]) },
}, &.{});
},
.incomplete_ucn => {
try pp.comp.addDiagnostic(.{
.tag = .incomplete_universal_character,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.invalid_utf_8 => {
try pp.comp.addDiagnostic(.{
.tag = .invalid_utf8,
.loc = .{
.id = loc.id,
.byte_offset = loc.byte_offset + offset,
.line = loc.line,
},
}, &.{});
},
.invalid_codepoint => {
try pp.comp.addDiagnostic(.{
.tag = .invalid_universal_character,
.loc = loc,
.extra = .{ .offset = offset },
}, &.{});
},
}
}

fn unescapeUcn(pp: *Preprocessor, tok: TokenWithExpansionLocs) !TokenWithExpansionLocs {
if (tok.id == .extended_identifier) {
@branchHint(.cold);
const identifier = pp.expandedSlice(tok);
if (mem.indexOfScalar(u8, identifier, '\\') != null) {
@branchHint(.cold);
const start = pp.comp.generated_buf.items.len;
try pp.comp.generated_buf.ensureUnusedCapacity(pp.gpa, identifier.len + 1);
var it = ucn.CharIterator.init(pp.expandedSlice(tok)); // re-expand since previous line may have re-allocated and invalidated `identifier`
var offset: u32 = 0;
while (it.next()) |decoded| {
try pp.writeUnescapedChar(decoded, tok.loc, offset);
std.debug.assert(decoded.consumed <= 10);
offset += @truncate(decoded.consumed);
}
pp.comp.generated_buf.appendAssumeCapacity('\n');
defer TokenWithExpansionLocs.free(tok.expansion_locs, pp.gpa);
return pp.makeGeneratedToken(start, .extended_identifier, tok);
}
}
return tok;
}

/// Try to expand a macro after a possible candidate has been read from the `tokenizer`
/// into the `raw` token passed as argument
fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void {
@@ -2427,7 +2509,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroErr
continue;
}
tok.id.simplifyMacroKeywordExtra(true);
pp.addTokenAssumeCapacity(tok.*);
pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok.*));
}
if (pp.preserve_whitespace) {
try pp.ensureUnusedTokenCapacity(pp.add_expansion_nl);
@@ -3100,7 +3182,8 @@ fn makePragmaToken(pp: *Preprocessor, raw: RawToken, operator_loc: ?Source.Locat
return tok;
}

pub fn addToken(pp: *Preprocessor, tok: TokenWithExpansionLocs) !void {
pub fn addToken(pp: *Preprocessor, tok_arg: TokenWithExpansionLocs) !void {
const tok = try pp.unescapeUcn(tok_arg);
if (tok.expansion_locs) |expansion_locs| {
try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs });
}
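Not part of the patch, but for illustration: the core of `unescapeUcn` above is a loop over `ucn.CharIterator` that copies each decoded chunk into a freshly reserved region of the generated buffer. A minimal sketch under assumptions (the standalone `unescapeIdentifier` helper and its caller-provided buffer are made up for the example; the diagnostics emitted by `writeUnescapedChar` are omitted):

    const ucn = @import("ucn.zig");

    /// Hypothetical helper mirroring the copy loop in `unescapeUcn`.
    /// `out` must be at least `identifier.len` bytes long.
    fn unescapeIdentifier(out: []u8, identifier: []const u8) []const u8 {
        var len: usize = 0;
        var it = ucn.CharIterator.init(identifier);
        while (it.next()) |decoded| {
            // `decoded.buf` holds either the original bytes (.normal) or the
            // UTF-8 encoding of the escaped codepoint (.ucn); the error kinds
            // are silently passed through in this sketch.
            @memcpy(out[len..][0..decoded.buf.len], decoded.buf);
            len += decoded.buf.len;
        }
        return out[0..len];
    }

    // e.g. unescapeIdentifier(&buf, "caf\\u00E9") yields the bytes of "café".

A 6-byte `\uXXXX` escape encodes to at most 3 bytes of UTF-8 and a 10-byte `\UXXXXXXXX` escape to at most 4, so the output never exceeds the input length; that is why `unescapeUcn` only reserves `identifier.len + 1` bytes in the generated buffer.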
106 changes: 105 additions & 1 deletion src/aro/Tokenizer.zig
@@ -19,7 +19,7 @@ pub const Token = struct {
eof,
/// identifier containing solely basic character set characters
identifier,
/// identifier with at least one extended character
/// identifier with at least one extended character or UCN escape sequence
extended_identifier,

// string literals with prefixes
@@ -1074,14 +1074,55 @@ pub fn next(self: *Tokenizer) Token {
pp_num,
pp_num_exponent,
pp_num_digit_separator,
ucn_backslash,
ucn_start,
} = .start;

var start = self.index;
var id: Token.Id = .eof;
var ucn_from: enum {
start,
within,
} = undefined;

while (self.index < self.buf.len) : (self.index += 1) {
const c = self.buf[self.index];
switch (state) {
.ucn_backslash => switch (c) {
'u', 'U' => {
state = .ucn_start;
},
else => switch (ucn_from) {
.start => {
id = .invalid;
break;
},
.within => {
id = .extended_identifier;
self.index -= 1;
break;
},
},
},
.ucn_start => switch (c) {
'a'...'f', 'A'...'F', '0'...'9' => {
state = .extended_identifier;
},
else => {
switch (ucn_from) {
.start => {
id = .invalid;
self.index -= 1;
break;
},
.within => {
id = .extended_identifier;
self.index -= 2;
break;
},
}
},
},
.start => switch (c) {
'\n' => {
id = .nl;
@@ -1100,6 +1141,10 @@
'u' => state = .u,
'U' => state = .U,
'L' => state = .L,
'\\' => {
ucn_from = .start;
state = .ucn_backslash;
},
'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
'=' => state = .equal,
'!' => state = .bang,
@@ -1325,6 +1370,10 @@
break;
},
0x80...0xFF => state = .extended_identifier,
'\\' => {
ucn_from = .within;
state = .ucn_backslash;
},
else => {
id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
break;
@@ -1732,6 +1781,27 @@
}
} else if (self.index == self.buf.len) {
switch (state) {
.ucn_backslash => {
switch (ucn_from) {
.start => id = .invalid,
.within => {
id = .extended_identifier;
self.index -= 1;
},
}
},
.ucn_start => {
switch (ucn_from) {
.start => {
id = .invalid;
self.index -= 1;
},
.within => {
id = .extended_identifier;
self.index -= 2;
},
}
},
.start, .line_comment => {},
.u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
.extended_identifier => id = .extended_identifier,
@@ -2151,6 +2221,40 @@ test "C23 keywords" {
}, .c23);
}

test "Universal character names" {
try expectTokens("\\", &.{.invalid});
try expectTokens("\\g", &.{ .invalid, .identifier });
try expectTokens("\\u", &.{ .invalid, .identifier });
try expectTokens("\\ua", &.{.extended_identifier});
try expectTokens("\\U9", &.{.extended_identifier});
try expectTokens("\\ug", &.{ .invalid, .identifier });
try expectTokens("\\uag", &.{.extended_identifier});

try expectTokens("\\ ", &.{ .invalid, .eof });
try expectTokens("\\g ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\u ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\ua ", &.{ .extended_identifier, .eof });
try expectTokens("\\U9 ", &.{ .extended_identifier, .eof });
try expectTokens("\\ug ", &.{ .invalid, .identifier, .eof });
try expectTokens("\\uag ", &.{ .extended_identifier, .eof });

try expectTokens("a\\", &.{ .extended_identifier, .invalid });
try expectTokens("a\\g", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\u", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\ua", &.{.extended_identifier});
try expectTokens("a\\U9", &.{.extended_identifier});
try expectTokens("a\\ug", &.{ .extended_identifier, .invalid, .identifier });
try expectTokens("a\\uag", &.{.extended_identifier});

try expectTokens("a\\ ", &.{ .extended_identifier, .invalid, .eof });
try expectTokens("a\\g ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\u ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\ua ", &.{ .extended_identifier, .eof });
try expectTokens("a\\U9 ", &.{ .extended_identifier, .eof });
try expectTokens("a\\ug ", &.{ .extended_identifier, .invalid, .identifier, .eof });
try expectTokens("a\\uag ", &.{ .extended_identifier, .eof });
}

test "Tokenizer fuzz test" {
return std.testing.fuzz(testTokenizerFuzz, .{});
}
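The backtracking in the new `ucn_backslash`/`ucn_start` states boils down to a small lookahead rule: a backslash only begins a UCN prefix if it is followed by `u` or `U` and at least one hex digit; otherwise the tokenizer rewinds (one byte after `\`, two after `\u`/`\U`) and either emits `.invalid` or closes the preceding extended identifier, as the tests above show. An equivalent standalone predicate, purely for illustration and not part of the diff:

    /// Hypothetical predicate: does `text` start a universal character name?
    fn startsUcnPrefix(text: []const u8) bool {
        if (text.len < 3 or text[0] != '\\') return false;
        if (text[1] != 'u' and text[1] != 'U') return false;
        return switch (text[2]) {
            '0'...'9', 'a'...'f', 'A'...'F' => true,
            else => false,
        };
    }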
97 changes: 97 additions & 0 deletions src/aro/ucn.zig
@@ -0,0 +1,97 @@
//! Universal Character Name support

const std = @import("std");

const Kind = enum {
/// Valid unescaped character
normal,
/// Not escaped, but not valid UTF-8
invalid_utf_8,
/// Valid UCN char
ucn,
/// Incomplete UCN escape sequence
incomplete_ucn,
/// UCN escape sequence does not specify a unicode code point
invalid_codepoint,
/// UCN names a control character
control,
/// UCN names a basic character set character
not_allowed,
};

pub const DecodedUniversalChar = struct {
codepoint: u32,
consumed: usize,
buf: []const u8,
kind: Kind,
};

/// Decodes a C99-style universal character name (e.g., \uXXXX or \UXXXXXXXX)
/// into a unicode codepoint. Returns the decoded character and the number of
/// bytes consumed from the input string.
fn decodeUniversalChar(input: []const u8, output: []u8) DecodedUniversalChar {
std.debug.assert(input.len >= 2 and input[0] == '\\' and (input[1] == 'u' or input[1] == 'U'));
const is_long = input[1] == 'U';
const required: usize = if (is_long) 10 else 6;

if (input.len < required) {
return .{
.codepoint = 0,
.consumed = input.len,
.buf = input,
.kind = .incomplete_ucn,
};
}

const hex_part = input[2..required];
var codepoint: u32 = 0;
for (hex_part, 0..) |c, i| {
codepoint *= 16;
const value = switch (c) {
'0'...'9' => c - '0',
'a'...'f' => 10 + (c - 'a'),
'A'...'F' => 10 + (c - 'A'),
else => return .{ .codepoint = 0, .consumed = i, .buf = input[0..i], .kind = .incomplete_ucn },
};
codepoint += value;
}
if (codepoint > std.math.maxInt(u21)) {
return .{ .codepoint = 0, .consumed = required, .buf = input, .kind = .invalid_codepoint };
}

const len = std.unicode.utf8Encode(@as(u21, @intCast(codepoint)), output) catch {
return .{ .codepoint = codepoint, .consumed = required, .buf = input, .kind = .invalid_codepoint };
};
const kind: Kind = switch (codepoint) {
0...0x1F, 0x7F...0x9F => .control,
0x20...0x7E => .not_allowed,
else => .ucn,
};
return .{ .codepoint = codepoint, .consumed = required, .buf = output[0..len], .kind = kind };
}

pub const CharIterator = struct {
str: []const u8,
i: usize,
buf: [10]u8,

pub fn init(str: []const u8) CharIterator {
return .{ .str = str, .i = 0, .buf = undefined };
}

pub fn next(self: *@This()) ?DecodedUniversalChar {
if (self.i >= self.str.len) return null;
if (self.str[self.i] == '\\' and self.i + 1 < self.str.len and (self.str[self.i + 1] == 'u' or self.str[self.i + 1] == 'U')) {
const decoded = decodeUniversalChar(self.str[self.i..], self.buf[0..]);
self.i += decoded.consumed;
return decoded;
} else {
const len = std.unicode.utf8ByteSequenceLength(self.str[self.i]) catch {
defer self.i += 1;
return .{ .codepoint = self.str[self.i], .consumed = 1, .buf = self.str[self.i..][0..1], .kind = .invalid_utf_8 };
};
defer self.i += len;
return .{ .codepoint = 0, .consumed = len, .buf = self.str[self.i..][0..len], .kind = .normal };
}
}
};
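For reference (not part of the diff), this is what `CharIterator` yields for a short input, written as if the test were appended to src/aro/ucn.zig since `Kind` and `decodeUniversalChar` are file-private; the test name is made up:

    test "CharIterator example (illustrative)" {
        var it = CharIterator.init("a\\u00E9");

        const first = it.next().?; // plain byte, passed through untouched
        try std.testing.expectEqual(Kind.normal, first.kind);
        try std.testing.expectEqualStrings("a", first.buf);

        const second = it.next().?; // 6-byte \uXXXX escape -> U+00E9
        try std.testing.expectEqual(Kind.ucn, second.kind);
        try std.testing.expectEqual(@as(u32, 0x00E9), second.codepoint);
        try std.testing.expectEqual(@as(usize, 6), second.consumed);
        try std.testing.expectEqualStrings("é", second.buf);

        try std.testing.expect(it.next() == null);
    }

A `\UXXXXXXXX` escape consumes 10 bytes instead of 6, and codepoints above `maxInt(u21)`, in 0x00–0x1F/0x7F–0x9F, or in 0x20–0x7E come back as `.invalid_codepoint`, `.control`, and `.not_allowed` respectively, which `writeUnescapedChar` in Preprocessor.zig turns into diagnostics.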