Skip to content

Commit

Permalink
Math and currency symbols are safe for display
Browse files Browse the repository at this point in the history
This also cleans up some technical debt: a Python script generated C
code, which was in turn used to generate more C code!  Instead, just use
ICU4C in the C code directly, deleting one of two code generators.
Finally, improve the tests to ensure that too-large Unicode codepoints
are rejected and ensure that ASCII characters that are neither DEL nor
controls are accepted.

(cherry picked from commit 4e9e357)
  • Loading branch information
DemiMarie authored and marmarek committed Jun 22, 2024
1 parent 654f4c2 commit 0221a90
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 92 deletions.
6 changes: 1 addition & 5 deletions qrexec-lib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,10 @@ endif
/bin/bash -euc 'set -o pipefail; ./unicode-generator | diff -u ./unicode-allowlist-table.c -'
.PHONY: check-table-up-to-date

unicode-generator.o: unicode-class-table.c
unicode-class-table.c: gentbl.py Makefile
python3 -- $< > $@

%.a: $(objs)
$(AR) rcs $@ $^
clean:
rm -f ./*.o ./*~ ./*.a ./*.so.* ./*.dep unicode-class-table.c unicode-allowlist-table.c.tmp
rm -f ./*.o ./*~ ./*.a ./*.so.* ./*.dep unicode-allowlist-table.c.tmp

install:
mkdir -p $(DESTDIR)$(LIBDIR)
Expand Down
29 changes: 0 additions & 29 deletions qrexec-lib/gentbl.py

This file was deleted.

95 changes: 50 additions & 45 deletions qrexec-lib/unicode-allowlist-table.c
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
case 0x000020 ... 0x00007e:
case 0x0000a1:
case 0x0000a1 ... 0x0000a5:
case 0x0000a7:
case 0x0000aa ... 0x0000ab:
case 0x0000b2 ... 0x0000b3:
case 0x0000aa ... 0x0000ac:
case 0x0000b1 ... 0x0000b3:
case 0x0000b5 ... 0x0000b7:
case 0x0000b9 ... 0x0000d6:
case 0x0000d8 ... 0x0000f6:
case 0x0000f8 ... 0x0002c1:
case 0x0000b9 ... 0x0002c1:
case 0x0002c6 ... 0x0002d1:
case 0x0002e0 ... 0x0002e4:
case 0x0002ec:
Expand All @@ -18,11 +16,11 @@
case 0x00038c:
case 0x00038e ... 0x0003a1:
case 0x0003a3 ... 0x0003e1:
case 0x0003f0 ... 0x0003f5:
case 0x0003f7 ... 0x000481:
case 0x0003f0 ... 0x000481:
case 0x00048a ... 0x00052f:
case 0x00060c:
case 0x000964 ... 0x000965:
case 0x000e3f:
case 0x0010fb:
case 0x001100 ... 0x0011ff:
case 0x0016eb ... 0x0016ed:
Expand Down Expand Up @@ -56,38 +54,52 @@
case 0x001ff2 ... 0x001ff4:
case 0x001ff6 ... 0x001ffc:
case 0x002010 ... 0x002027:
case 0x002030 ... 0x002043:
case 0x002045 ... 0x002051:
case 0x002053 ... 0x00205e:
case 0x002030 ... 0x00205e:
case 0x002070 ... 0x002071:
case 0x002074 ... 0x002079:
case 0x00207d ... 0x002089:
case 0x00208d ... 0x00208e:
case 0x002074 ... 0x00208e:
case 0x002090 ... 0x00209c:
case 0x0020a0 ... 0x0020c0:
case 0x002102:
case 0x002107:
case 0x00210a ... 0x002113:
case 0x002115:
case 0x002119 ... 0x00211d:
case 0x002118 ... 0x00211d:
case 0x002124:
case 0x002126:
case 0x002128:
case 0x00212a ... 0x00212d:
case 0x00212f ... 0x002139:
case 0x00213c ... 0x00213f:
case 0x002145 ... 0x002149:
case 0x00213c ... 0x002149:
case 0x00214b:
case 0x00214e:
case 0x002150 ... 0x002189:
case 0x002190 ... 0x002194:
case 0x00219a ... 0x00219b:
case 0x0021a0:
case 0x0021a3:
case 0x0021a6:
case 0x0021ae:
case 0x0021ce ... 0x0021cf:
case 0x0021d2:
case 0x0021d4:
case 0x0021f4 ... 0x0022ff:
case 0x002308 ... 0x00230b:
case 0x002320 ... 0x002321:
case 0x002329 ... 0x00232a:
case 0x00237c:
case 0x00239b ... 0x0023b3:
case 0x0023dc ... 0x0023e1:
case 0x002460 ... 0x00249b:
case 0x0024ea ... 0x0024ff:
case 0x0025b7:
case 0x0025c1:
case 0x0025f8 ... 0x0025ff:
case 0x00266f:
case 0x002768 ... 0x002793:
case 0x0027c5 ... 0x0027c6:
case 0x0027e6 ... 0x0027ef:
case 0x002983 ... 0x002998:
case 0x0029d8 ... 0x0029db:
case 0x0029fc ... 0x0029fd:
case 0x0027c0 ... 0x0027ff:
case 0x002900 ... 0x002aff:
case 0x002b30 ... 0x002b44:
case 0x002b47 ... 0x002b4c:
case 0x002c60 ... 0x002c7f:
case 0x002e00 ... 0x002e4f:
case 0x002e52 ... 0x002e5d:
Expand Down Expand Up @@ -122,6 +134,7 @@
case 0x00a7d5 ... 0x00a7d9:
case 0x00a7f2 ... 0x00a7ff:
case 0x00a830 ... 0x00a835:
case 0x00a838:
case 0x00a92e:
case 0x00a960 ... 0x00a97c:
case 0x00a9cf:
Expand All @@ -136,22 +149,18 @@
case 0x00fd3e ... 0x00fd3f:
case 0x00fe10 ... 0x00fe19:
case 0x00fe30 ... 0x00fe52:
case 0x00fe54 ... 0x00fe61:
case 0x00fe63:
case 0x00fe68:
case 0x00fe6a ... 0x00fe6b:
case 0x00ff01 ... 0x00ff03:
case 0x00ff05 ... 0x00ff0a:
case 0x00ff0c ... 0x00ff1b:
case 0x00ff1f ... 0x00ff3d:
case 0x00fe54 ... 0x00fe66:
case 0x00fe68 ... 0x00fe6b:
case 0x00ff01 ... 0x00ff3d:
case 0x00ff3f:
case 0x00ff41 ... 0x00ff5b:
case 0x00ff5d:
case 0x00ff5f ... 0x00ffbe:
case 0x00ff41 ... 0x00ffbe:
case 0x00ffc2 ... 0x00ffc7:
case 0x00ffca ... 0x00ffcf:
case 0x00ffd2 ... 0x00ffd7:
case 0x00ffda ... 0x00ffdc:
case 0x00ffe0 ... 0x00ffe2:
case 0x00ffe5 ... 0x00ffe6:
case 0x00ffe9 ... 0x00ffec:
case 0x010100 ... 0x010102:
case 0x010107 ... 0x010133:
case 0x010140 ... 0x010178:
Expand All @@ -165,8 +174,11 @@
case 0x01aff5 ... 0x01affb:
case 0x01affd ... 0x01affe:
case 0x01b000 ... 0x01b122:
case 0x01b132:
case 0x01b150 ... 0x01b152:
case 0x01b155:
case 0x01b164 ... 0x01b167:
case 0x01d2c0 ... 0x01d2d3:
case 0x01d2e0 ... 0x01d2f3:
case 0x01d360 ... 0x01d378:
case 0x01d400 ... 0x01d454:
Expand All @@ -188,25 +200,18 @@
case 0x01d546:
case 0x01d54a ... 0x01d550:
case 0x01d552 ... 0x01d6a5:
case 0x01d6a8 ... 0x01d6c0:
case 0x01d6c2 ... 0x01d6da:
case 0x01d6dc ... 0x01d6fa:
case 0x01d6fc ... 0x01d714:
case 0x01d716 ... 0x01d734:
case 0x01d736 ... 0x01d74e:
case 0x01d750 ... 0x01d76e:
case 0x01d770 ... 0x01d788:
case 0x01d78a ... 0x01d7a8:
case 0x01d7aa ... 0x01d7c2:
case 0x01d7c4 ... 0x01d7cb:
case 0x01d6a8 ... 0x01d7cb:
case 0x01d7ce ... 0x01d7ff:
case 0x01df00 ... 0x01df1e:
case 0x01df25 ... 0x01df2a:
case 0x01e030 ... 0x01e06d:
case 0x01f100 ... 0x01f10c:
case 0x01fbf0 ... 0x01fbf9:
case 0x020000 ... 0x02a6df:
case 0x02a700 ... 0x02b738:
case 0x02a700 ... 0x02b739:
case 0x02b740 ... 0x02b81d:
case 0x02b820 ... 0x02cea1:
case 0x02ceb0 ... 0x02ebe0:
case 0x02f800 ... 0x02fa1d:
case 0x030000 ... 0x03134a:
case 0x031350 ... 0x0323af:
57 changes: 53 additions & 4 deletions qrexec-lib/unicode-generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,62 @@ static bool is_permitted_code_point(uint32_t const code_point)
if (!(U_IS_UNICODE_CHAR(code_point)))
return false;

/* Reject all control characters */
if (code_point < 0x20)
return false;

/* Allow all other ASCII characters except DEL */
if (code_point < 0x7F)
return true;

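/*
* Validate that the codepoint is a valid scalar value and is not an
* unassigned, private-use, control, or format character, a separator
* (including spaces), a combining mark, or a modifier/other symbol.
* Math and currency symbols are permitted.
*/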
switch (code_point) {
#include "unicode-class-table.c"
return false; // Invalid UTF-8 or forbidden codepoint
default:
int category = u_charType(code_point);
switch (category) {
case U_UNASSIGNED:
return false;
case U_UPPERCASE_LETTER:
case U_LOWERCASE_LETTER:
case U_TITLECASE_LETTER:
case U_MODIFIER_LETTER:
case U_OTHER_LETTER:
break;
case U_NON_SPACING_MARK:
case U_ENCLOSING_MARK:
case U_COMBINING_SPACING_MARK:
return false;
case U_DECIMAL_DIGIT_NUMBER:
case U_LETTER_NUMBER:
case U_OTHER_NUMBER:
break;
case U_SPACE_SEPARATOR:
return false;
case U_LINE_SEPARATOR:
case U_PARAGRAPH_SEPARATOR:
case U_CONTROL_CHAR:
case U_FORMAT_CHAR:
case U_PRIVATE_USE_CHAR:
return false;
case U_DASH_PUNCTUATION:
case U_START_PUNCTUATION:
case U_END_PUNCTUATION:
case U_CONNECTOR_PUNCTUATION:
case U_OTHER_PUNCTUATION:
case U_MATH_SYMBOL:
case U_CURRENCY_SYMBOL:
break;
case U_MODIFIER_SYMBOL:
case U_OTHER_SYMBOL:
return false;
case U_INITIAL_PUNCTUATION:
case U_FINAL_PUNCTUATION:
break;
case U_SURROGATE:
default:
fprintf(stderr, "BUG: u_charType(0x%" PRIx32 ") returned unexpected value %d", code_point, category);
abort();
}

uint32_t s = u_charDirection(code_point);
Expand Down Expand Up @@ -242,6 +289,8 @@ static void print_code_point_list(FILE *out)
uint32_t range_start = 0;
for (uint32_t v = 0x20; v < 0x110000; ++v) {
bool this_allowed = is_permitted_code_point(v);
if (v < 0x7F)
assert(this_allowed);
if (this_allowed ^ last_allowed) {
last_allowed = this_allowed;
if (this_allowed) {
Expand Down
48 changes: 39 additions & 9 deletions qrexec-lib/validator-test.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,41 @@ static void character_must_be_allowed(UChar32 c)

static void character_must_be_forbidden(UChar32 c)
{
char buf[5];
uint8_t buf[128];
int32_t off = 0;
UBool e = false;
U8_APPEND((uint8_t *)buf, off, 4, c, e);
assert(!e && off <= 4);
buf[off] = 0;
if (qubes_pure_code_point_safe_for_display(c) ||
qubes_pure_string_safe_for_display(buf, 0))
if (qubes_pure_code_point_safe_for_display(c)) {
fprintf(stderr, "BUG: allowed codepoint U+%" PRIx32 "\n", (int32_t)c);
abort();
} else if (c < 0) {
return; // cannot be encoded sensibly
} else if (c < (1 << 7)) {
buf[off++] = c;
} else if (c < (1 << 11)) {
buf[off++] = (0xC0 | (c >> 6));
buf[off++] = (0x80 | (c & 0x3F));
} else if (c < (1L << 16)) {
buf[off++] = (0xE0 | (c >> 12));
buf[off++] = (0x80 | ((c >> 6) & 0x3F));
buf[off++] = (0x80 | (c & 0x3F));
} else if (c < 0x140000) {
buf[off++] = (0xF0 | (c >> 18));
buf[off++] = (0x80 | ((c >> 12) & 0x3F));
buf[off++] = (0x80 | ((c >> 6) & 0x3F));
buf[off++] = (0x80 | (c & 0x3F));
} else {
return; // trivially rejected
}
if (c < 0x110000 && !U_IS_SURROGATE(c)) {
UChar32 compare_c;
U8_GET(buf, 0, 0, off, compare_c);
assert(compare_c >= 0);
assert(compare_c == c);
}

buf[off++] = 0;
if (qubes_pure_string_safe_for_display((const char *)buf, 0))
{
fprintf(stderr, "BUG: allowed file name with codepoint U+%" PRIx32 "\n", (int32_t)c);
fprintf(stderr, "BUG: allowed string with codepoint U+%" PRIx32 "\n", (int32_t)c);
abort();
}
}
Expand All @@ -53,6 +78,9 @@ int main(int argc, char **argv)
assert(qubes_pure_validate_file_name((uint8_t *)u8"\u0400.txt"));
// As are unicode quotation marks
assert(qubes_pure_validate_file_name((uint8_t *)u8"\u201c"));
// As are ASCII characters, except DEL and controls
for (uint32_t i = 0x20; i < 0x7F; ++i)
character_must_be_allowed(i);
// And CJK ideographs
uint32_t cjk_ranges[] = {
0x03400, 0x04DBF,
Expand Down Expand Up @@ -92,7 +120,9 @@ int main(int argc, char **argv)
0x1FFFE, 0x1FFFF,
0x2FFFE, 0x2FFFF,
// Forbidden codepoints
0x3134B, 0x10FFFF,
0x0323B0, 0x10FFFF,
// Too large: beyond the last Unicode codepoint (U+10FFFF)
0x110000, UINT32_MAX - 1,
0x0,
};
for (size_t i = 0; i == 0 || forbidden[i]; i += 2) {
Expand Down

0 comments on commit 0221a90

Please sign in to comment.