From 0221a909394fbd7317c8ef83754f7de99763c0a7 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Fri, 29 Dec 2023 18:13:38 -0500 Subject: [PATCH] Math and currency symbols are safe for display This also cleans up some technical debt: a Python script generated C code, which was in turn used to generate more C code! Instead, just use ICU4C in the C code directly, deleting one of two code generators. Finally, improve the tests to ensure that too-large Unicode codepoints are rejected and ensure that ASCII characters that are neither DEL nor controls are accepted. (cherry picked from commit 4e9e357276898167e802d6bbc77e236fd2ff17a4) --- qrexec-lib/Makefile | 6 +- qrexec-lib/gentbl.py | 29 --------- qrexec-lib/unicode-allowlist-table.c | 95 +++++++++++++++------------- qrexec-lib/unicode-generator.c | 57 +++++++++++++++-- qrexec-lib/validator-test.c | 48 +++++++++++--- 5 files changed, 143 insertions(+), 92 deletions(-) delete mode 100755 qrexec-lib/gentbl.py diff --git a/qrexec-lib/Makefile b/qrexec-lib/Makefile index 13799b30..3c918220 100644 --- a/qrexec-lib/Makefile +++ b/qrexec-lib/Makefile @@ -44,14 +44,10 @@ endif /bin/bash -euc 'set -o pipefail; ./unicode-generator | diff -u ./unicode-allowlist-table.c -' .PHONY: check-table-up-to-date -unicode-generator.o: unicode-class-table.c -unicode-class-table.c: gentbl.py Makefile - python3 -- $< > $@ - %.a: $(objs) $(AR) rcs $@ $^ clean: - rm -f ./*.o ./*~ ./*.a ./*.so.* ./*.dep unicode-class-table.c unicode-allowlist-table.c.tmp + rm -f ./*.o ./*~ ./*.a ./*.so.* ./*.dep unicode-allowlist-table.c.tmp install: mkdir -p $(DESTDIR)$(LIBDIR) diff --git a/qrexec-lib/gentbl.py b/qrexec-lib/gentbl.py deleted file mode 100755 index dd52b319..00000000 --- a/qrexec-lib/gentbl.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/python3 -- -import sys -def main(): - def print_interval(interval, last_cat): - if interval[0] != interval[1]: - print(f' case 0x{interval[0]:X} ... 0x{interval[1]:X}: // category {last_cat}' - .replace('category Cs', 'surrogates')) - else: - print(f' case 0x{interval[0]:X}: // category {last_cat}') - import sys - interval = [0, 0] - from unicodedata import category, name - cat = last_cat = 'Cc' - for i in range(0, 0x110000): - last_cat = cat - cat = category(chr(i)) - if cat == last_cat: - interval[1] = i - else: - # Allow the interval consisting only of 0x20, to allow spaces in - # file names - if (last_cat[0] in ('M', 'S', 'C', 'Z')) and interval[1] > 0x7E: - print_interval(interval, last_cat) - interval = [i, i] - print_interval(interval, last_cat) - print(' case 0x110000 ... UINT32_MAX: // too large') - sys.stdout.flush() -if __name__ == '__main__': - main() diff --git a/qrexec-lib/unicode-allowlist-table.c b/qrexec-lib/unicode-allowlist-table.c index 3250a30b..c3df96c6 100644 --- a/qrexec-lib/unicode-allowlist-table.c +++ b/qrexec-lib/unicode-allowlist-table.c @@ -1,12 +1,10 @@ case 0x000020 ... 0x00007e: - case 0x0000a1: + case 0x0000a1 ... 0x0000a5: case 0x0000a7: - case 0x0000aa ... 0x0000ab: - case 0x0000b2 ... 0x0000b3: + case 0x0000aa ... 0x0000ac: + case 0x0000b1 ... 0x0000b3: case 0x0000b5 ... 0x0000b7: - case 0x0000b9 ... 0x0000d6: - case 0x0000d8 ... 0x0000f6: - case 0x0000f8 ... 0x0002c1: + case 0x0000b9 ... 0x0002c1: case 0x0002c6 ... 0x0002d1: case 0x0002e0 ... 0x0002e4: case 0x0002ec: @@ -18,11 +16,11 @@ case 0x00038c: case 0x00038e ... 0x0003a1: case 0x0003a3 ... 0x0003e1: - case 0x0003f0 ... 0x0003f5: - case 0x0003f7 ... 0x000481: + case 0x0003f0 ... 0x000481: case 0x00048a ... 0x00052f: case 0x00060c: case 0x000964 ... 0x000965: + case 0x000e3f: case 0x0010fb: case 0x001100 ... 0x0011ff: case 0x0016eb ... 0x0016ed: @@ -56,38 +54,52 @@ case 0x001ff2 ... 0x001ff4: case 0x001ff6 ... 0x001ffc: case 0x002010 ... 0x002027: - case 0x002030 ... 0x002043: - case 0x002045 ... 0x002051: - case 0x002053 ... 0x00205e: + case 0x002030 ... 0x00205e: case 0x002070 ... 0x002071: - case 0x002074 ... 0x002079: - case 0x00207d ... 0x002089: - case 0x00208d ... 0x00208e: + case 0x002074 ... 0x00208e: case 0x002090 ... 0x00209c: + case 0x0020a0 ... 0x0020c0: case 0x002102: case 0x002107: case 0x00210a ... 0x002113: case 0x002115: - case 0x002119 ... 0x00211d: + case 0x002118 ... 0x00211d: case 0x002124: case 0x002126: case 0x002128: case 0x00212a ... 0x00212d: case 0x00212f ... 0x002139: - case 0x00213c ... 0x00213f: - case 0x002145 ... 0x002149: + case 0x00213c ... 0x002149: + case 0x00214b: case 0x00214e: case 0x002150 ... 0x002189: + case 0x002190 ... 0x002194: + case 0x00219a ... 0x00219b: + case 0x0021a0: + case 0x0021a3: + case 0x0021a6: + case 0x0021ae: + case 0x0021ce ... 0x0021cf: + case 0x0021d2: + case 0x0021d4: + case 0x0021f4 ... 0x0022ff: case 0x002308 ... 0x00230b: + case 0x002320 ... 0x002321: case 0x002329 ... 0x00232a: + case 0x00237c: + case 0x00239b ... 0x0023b3: + case 0x0023dc ... 0x0023e1: case 0x002460 ... 0x00249b: case 0x0024ea ... 0x0024ff: + case 0x0025b7: + case 0x0025c1: + case 0x0025f8 ... 0x0025ff: + case 0x00266f: case 0x002768 ... 0x002793: - case 0x0027c5 ... 0x0027c6: - case 0x0027e6 ... 0x0027ef: - case 0x002983 ... 0x002998: - case 0x0029d8 ... 0x0029db: - case 0x0029fc ... 0x0029fd: + case 0x0027c0 ... 0x0027ff: + case 0x002900 ... 0x002aff: + case 0x002b30 ... 0x002b44: + case 0x002b47 ... 0x002b4c: case 0x002c60 ... 0x002c7f: case 0x002e00 ... 0x002e4f: case 0x002e52 ... 0x002e5d: @@ -122,6 +134,7 @@ case 0x00a7d5 ... 0x00a7d9: case 0x00a7f2 ... 0x00a7ff: case 0x00a830 ... 0x00a835: + case 0x00a838: case 0x00a92e: case 0x00a960 ... 0x00a97c: case 0x00a9cf: @@ -136,22 +149,18 @@ case 0x00fd3e ... 0x00fd3f: case 0x00fe10 ... 0x00fe19: case 0x00fe30 ... 0x00fe52: - case 0x00fe54 ... 0x00fe61: - case 0x00fe63: - case 0x00fe68: - case 0x00fe6a ... 0x00fe6b: - case 0x00ff01 ... 0x00ff03: - case 0x00ff05 ... 0x00ff0a: - case 0x00ff0c ... 0x00ff1b: - case 0x00ff1f ... 0x00ff3d: + case 0x00fe54 ... 0x00fe66: + case 0x00fe68 ... 0x00fe6b: + case 0x00ff01 ... 0x00ff3d: case 0x00ff3f: - case 0x00ff41 ... 0x00ff5b: - case 0x00ff5d: - case 0x00ff5f ... 0x00ffbe: + case 0x00ff41 ... 0x00ffbe: case 0x00ffc2 ... 0x00ffc7: case 0x00ffca ... 0x00ffcf: case 0x00ffd2 ... 0x00ffd7: case 0x00ffda ... 0x00ffdc: + case 0x00ffe0 ... 0x00ffe2: + case 0x00ffe5 ... 0x00ffe6: + case 0x00ffe9 ... 0x00ffec: case 0x010100 ... 0x010102: case 0x010107 ... 0x010133: case 0x010140 ... 0x010178: @@ -165,8 +174,11 @@ case 0x01aff5 ... 0x01affb: case 0x01affd ... 0x01affe: case 0x01b000 ... 0x01b122: + case 0x01b132: case 0x01b150 ... 0x01b152: + case 0x01b155: case 0x01b164 ... 0x01b167: + case 0x01d2c0 ... 0x01d2d3: case 0x01d2e0 ... 0x01d2f3: case 0x01d360 ... 0x01d378: case 0x01d400 ... 0x01d454: @@ -188,25 +200,18 @@ case 0x01d546: case 0x01d54a ... 0x01d550: case 0x01d552 ... 0x01d6a5: - case 0x01d6a8 ... 0x01d6c0: - case 0x01d6c2 ... 0x01d6da: - case 0x01d6dc ... 0x01d6fa: - case 0x01d6fc ... 0x01d714: - case 0x01d716 ... 0x01d734: - case 0x01d736 ... 0x01d74e: - case 0x01d750 ... 0x01d76e: - case 0x01d770 ... 0x01d788: - case 0x01d78a ... 0x01d7a8: - case 0x01d7aa ... 0x01d7c2: - case 0x01d7c4 ... 0x01d7cb: + case 0x01d6a8 ... 0x01d7cb: case 0x01d7ce ... 0x01d7ff: case 0x01df00 ... 0x01df1e: + case 0x01df25 ... 0x01df2a: + case 0x01e030 ... 0x01e06d: case 0x01f100 ... 0x01f10c: case 0x01fbf0 ... 0x01fbf9: case 0x020000 ... 0x02a6df: - case 0x02a700 ... 0x02b738: + case 0x02a700 ... 0x02b739: case 0x02b740 ... 0x02b81d: case 0x02b820 ... 0x02cea1: case 0x02ceb0 ... 0x02ebe0: case 0x02f800 ... 0x02fa1d: case 0x030000 ... 0x03134a: + case 0x031350 ... 0x0323af: diff --git a/qrexec-lib/unicode-generator.c b/qrexec-lib/unicode-generator.c index 054124b5..9c96372c 100644 --- a/qrexec-lib/unicode-generator.c +++ b/qrexec-lib/unicode-generator.c @@ -23,15 +23,62 @@ static bool is_permitted_code_point(uint32_t const code_point) if (!(U_IS_UNICODE_CHAR(code_point))) return false; + /* Reject all control characters */ + if (code_point < 0x20) + return false; + + /* Allow all other ASCII characters except DEL */ + if (code_point < 0x7F) + return true; + /* * Validate that the codepoint is a valid scalar value and is not a symbol, * space, unassigned character, or control character. */ - switch (code_point) { -#include "unicode-class-table.c" - return false; // Invalid UTF-8 or forbidden codepoint - default: + int category = u_charType(code_point); + switch (category) { + case U_UNASSIGNED: + return false; + case U_UPPERCASE_LETTER: + case U_LOWERCASE_LETTER: + case U_TITLECASE_LETTER: + case U_MODIFIER_LETTER: + case U_OTHER_LETTER: + break; + case U_NON_SPACING_MARK: + case U_ENCLOSING_MARK: + case U_COMBINING_SPACING_MARK: + return false; + case U_DECIMAL_DIGIT_NUMBER: + case U_LETTER_NUMBER: + case U_OTHER_NUMBER: break; + case U_SPACE_SEPARATOR: + return false; + case U_LINE_SEPARATOR: + case U_PARAGRAPH_SEPARATOR: + case U_CONTROL_CHAR: + case U_FORMAT_CHAR: + case U_PRIVATE_USE_CHAR: + return false; + case U_DASH_PUNCTUATION: + case U_START_PUNCTUATION: + case U_END_PUNCTUATION: + case U_CONNECTOR_PUNCTUATION: + case U_OTHER_PUNCTUATION: + case U_MATH_SYMBOL: + case U_CURRENCY_SYMBOL: + break; + case U_MODIFIER_SYMBOL: + case U_OTHER_SYMBOL: + return false; + case U_INITIAL_PUNCTUATION: + case U_FINAL_PUNCTUATION: + break; + case U_SURROGATE: + default: + fprintf(stderr, "BUG: u_charType(0x%" PRIx32 ") returned unexpected value %d", code_point, category); + abort(); } uint32_t s = u_charDirection(code_point); @@ -242,6 +289,8 @@ static void print_code_point_list(FILE *out) uint32_t range_start = 0; for (uint32_t v = 0x20; v < 0x110000; ++v) { bool this_allowed = is_permitted_code_point(v); + if (v < 0x7F) + assert(this_allowed); if (this_allowed ^ last_allowed) { last_allowed = this_allowed; if (this_allowed) { diff --git a/qrexec-lib/validator-test.c b/qrexec-lib/validator-test.c index 887fd359..0adf55b9 100644 --- a/qrexec-lib/validator-test.c +++ b/qrexec-lib/validator-test.c @@ -27,16 +27,41 @@ static void character_must_be_allowed(UChar32 c) static void character_must_be_forbidden(UChar32 c) { - char buf[5]; + uint8_t buf[128]; int32_t off = 0; - UBool e = false; - U8_APPEND((uint8_t *)buf, off, 4, c, e); - assert(!e && off <= 4); - buf[off] = 0; - if (qubes_pure_code_point_safe_for_display(c) || - qubes_pure_string_safe_for_display(buf, 0)) + if (qubes_pure_code_point_safe_for_display(c)) { + fprintf(stderr, "BUG: allowed codepoint U+%" PRIx32 "\n", (int32_t)c); + abort(); + } else if (c < 0) { + return; // cannot be encoded sensibly + } else if (c < (1 << 7)) { + buf[off++] = c; + } else if (c < (1 << 11)) { + buf[off++] = (0xC0 | (c >> 6)); + buf[off++] = (0x80 | (c & 0x3F)); + } else if (c < (1L << 16)) { + buf[off++] = (0xE0 | (c >> 12)); + buf[off++] = (0x80 | ((c >> 6) & 0x3F)); + buf[off++] = (0x80 | (c & 0x3F)); + } else if (c < 0x140000) { + buf[off++] = (0xF0 | (c >> 18)); + buf[off++] = (0x80 | ((c >> 12) & 0x3F)); + buf[off++] = (0x80 | ((c >> 6) & 0x3F)); + buf[off++] = (0x80 | (c & 0x3F)); + } else { + return; // trivially rejected + } + if (c < 0x110000 && !U_IS_SURROGATE(c)) { + UChar32 compare_c; + U8_GET(buf, 0, 0, off, compare_c); + assert(compare_c >= 0); + assert(compare_c == c); + } + + buf[off++] = 0; + if (qubes_pure_string_safe_for_display((const char *)buf, 0)) { - fprintf(stderr, "BUG: allowed file name with codepoint U+%" PRIx32 "\n", (int32_t)c); + fprintf(stderr, "BUG: allowed string with codepoint U+%" PRIx32 "\n", (int32_t)c); abort(); } } @@ -53,6 +78,9 @@ int main(int argc, char **argv) assert(qubes_pure_validate_file_name((uint8_t *)u8"\u0400.txt")); // As are unicode quotation marks assert(qubes_pure_validate_file_name((uint8_t *)u8"\u201c")); + // As are ASCII characters, except DEL and controls + for (uint32_t i = 0x20; i < 0x7F; ++i) + character_must_be_allowed(i); // And CJK ideographs uint32_t cjk_ranges[] = { 0x03400, 0x04DBF, @@ -92,7 +120,9 @@ int main(int argc, char **argv) 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, // Forbidden codepoints - 0x3134B, 0x10FFFF, + 0x0323B0, 0x10FFFF, + // Too long + 0x110000, UINT32_MAX - 1, 0x0, }; for (size_t i = 0; i == 0 || forbidden[i]; i += 2) {