From 01dc1a6e280699c1229ac3a0adcf304674db0777 Mon Sep 17 00:00:00 2001 From: Tim Perry Date: Fri, 19 Apr 2024 15:04:15 +0200 Subject: [PATCH] Fix isProbablyUtf8 to handle multibyte chars over the 1024 byte line Previously, multibyte chars crossing this line would make a valid UTF-8 string be considered as binary. That's particularly bad because if you're modifying a string containing multibyte chars it might change from utf-8 to binary when a multibyte char is pushed across that line, resulting in a very weird edit experience. --- src/util/buffer.ts | 26 +++++++++++- test/unit/util/buffer.spec.ts | 77 +++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 test/unit/util/buffer.spec.ts diff --git a/src/util/buffer.ts b/src/util/buffer.ts index 2435ced4..45e0c232 100644 --- a/src/util/buffer.ts +++ b/src/util/buffer.ts @@ -22,9 +22,31 @@ const binaryLaxDecoder = new TextDecoder('latin1', { fatal: false }); // as UTF8) but for _editing_ binary data we need a lossless encoding, not utf8. // (Monaco isn't perfectly lossless editing binary anyway, but we can try). export function isProbablyUtf8(buffer: Buffer) { + let dataToCheck: Buffer; + if (buffer.byteLength > 1028) { + // We want to trim the data to ~1KB, to avoid checking huge bodies in full. + // Unfortunately, for UTF-8 this isn't safe: if a multibyte char crosses the + // line when we cut the string, our slice of valid UTF-8 will be invalid. + // To handle this, we find the first non-continuation byte after 1024, and + // decode everything up to that point. + + const lastUtf8IndexBefore1024 = buffer + .slice(1024, 1028) // 4 bytes should be enough - max length of UTF8 char + .findIndex((byte) => + (byte & 0xC0) != 0x80 // 0x80 === 0b10... => continuation byte + ); + + // If there's no non-continuation byte, it's definitely not UTF-8 + if (lastUtf8IndexBefore1024 === -1) return false; + const cleanEndOfUtf8Data = 1024 + lastUtf8IndexBefore1024; + + dataToCheck = buffer.slice(0, cleanEndOfUtf8Data); + } else { + dataToCheck = buffer; + } + try { - // Just check the first 1kb, in case it's a huge file - const dataToCheck = buffer.slice(0, 1024); + // Fully decode our maybe-UTF8 chunk, and see if it's valid throughout: strictDecoder.decode(dataToCheck); return true; // Decoded OK, probably safe } catch (e) { diff --git a/test/unit/util/buffer.spec.ts b/test/unit/util/buffer.spec.ts new file mode 100644 index 00000000..739669e3 --- /dev/null +++ b/test/unit/util/buffer.spec.ts @@ -0,0 +1,77 @@ + +import { isProbablyUtf8 } from "../../../src/util/buffer"; +import { expect } from "../../test-setup"; + +describe("Buffer utils", () => { + describe("isProbablyUtf8", () => { + it("returns true for empty string", () => { + expect( + isProbablyUtf8(Buffer.from("")) + ).to.equal(true); + }); + + it("returns true for a short UTF-8 string", () => { + expect( + isProbablyUtf8(Buffer.from("hello world")) + ).to.equal(true); + }); + + it("returns false for a short binary string", () => { + expect( + isProbablyUtf8(Buffer.from([ + // The header for a PNG + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A + ])) + ).to.equal(false); + }); + + it("returns true for >1KB ASCII string", () => { + expect( + isProbablyUtf8(Buffer.alloc(4096).fill( + 'hello' + )) + ).to.equal(true); + }); + + it("returns true for >1KB multibyte UTF-8 string", () => { + expect( + isProbablyUtf8(Buffer.alloc(4096).fill( + '你好' // Hello in chinese (2x 3-byte chars) + // N.b. 1024 is not divisible by 3, so this + // tests char-split detection + )) + ).to.equal(true); + }); + + it("returns true for an exactly 1026 byte multibyte UTF-8 string", () => { + expect( + isProbablyUtf8(Buffer.alloc(1026).fill( + '你好' // Hello in chinese (2x 3-byte chars) + // 1026 is divisible by 3, but this means we run out of + // buffer looking for the next UTF-8 char from 1024+ + )) + ).to.equal(true); + }); + + it("returns false for >1KB binary string", () => { + expect( + isProbablyUtf8(Buffer.alloc(4096).fill( + Buffer.from([ + // The header for a PNG + 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A + ]) + )) + ).to.equal(false); + }); + + it("returns false for >1KB binary string of continuation bytes", () => { + expect( + isProbablyUtf8(Buffer.from( + Buffer.alloc(4096).fill(Buffer.from([ + 0xba // All continuation bytes + ])) + )) + ).to.equal(false); + }); + }); +}) \ No newline at end of file