Skip to content

Commit

Permalink
Fix isProbablyUtf8 to handle multibyte chars over the 1024 byte line
Browse files Browse the repository at this point in the history
Previously, multibyte chars crossing this line would make a valid UTF-8
string be considered as binary. That's particularly bad because if
you're modifying a string containing multibyte chars it might change
from utf-8 to binary when a multibyte char is pushed across that line,
resulting in a very weird edit experience.
  • Loading branch information
pimterry committed Apr 19, 2024
1 parent 64b2a8e commit 01dc1a6
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 2 deletions.
26 changes: 24 additions & 2 deletions src/util/buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,31 @@ const binaryLaxDecoder = new TextDecoder('latin1', { fatal: false });
// as UTF8) but for _editing_ binary data we need a lossless encoding, not utf8.
// (Monaco isn't perfectly lossless editing binary anyway, but we can try).
export function isProbablyUtf8(buffer: Buffer) {
let dataToCheck: Buffer;
if (buffer.byteLength > 1028) {
// We want to trim the data to ~1KB, to avoid checking huge bodies in full.
// Unfortunately, for UTF-8 this isn't safe: if a multibyte char crosses the
// line when we cut the string, our slice of valid UTF-8 will be invalid.
// To handle this, we find the first non-continuation byte after 1024, and
// decode everything up to that point.

const lastUtf8IndexBefore1024 = buffer
.slice(1024, 1028) // 4 bytes should be enough - max length of UTF8 char
.findIndex((byte) =>
(byte & 0xC0) != 0x80 // 0x80 === 0b10... => continuation byte
);

// If there's no non-continuation byte, it's definitely not UTF-8
if (lastUtf8IndexBefore1024 === -1) return false;
const cleanEndOfUtf8Data = 1024 + lastUtf8IndexBefore1024;

dataToCheck = buffer.slice(0, cleanEndOfUtf8Data);
} else {
dataToCheck = buffer;
}

try {
// Just check the first 1kb, in case it's a huge file
const dataToCheck = buffer.slice(0, 1024);
// Fully decode our maybe-UTF8 chunk, and see if it's valid throughout:
strictDecoder.decode(dataToCheck);
return true; // Decoded OK, probably safe
} catch (e) {
Expand Down
77 changes: 77 additions & 0 deletions test/unit/util/buffer.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@

import { isProbablyUtf8 } from "../../../src/util/buffer";
import { expect } from "../../test-setup";

describe("Buffer utils", () => {
describe("isProbablyUtf8", () => {
it("returns true for empty string", () => {
expect(
isProbablyUtf8(Buffer.from(""))
).to.equal(true);
});

it("returns true for a short UTF-8 string", () => {
expect(
isProbablyUtf8(Buffer.from("hello world"))
).to.equal(true);
});

it("returns false for a short binary string", () => {
expect(
isProbablyUtf8(Buffer.from([
// The header for a PNG
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A
]))
).to.equal(false);
});

it("returns true for >1KB ASCII string", () => {
expect(
isProbablyUtf8(Buffer.alloc(4096).fill(
'hello'
))
).to.equal(true);
});

it("returns true for >1KB multibyte UTF-8 string", () => {
expect(
isProbablyUtf8(Buffer.alloc(4096).fill(
'你好' // Hello in chinese (2x 3-byte chars)
// N.b. 1024 is not divisible by 3, so this
// tests char-split detection
))
).to.equal(true);
});

it("returns true for an exactly 1026 byte multibyte UTF-8 string", () => {
expect(
isProbablyUtf8(Buffer.alloc(1026).fill(
'你好' // Hello in chinese (2x 3-byte chars)
// 1026 is divisible by 3, but this means we run out of
// buffer looking for the next UTF-8 char from 1024+
))
).to.equal(true);
});

it("returns false for >1KB binary string", () => {
expect(
isProbablyUtf8(Buffer.alloc(4096).fill(
Buffer.from([
// The header for a PNG
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A
])
))
).to.equal(false);
});

it("returns false for >1KB binary string of continuation bytes", () => {
expect(
isProbablyUtf8(Buffer.from(
Buffer.alloc(4096).fill(Buffer.from([
0xba // All continuation bytes
]))
))
).to.equal(false);
});
});
})

0 comments on commit 01dc1a6

Please sign in to comment.