From 0db371849f1ab9bd67d45328a9e393ee619ff3ef Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sat, 1 Apr 2023 15:15:09 +0100 Subject: [PATCH 01/12] refactor(tokenizer): Use `EntityDecoder` --- src/Tokenizer.ts | 279 ++++++++--------------------------------------- 1 file changed, 48 insertions(+), 231 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index cf20dc688..71538bc44 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -1,9 +1,8 @@ import { + EntityDecoder, + EntityDecoderMode, htmlDecodeTree, xmlDecodeTree, - BinTrieFlags, - determineBranch, - replaceCodePoint, } from "entities/lib/decode.js"; const enum CharCodes { @@ -73,11 +72,7 @@ const enum State { SpecialStartSequence, InSpecialTag, - BeforeEntity, // & - BeforeNumericEntity, // # - InNamedEntity, - InNumericEntity, - InHexEntity, // X + InEntity, } function isWhitespace(c: number): boolean { @@ -94,10 +89,6 @@ function isEndOfTagSection(c: number): boolean { return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c); } -function isNumber(c: number): boolean { - return c >= CharCodes.Zero && c <= CharCodes.Nine; -} - function isASCIIAlpha(c: number): boolean { return ( (c >= CharCodes.LowerA && c <= CharCodes.LowerZ) || @@ -105,13 +96,6 @@ function isASCIIAlpha(c: number): boolean { ); } -function isHexDigit(c: number): boolean { - return ( - (c >= CharCodes.UpperA && c <= CharCodes.UpperF) || - (c >= CharCodes.LowerA && c <= CharCodes.LowerF) - ); -} - export enum QuoteType { NoValue = 0, Unquoted = 1, @@ -161,6 +145,8 @@ export default class Tokenizer { private sectionStart = 0; /** The index within the buffer that we are currently looking at. */ private index = 0; + /** The start of the last entity. */ + private entityStart = 0; /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */ private baseState = State.Text; /** For special parsing behavior inside of script and style tags. */ @@ -172,7 +158,7 @@ export default class Tokenizer { private readonly xmlMode: boolean; private readonly decodeEntities: boolean; - private readonly entityTrie: Uint16Array; + private readonly entityDecoder: EntityDecoder; constructor( { @@ -183,7 +169,10 @@ export default class Tokenizer { ) { this.xmlMode = xmlMode; this.decodeEntities = decodeEntities; - this.entityTrie = xmlMode ? xmlDecodeTree : htmlDecodeTree; + this.entityDecoder = new EntityDecoder( + xmlMode ? xmlDecodeTree : htmlDecodeTree, + (cp) => this.emitCodePoint(cp) + ); } public reset(): void { @@ -243,7 +232,7 @@ export default class Tokenizer { this.state = State.BeforeTagName; this.sectionStart = this.index; } else if (this.decodeEntities && c === CharCodes.Amp) { - this.state = State.BeforeEntity; + this.startEntity(); } } @@ -298,7 +287,7 @@ export default class Tokenizer { if (this.currentSequence === Sequences.TitleEnd) { // We have to parse entities in tags. if (this.decodeEntities && c === CharCodes.Amp) { - this.state = State.BeforeEntity; + this.startEntity(); } } else if (this.fastForwardTo(CharCodes.Lt)) { // Outside of <title> tags, we can fast-forward. @@ -538,8 +527,7 @@ export default class Tokenizer { ); this.state = State.BeforeAttributeName; } else if (this.decodeEntities && c === CharCodes.Amp) { - this.baseState = this.state; - this.state = State.BeforeEntity; + this.startEntity(); } } private stateInAttributeValueDoubleQuotes(c: number): void { @@ -556,8 +544,7 @@ export default class Tokenizer { this.state = State.BeforeAttributeName; this.stateBeforeAttributeName(c); } else if (this.decodeEntities && c === CharCodes.Amp) { - this.baseState = this.state; - this.state = State.BeforeEntity; + this.startEntity(); } } private stateBeforeDeclaration(c: number): void { @@ -615,177 +602,35 @@ export default class Tokenizer { } } - private trieIndex = 0; - private trieCurrent = 0; - /** For named entities, the index of the value. For numeric entities, the code point. */ - private entityResult = 0; - private entityExcess = 0; - - private stateBeforeEntity(c: number): void { - // Start excess with 1 to include the '&' - this.entityExcess = 1; - this.entityResult = 0; - - if (c === CharCodes.Number) { - this.state = State.BeforeNumericEntity; - } else if (c === CharCodes.Amp) { - // We have two `&` characters in a row. Stay in the current state. - } else { - this.trieIndex = 0; - this.trieCurrent = this.entityTrie[0]; - this.state = State.InNamedEntity; - this.stateInNamedEntity(c); - } - } - - private stateInNamedEntity(c: number): void { - this.entityExcess += 1; - - this.trieIndex = determineBranch( - this.entityTrie, - this.trieCurrent, - this.trieIndex + 1, - c + private startEntity() { + this.baseState = this.state; + this.state = State.InEntity; + this.entityStart = this.index; + this.entityDecoder.startEntity( + this.xmlMode + ? EntityDecoderMode.Strict + : this.baseState === State.Text || + this.baseState === State.InSpecialTag + ? EntityDecoderMode.Text + : EntityDecoderMode.Attribute ); - - if (this.trieIndex < 0) { - this.emitNamedEntity(); - this.index--; - return; - } - - this.trieCurrent = this.entityTrie[this.trieIndex]; - - const masked = this.trieCurrent & BinTrieFlags.VALUE_LENGTH; - - // If the branch is a value, store it and continue - if (masked) { - // The mask is the number of bytes of the value, including the current byte. - const valueLength = (masked >> 14) - 1; - - // If we have a legacy entity while parsing strictly, just skip the number of bytes - if (!this.allowLegacyEntity() && c !== CharCodes.Semi) { - this.trieIndex += valueLength; - } else { - // Add 1 as we have already incremented the excess - const entityStart = this.index - this.entityExcess + 1; - - if (entityStart > this.sectionStart) { - this.emitPartial(this.sectionStart, entityStart); - } - - // If this is a surrogate pair, consume the next two bytes - this.entityResult = this.trieIndex; - this.trieIndex += valueLength; - this.entityExcess = 0; - this.sectionStart = this.index + 1; - - if (valueLength === 0) { - this.emitNamedEntity(); - } - } - } } - private emitNamedEntity(): void { - this.state = this.baseState; - - if (this.entityResult === 0) { - return; - } - - const valueLength = - (this.entityTrie[this.entityResult] & BinTrieFlags.VALUE_LENGTH) >> - 14; + private stateInEntity(): void { + const length = this.entityDecoder.write(this.buffer, this.index); - switch (valueLength) { - case 1: { - this.emitCodePoint( - this.entityTrie[this.entityResult] & - ~BinTrieFlags.VALUE_LENGTH - ); - break; - } - case 2: { - this.emitCodePoint(this.entityTrie[this.entityResult + 1]); - break; - } - case 3: { - this.emitCodePoint(this.entityTrie[this.entityResult + 1]); - this.emitCodePoint(this.entityTrie[this.entityResult + 2]); - } - } - } + // If `length` is negative, we need to wait for more data. + if (length >= 0) { + this.index = this.entityStart + length; + this.state = this.baseState; - private stateBeforeNumericEntity(c: number): void { - if ((c | 0x20) === CharCodes.LowerX) { - this.entityExcess++; - this.state = State.InHexEntity; - } else { - this.state = State.InNumericEntity; - this.stateInNumericEntity(c); - } - } - - private emitNumericEntity(strict: boolean) { - const entityStart = this.index - this.entityExcess - 1; - const numberStart = - entityStart + 2 + Number(this.state === State.InHexEntity); - - if (numberStart !== this.index) { - // Emit leading data if any - if (entityStart > this.sectionStart) { - this.emitPartial(this.sectionStart, entityStart); - } - - this.sectionStart = this.index + Number(strict); - this.emitCodePoint(replaceCodePoint(this.entityResult)); - } - this.state = this.baseState; - } - private stateInNumericEntity(c: number): void { - if (c === CharCodes.Semi) { - this.emitNumericEntity(true); - } else if (isNumber(c)) { - this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero); - this.entityExcess++; - } else { - if (this.allowLegacyEntity()) { - this.emitNumericEntity(false); - } else { - this.state = this.baseState; - } - this.index--; - } - } - private stateInHexEntity(c: number): void { - if (c === CharCodes.Semi) { - this.emitNumericEntity(true); - } else if (isNumber(c)) { - this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero); - this.entityExcess++; - } else if (isHexDigit(c)) { - this.entityResult = - this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10); - this.entityExcess++; - } else { - if (this.allowLegacyEntity()) { - this.emitNumericEntity(false); - } else { - this.state = this.baseState; + // If we encountered an entity, we already emitted the current section. + if (length > 0) { + this.sectionStart = this.index; } - this.index--; } } - private allowLegacyEntity() { - return ( - !this.xmlMode && - (this.baseState === State.Text || - this.baseState === State.InSpecialTag) - ); - } - /** * Remove data that has already been consumed from the buffer. */ @@ -918,26 +763,10 @@ export default class Tokenizer { this.stateInProcessingInstruction(c); break; } - case State.InNamedEntity: { - this.stateInNamedEntity(c); - break; - } - case State.BeforeEntity: { - this.stateBeforeEntity(c); + case State.InEntity: { + this.stateInEntity(); break; } - case State.InHexEntity: { - this.stateInHexEntity(c); - break; - } - case State.InNumericEntity: { - this.stateInNumericEntity(c); - break; - } - default: { - // `this._state === State.BeforeNumericEntity` - this.stateBeforeNumericEntity(c); - } } this.index++; } @@ -945,8 +774,8 @@ export default class Tokenizer { } private finish() { - if (this.state === State.InNamedEntity) { - this.emitNamedEntity(); + if (this.state === State.InEntity) { + this.entityDecoder.end(); } // If there is remaining data, emit it in a reasonable way @@ -965,18 +794,6 @@ export default class Tokenizer { } else { this.cbs.oncomment(this.sectionStart, endIndex, 0); } - } else if ( - this.state === State.InNumericEntity && - this.allowLegacyEntity() - ) { - this.emitNumericEntity(false); - // All trailing data will have been consumed - } else if ( - this.state === State.InHexEntity && - this.allowLegacyEntity() - ) { - this.emitNumericEntity(false); - // All trailing data will have been consumed } else if ( this.state === State.InTagName || this.state === State.BeforeAttributeName || @@ -997,23 +814,23 @@ export default class Tokenizer { } } - private emitPartial(start: number, endIndex: number): void { - if ( - this.baseState !== State.Text && - this.baseState !== State.InSpecialTag - ) { - this.cbs.onattribdata(start, endIndex); - } else { - this.cbs.ontext(start, endIndex); - } - } private emitCodePoint(cp: number): void { if ( this.baseState !== State.Text && this.baseState !== State.InSpecialTag ) { + if (this.sectionStart < this.entityStart) { + this.cbs.onattribdata(this.sectionStart, this.entityStart); + this.sectionStart = this.entityStart; + } + this.cbs.onattribentity(cp); } else { + if (this.sectionStart < this.entityStart) { + this.cbs.ontext(this.sectionStart, this.entityStart); + this.sectionStart = this.entityStart; + } + this.cbs.ontextentity(cp); } } From 80647b52992286f34bcc79490ab9dfb512138006 Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sun, 2 Apr 2023 09:58:01 +0100 Subject: [PATCH 02/12] Fix up indices --- src/Tokenizer.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 71538bc44..6276d9a79 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -617,7 +617,10 @@ export default class Tokenizer { } private stateInEntity(): void { - const length = this.entityDecoder.write(this.buffer, this.index); + const length = this.entityDecoder.write( + this.buffer, + this.index - this.offset + ); // If `length` is negative, we need to wait for more data. if (length >= 0) { @@ -775,7 +778,8 @@ export default class Tokenizer { private finish() { if (this.state === State.InEntity) { - this.entityDecoder.end(); + this.index += this.entityDecoder.end(); + this.state = this.baseState; } // If there is remaining data, emit it in a reasonable way From e3843f225d418bf4fc9b1e5b40174dff83d3690c Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sun, 2 Apr 2023 11:56:27 +0100 Subject: [PATCH 03/12] Add consumed to decoder callback --- src/Tokenizer.ts | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 6276d9a79..1e9ff0a5e 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -171,7 +171,7 @@ export default class Tokenizer { this.decodeEntities = decodeEntities; this.entityDecoder = new EntityDecoder( xmlMode ? xmlDecodeTree : htmlDecodeTree, - (cp) => this.emitCodePoint(cp) + (cp, consumed) => this.emitCodePoint(cp, consumed) ); } @@ -622,15 +622,9 @@ export default class Tokenizer { this.index - this.offset ); - // If `length` is negative, we need to wait for more data. + // If `length` is positive, we are done with the entity. if (length >= 0) { - this.index = this.entityStart + length; this.state = this.baseState; - - // If we encountered an entity, we already emitted the current section. - if (length > 0) { - this.sectionStart = this.index; - } } } @@ -778,7 +772,7 @@ export default class Tokenizer { private finish() { if (this.state === State.InEntity) { - this.index += this.entityDecoder.end(); + this.entityDecoder.end(); this.state = this.baseState; } @@ -818,22 +812,24 @@ export default class Tokenizer { } } - private emitCodePoint(cp: number): void { + private emitCodePoint(cp: number, consumed: number): void { if ( this.baseState !== State.Text && this.baseState !== State.InSpecialTag ) { if (this.sectionStart < this.entityStart) { this.cbs.onattribdata(this.sectionStart, this.entityStart); - this.sectionStart = this.entityStart; } + this.sectionStart = this.entityStart + consumed; + this.index = this.sectionStart - 1; this.cbs.onattribentity(cp); } else { if (this.sectionStart < this.entityStart) { this.cbs.ontext(this.sectionStart, this.entityStart); - this.sectionStart = this.entityStart; } + this.sectionStart = this.entityStart + consumed; + this.index = this.sectionStart - 1; this.cbs.ontextentity(cp); } From 0b2e8b13fd44760a71d234a0721b243b81963e0c Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sun, 2 Apr 2023 11:56:42 +0100 Subject: [PATCH 04/12] Add xml mode entity tokenizer test --- src/Tokenizer.spec.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts index 438e23bae..dc8d92297 100644 --- a/src/Tokenizer.spec.ts +++ b/src/Tokenizer.spec.ts @@ -1,10 +1,10 @@ import { Tokenizer } from "./index.js"; import type { Callbacks } from "./Tokenizer.js"; -function tokenize(data: string) { +function tokenize(data: string, options = {}) { const log: unknown[][] = []; const tokenizer = new Tokenizer( - {}, + options, new Proxy( {}, { @@ -56,6 +56,13 @@ describe("Tokenizer", () => { }); }); + it("should support XML entities", () => + expect( + tokenize("&>&<üabcde", { + xmlMode: true, + }) + ).toMatchSnapshot()); + it("should not lose data when pausing", () => { const log: unknown[][] = []; const tokenizer = new Tokenizer( From 85d3056afda3436a77da55006030a898b221c818 Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sun, 2 Apr 2023 21:44:10 +0100 Subject: [PATCH 05/12] Fix non-entity --- src/Tokenizer.ts | 4 ++ src/__snapshots__/Tokenizer.spec.ts.snap | 48 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 1e9ff0a5e..feebe346d 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -625,6 +625,10 @@ export default class Tokenizer { // If `length` is positive, we are done with the entity. if (length >= 0) { this.state = this.baseState; + + if (length === 0) { + this.index = this.entityStart + 1; + } } } diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index ec3de47da..2f3fe8ea2 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -87,6 +87,54 @@ exports[`Tokenizer should not lose data when pausing 1`] = ` ] `; +exports[`Tokenizer should support XML entities 1`] = ` +[ + [ + "ontextentity", + 38, + ], + [ + "ontextentity", + 62, + ], + [ + "ontext", + 9, + 13, + ], + [ + "ontextentity", + 60, + ], + [ + "ontext", + 17, + 23, + ], + [ + "ontextentity", + 97, + ], + [ + "ontext", + 29, + 34, + ], + [ + "ontextentity", + 99, + ], + [ + "ontext", + 39, + 49, + ], + [ + "onend", + ], +] +`; + exports[`Tokenizer should support self-closing special tags for self-closing script tag 1`] = ` [ [ From d2813b873ec138076f83b24bc9d79ff3a0a029c9 Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sun, 2 Apr 2023 21:54:33 +0100 Subject: [PATCH 06/12] Remove most base state writes --- src/Tokenizer.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index feebe346d..d42616b2a 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -444,7 +444,6 @@ export default class Tokenizer { // Skip everything until ">" if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { this.state = State.Text; - this.baseState = State.Text; this.sectionStart = this.index + 1; } } @@ -457,7 +456,6 @@ export default class Tokenizer { } else { this.state = State.Text; } - this.baseState = this.state; this.sectionStart = this.index + 1; } else if (c === CharCodes.Slash) { this.state = State.InSelfClosingTag; @@ -470,7 +468,6 @@ export default class Tokenizer { if (c === CharCodes.Gt) { this.cbs.onselfclosingtag(this.index); this.state = State.Text; - this.baseState = State.Text; this.sectionStart = this.index + 1; this.isSpecial = false; // Reset special state, in case of self-closing special tags } else if (!isWhitespace(c)) { From 8e48cf7b3275d51a32cf17fbb425366651c875c6 Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sun, 2 Apr 2023 21:56:50 +0100 Subject: [PATCH 07/12] Add tokenizer tests --- src/Tokenizer.spec.ts | 27 ++++- src/__snapshots__/Tokenizer.spec.ts.snap | 140 +++++++++++++++++++++++ 2 files changed, 161 insertions(+), 6 deletions(-) diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts index dc8d92297..df07b97aa 100644 --- a/src/Tokenizer.spec.ts +++ b/src/Tokenizer.spec.ts @@ -56,12 +56,27 @@ describe("Tokenizer", () => { }); }); - it("should support XML entities", () => - expect( - tokenize("&>&<üabcde", { - xmlMode: true, - }) - ).toMatchSnapshot()); + describe("should handle entities", () => { + it("for XML entities", () => + expect( + tokenize("&>&<üabcde", { + xmlMode: true, + }) + ).toMatchSnapshot()); + + it("for entities in attributes (#276)", () => + expect( + tokenize( + '<img src="?&image_uri=1&ℑ=2&image=3"/>?&image_uri=1&ℑ=2&image=3' + ) + ).toMatchSnapshot()); + + it("for trailing legacy entity", () => + expect(tokenize("⨱×bar")).toMatchSnapshot()); + + it("for multi-byte entities", () => + expect(tokenize("≧̸")).toMatchSnapshot()); + }); it("should not lose data when pausing", () => { const log: unknown[][] = []; diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index 2f3fe8ea2..02954ceaa 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -1,5 +1,145 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP +exports[`Tokenizer should handle entities for XML entities 1`] = ` +[ + [ + "ontextentity", + 38, + ], + [ + "ontextentity", + 62, + ], + [ + "ontext", + 9, + 13, + ], + [ + "ontextentity", + 60, + ], + [ + "ontext", + 17, + 23, + ], + [ + "ontextentity", + 97, + ], + [ + "ontext", + 29, + 34, + ], + [ + "ontextentity", + 99, + ], + [ + "ontext", + 39, + 49, + ], + [ + "onend", + ], +] +`; + +exports[`Tokenizer should handle entities for entities in attributes (#276) 1`] = ` +[ + [ + "onopentagname", + 1, + 4, + ], + [ + "onattribname", + 5, + 8, + ], + [ + "onattribdata", + 10, + 24, + ], + [ + "onattribentity", + 8465, + ], + [ + "onattribdata", + 31, + 41, + ], + [ + "onattribend", + 3, + 41, + ], + [ + "onselfclosingtag", + 43, + ], + [ + "ontext", + 44, + 58, + ], + [ + "ontextentity", + 8465, + ], + [ + "ontext", + 65, + 75, + ], + [ + "onend", + ], +] +`; + +exports[`Tokenizer should handle entities for multi-byte entities 1`] = ` +[ + [ + "ontextentity", + 8807, + ], + [ + "ontextentity", + 824, + ], + [ + "onend", + ], +] +`; + +exports[`Tokenizer should handle entities for trailing legacy entity 1`] = ` +[ + [ + "ontextentity", + 10801, + ], + [ + "ontextentity", + 215, + ], + [ + "ontext", + 16, + 19, + ], + [ + "onend", + ], +] +`; + exports[`Tokenizer should not break after special tag followed by an entity for normal special tag 1`] = ` [ [ From 22364d7662e0f791d3427058d52ae12896790c76 Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Sun, 2 Apr 2023 22:25:14 +0100 Subject: [PATCH 08/12] Fix trailing data handling --- src/Tokenizer.ts | 14 ++++--- src/__snapshots__/Tokenizer.spec.ts.snap | 48 ------------------------ 2 files changed, 9 insertions(+), 53 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index d42616b2a..902e73c83 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -624,7 +624,7 @@ export default class Tokenizer { this.state = this.baseState; if (length === 0) { - this.index = this.entityStart + 1; + this.index = this.entityStart; } } } @@ -777,16 +777,20 @@ export default class Tokenizer { this.state = this.baseState; } - // If there is remaining data, emit it in a reasonable way - if (this.sectionStart < this.index) { - this.handleTrailingData(); - } + this.handleTrailingData(); + this.cbs.onend(); } /** Handle any trailing data. */ private handleTrailingData() { const endIndex = this.buffer.length + this.offset; + + // If there is no remaining data, we are done. + if (this.sectionStart >= endIndex) { + return; + } + if (this.state === State.InCommentLike) { if (this.currentSequence === Sequences.CdataEnd) { this.cbs.oncdata(this.sectionStart, endIndex, 0); diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index 02954ceaa..70b3ce473 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -227,54 +227,6 @@ exports[`Tokenizer should not lose data when pausing 1`] = ` ] `; -exports[`Tokenizer should support XML entities 1`] = ` -[ - [ - "ontextentity", - 38, - ], - [ - "ontextentity", - 62, - ], - [ - "ontext", - 9, - 13, - ], - [ - "ontextentity", - 60, - ], - [ - "ontext", - 17, - 23, - ], - [ - "ontextentity", - 97, - ], - [ - "ontext", - 29, - 34, - ], - [ - "ontextentity", - 99, - ], - [ - "ontext", - 39, - 49, - ], - [ - "onend", - ], -] -`; - exports[`Tokenizer should support self-closing special tags for self-closing script tag 1`] = ` [ [ From 7e969186d086ad5fa5d02d27fe0a1a52e9e9f9ef Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Wed, 5 Apr 2023 11:14:02 +0100 Subject: [PATCH 09/12] fix: Increase index to mark buffer as consumed --- src/Tokenizer.spec.ts | 3 ++- src/Tokenizer.ts | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts index df07b97aa..4c635272c 100644 --- a/src/Tokenizer.spec.ts +++ b/src/Tokenizer.spec.ts @@ -97,7 +97,8 @@ describe("Tokenizer", () => { ) as Callbacks ); - tokenizer.write("& it up!"); + tokenizer.write("&am"); + tokenizer.write("p; it up!"); tokenizer.resume(); tokenizer.resume(); diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 902e73c83..7a784c953 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -626,6 +626,9 @@ export default class Tokenizer { if (length === 0) { this.index = this.entityStart; } + } else { + // Mark buffer as consumed. + this.index = this.offset + this.buffer.length - 1; } } From e018a5c074743b96323fa3db930d6e7910db2738 Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Wed, 5 Apr 2023 11:24:25 +0100 Subject: [PATCH 10/12] Add `endIndex` to `ontextentity` And remove unused getter methods --- src/Parser.ts | 11 +++-------- src/Tokenizer.ts | 18 ++---------------- src/__snapshots__/Tokenizer.spec.ts.snap | 13 +++++++++++++ 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/src/Parser.ts b/src/Parser.ts index 710f44274..e0bb79abe 100644 --- a/src/Parser.ts +++ b/src/Parser.ts @@ -251,15 +251,10 @@ export class Parser implements Callbacks { } /** @internal */ - ontextentity(cp: number): void { - /* - * Entities can be emitted on the character, or directly after. - * We use the section start here to get accurate indices. - */ - const index = this.tokenizer.getSectionStart(); - this.endIndex = index - 1; + ontextentity(cp: number, endIndex: number): void { + this.endIndex = endIndex - 1; this.cbs.ontext?.(fromCodePoint(cp)); - this.startIndex = index; + this.startIndex = endIndex; } protected isVoidElement(name: string): boolean { diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 7a784c953..31218aeae 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -118,7 +118,7 @@ export interface Callbacks { onprocessinginstruction(start: number, endIndex: number): void; onselfclosingtag(endIndex: number): void; ontext(start: number, endIndex: number): void; - ontextentity(codepoint: number): void; + ontextentity(codepoint: number, endIndex: number): void; } /** @@ -207,20 +207,6 @@ export default class Tokenizer { } } - /** - * The current index within all of the written data. - */ - public getIndex(): number { - return this.index; - } - - /** - * The start of the current section. - */ - public getSectionStart(): number { - return this.sectionStart; - } - private stateText(c: number): void { if ( c === CharCodes.Lt || @@ -839,7 +825,7 @@ export default class Tokenizer { this.sectionStart = this.entityStart + consumed; this.index = this.sectionStart - 1; - this.cbs.ontextentity(cp); + this.cbs.ontextentity(cp, this.sectionStart); } } } diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap index 70b3ce473..36722cdc3 100644 --- a/src/__snapshots__/Tokenizer.spec.ts.snap +++ b/src/__snapshots__/Tokenizer.spec.ts.snap @@ -5,10 +5,12 @@ exports[`Tokenizer should handle entities for XML entities 1`] = ` [ "ontextentity", 38, + 5, ], [ "ontextentity", 62, + 9, ], [ "ontext", @@ -18,6 +20,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = ` [ "ontextentity", 60, + 17, ], [ "ontext", @@ -27,6 +30,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = ` [ "ontextentity", 97, + 29, ], [ "ontext", @@ -36,6 +40,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = ` [ "ontextentity", 99, + 39, ], [ "ontext", @@ -91,6 +96,7 @@ exports[`Tokenizer should handle entities for entities in attributes (#276) 1`] [ "ontextentity", 8465, + 65, ], [ "ontext", @@ -108,10 +114,12 @@ exports[`Tokenizer should handle entities for multi-byte entities 1`] = ` [ "ontextentity", 8807, + 21, ], [ "ontextentity", 824, + 21, ], [ "onend", @@ -124,10 +132,12 @@ exports[`Tokenizer should handle entities for trailing legacy entity 1`] = ` [ "ontextentity", 10801, + 10, ], [ "ontextentity", 215, + 16, ], [ "ontext", @@ -164,6 +174,7 @@ exports[`Tokenizer should not break after special tag followed by an entity for [ "ontextentity", 39, + 24, ], [ "onopentagname", @@ -194,6 +205,7 @@ exports[`Tokenizer should not break after special tag followed by an entity for [ "ontextentity", 39, + 15, ], [ "onopentagname", @@ -215,6 +227,7 @@ exports[`Tokenizer should not lose data when pausing 1`] = ` [ "ontextentity", 38, + 5, ], [ "ontext", From 202edccab236d0b112c859ac571ebcecf0a3c4cf Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Thu, 6 Apr 2023 11:55:43 +0100 Subject: [PATCH 11/12] Update enum name --- src/Tokenizer.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index 31218aeae..40332be1d 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -1,6 +1,6 @@ import { EntityDecoder, - EntityDecoderMode, + DecodingMode, htmlDecodeTree, xmlDecodeTree, } from "entities/lib/decode.js"; @@ -591,11 +591,11 @@ export default class Tokenizer { this.entityStart = this.index; this.entityDecoder.startEntity( this.xmlMode - ? EntityDecoderMode.Strict + ? DecodingMode.Strict : this.baseState === State.Text || this.baseState === State.InSpecialTag - ? EntityDecoderMode.Text - : EntityDecoderMode.Attribute + ? DecodingMode.Legacy + : DecodingMode.Attribute ); } From 07946daf85f300b908f195b6f843e68193a7cf8f Mon Sep 17 00:00:00 2001 From: Felix <188768+fb55@users.noreply.github.com> Date: Thu, 13 Apr 2023 19:10:52 +0100 Subject: [PATCH 12/12] Bump `entities` --- package-lock.json | 14 +++++++------- package.json | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/package-lock.json b/package-lock.json index af26f8034..6c53e67dc 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,7 +19,7 @@ "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", - "entities": "^4.4.0" + "entities": "^4.5.0" }, "devDependencies": { "@types/jest": "^29.5.0", @@ -2135,9 +2135,9 @@ "dev": true }, "node_modules/entities": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz", - "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==", + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", "engines": { "node": ">=0.12" }, @@ -6752,9 +6752,9 @@ "dev": true }, "entities": { - "version": "4.4.0", - "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz", - "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==" + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==" }, "error-ex": { "version": "1.3.2", diff --git a/package.json b/package.json index 2154dcd82..dce9f7d86 100644 --- a/package.json +++ b/package.json @@ -64,7 +64,7 @@ "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", - "entities": "^4.4.0" + "entities": "^4.5.0" }, "devDependencies": { "@types/jest": "^29.5.0",