From 0db371849f1ab9bd67d45328a9e393ee619ff3ef Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sat, 1 Apr 2023 15:15:09 +0100
Subject: [PATCH 01/12] refactor(tokenizer): Use `EntityDecoder`
---
src/Tokenizer.ts | 279 ++++++++---------------------------------------
1 file changed, 48 insertions(+), 231 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index cf20dc688..71538bc44 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -1,9 +1,8 @@
import {
+ EntityDecoder,
+ EntityDecoderMode,
htmlDecodeTree,
xmlDecodeTree,
- BinTrieFlags,
- determineBranch,
- replaceCodePoint,
} from "entities/lib/decode.js";
const enum CharCodes {
@@ -73,11 +72,7 @@ const enum State {
SpecialStartSequence,
InSpecialTag,
- BeforeEntity, // &
- BeforeNumericEntity, // #
- InNamedEntity,
- InNumericEntity,
- InHexEntity, // X
+ InEntity,
}
function isWhitespace(c: number): boolean {
@@ -94,10 +89,6 @@ function isEndOfTagSection(c: number): boolean {
return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c);
}
-function isNumber(c: number): boolean {
- return c >= CharCodes.Zero && c <= CharCodes.Nine;
-}
-
function isASCIIAlpha(c: number): boolean {
return (
(c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
@@ -105,13 +96,6 @@ function isASCIIAlpha(c: number): boolean {
);
}
-function isHexDigit(c: number): boolean {
- return (
- (c >= CharCodes.UpperA && c <= CharCodes.UpperF) ||
- (c >= CharCodes.LowerA && c <= CharCodes.LowerF)
- );
-}
-
export enum QuoteType {
NoValue = 0,
Unquoted = 1,
@@ -161,6 +145,8 @@ export default class Tokenizer {
private sectionStart = 0;
/** The index within the buffer that we are currently looking at. */
private index = 0;
+ /** The start of the last entity. */
+ private entityStart = 0;
/** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
private baseState = State.Text;
/** For special parsing behavior inside of script and style tags. */
@@ -172,7 +158,7 @@ export default class Tokenizer {
private readonly xmlMode: boolean;
private readonly decodeEntities: boolean;
- private readonly entityTrie: Uint16Array;
+ private readonly entityDecoder: EntityDecoder;
constructor(
{
@@ -183,7 +169,10 @@ export default class Tokenizer {
) {
this.xmlMode = xmlMode;
this.decodeEntities = decodeEntities;
- this.entityTrie = xmlMode ? xmlDecodeTree : htmlDecodeTree;
+ this.entityDecoder = new EntityDecoder(
+ xmlMode ? xmlDecodeTree : htmlDecodeTree,
+ (cp) => this.emitCodePoint(cp)
+ );
}
public reset(): void {
@@ -243,7 +232,7 @@ export default class Tokenizer {
this.state = State.BeforeTagName;
this.sectionStart = this.index;
} else if (this.decodeEntities && c === CharCodes.Amp) {
- this.state = State.BeforeEntity;
+ this.startEntity();
}
}
@@ -298,7 +287,7 @@ export default class Tokenizer {
if (this.currentSequence === Sequences.TitleEnd) {
// We have to parse entities in <title> tags.
if (this.decodeEntities && c === CharCodes.Amp) {
- this.state = State.BeforeEntity;
+ this.startEntity();
}
} else if (this.fastForwardTo(CharCodes.Lt)) {
// Outside of tags, we can fast-forward.
@@ -538,8 +527,7 @@ export default class Tokenizer {
);
this.state = State.BeforeAttributeName;
} else if (this.decodeEntities && c === CharCodes.Amp) {
- this.baseState = this.state;
- this.state = State.BeforeEntity;
+ this.startEntity();
}
}
private stateInAttributeValueDoubleQuotes(c: number): void {
@@ -556,8 +544,7 @@ export default class Tokenizer {
this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
} else if (this.decodeEntities && c === CharCodes.Amp) {
- this.baseState = this.state;
- this.state = State.BeforeEntity;
+ this.startEntity();
}
}
private stateBeforeDeclaration(c: number): void {
@@ -615,177 +602,35 @@ export default class Tokenizer {
}
}
- private trieIndex = 0;
- private trieCurrent = 0;
- /** For named entities, the index of the value. For numeric entities, the code point. */
- private entityResult = 0;
- private entityExcess = 0;
-
- private stateBeforeEntity(c: number): void {
- // Start excess with 1 to include the '&'
- this.entityExcess = 1;
- this.entityResult = 0;
-
- if (c === CharCodes.Number) {
- this.state = State.BeforeNumericEntity;
- } else if (c === CharCodes.Amp) {
- // We have two `&` characters in a row. Stay in the current state.
- } else {
- this.trieIndex = 0;
- this.trieCurrent = this.entityTrie[0];
- this.state = State.InNamedEntity;
- this.stateInNamedEntity(c);
- }
- }
-
- private stateInNamedEntity(c: number): void {
- this.entityExcess += 1;
-
- this.trieIndex = determineBranch(
- this.entityTrie,
- this.trieCurrent,
- this.trieIndex + 1,
- c
+ private startEntity() {
+ this.baseState = this.state;
+ this.state = State.InEntity;
+ this.entityStart = this.index;
+ this.entityDecoder.startEntity(
+ this.xmlMode
+ ? EntityDecoderMode.Strict
+ : this.baseState === State.Text ||
+ this.baseState === State.InSpecialTag
+ ? EntityDecoderMode.Text
+ : EntityDecoderMode.Attribute
);
-
- if (this.trieIndex < 0) {
- this.emitNamedEntity();
- this.index--;
- return;
- }
-
- this.trieCurrent = this.entityTrie[this.trieIndex];
-
- const masked = this.trieCurrent & BinTrieFlags.VALUE_LENGTH;
-
- // If the branch is a value, store it and continue
- if (masked) {
- // The mask is the number of bytes of the value, including the current byte.
- const valueLength = (masked >> 14) - 1;
-
- // If we have a legacy entity while parsing strictly, just skip the number of bytes
- if (!this.allowLegacyEntity() && c !== CharCodes.Semi) {
- this.trieIndex += valueLength;
- } else {
- // Add 1 as we have already incremented the excess
- const entityStart = this.index - this.entityExcess + 1;
-
- if (entityStart > this.sectionStart) {
- this.emitPartial(this.sectionStart, entityStart);
- }
-
- // If this is a surrogate pair, consume the next two bytes
- this.entityResult = this.trieIndex;
- this.trieIndex += valueLength;
- this.entityExcess = 0;
- this.sectionStart = this.index + 1;
-
- if (valueLength === 0) {
- this.emitNamedEntity();
- }
- }
- }
}
- private emitNamedEntity(): void {
- this.state = this.baseState;
-
- if (this.entityResult === 0) {
- return;
- }
-
- const valueLength =
- (this.entityTrie[this.entityResult] & BinTrieFlags.VALUE_LENGTH) >>
- 14;
+ private stateInEntity(): void {
+ const length = this.entityDecoder.write(this.buffer, this.index);
- switch (valueLength) {
- case 1: {
- this.emitCodePoint(
- this.entityTrie[this.entityResult] &
- ~BinTrieFlags.VALUE_LENGTH
- );
- break;
- }
- case 2: {
- this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
- break;
- }
- case 3: {
- this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
- this.emitCodePoint(this.entityTrie[this.entityResult + 2]);
- }
- }
- }
+ // If `length` is negative, we need to wait for more data.
+ if (length >= 0) {
+ this.index = this.entityStart + length;
+ this.state = this.baseState;
- private stateBeforeNumericEntity(c: number): void {
- if ((c | 0x20) === CharCodes.LowerX) {
- this.entityExcess++;
- this.state = State.InHexEntity;
- } else {
- this.state = State.InNumericEntity;
- this.stateInNumericEntity(c);
- }
- }
-
- private emitNumericEntity(strict: boolean) {
- const entityStart = this.index - this.entityExcess - 1;
- const numberStart =
- entityStart + 2 + Number(this.state === State.InHexEntity);
-
- if (numberStart !== this.index) {
- // Emit leading data if any
- if (entityStart > this.sectionStart) {
- this.emitPartial(this.sectionStart, entityStart);
- }
-
- this.sectionStart = this.index + Number(strict);
- this.emitCodePoint(replaceCodePoint(this.entityResult));
- }
- this.state = this.baseState;
- }
- private stateInNumericEntity(c: number): void {
- if (c === CharCodes.Semi) {
- this.emitNumericEntity(true);
- } else if (isNumber(c)) {
- this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero);
- this.entityExcess++;
- } else {
- if (this.allowLegacyEntity()) {
- this.emitNumericEntity(false);
- } else {
- this.state = this.baseState;
- }
- this.index--;
- }
- }
- private stateInHexEntity(c: number): void {
- if (c === CharCodes.Semi) {
- this.emitNumericEntity(true);
- } else if (isNumber(c)) {
- this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero);
- this.entityExcess++;
- } else if (isHexDigit(c)) {
- this.entityResult =
- this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10);
- this.entityExcess++;
- } else {
- if (this.allowLegacyEntity()) {
- this.emitNumericEntity(false);
- } else {
- this.state = this.baseState;
+ // If we encountered an entity, we already emitted the current section.
+ if (length > 0) {
+ this.sectionStart = this.index;
}
- this.index--;
}
}
- private allowLegacyEntity() {
- return (
- !this.xmlMode &&
- (this.baseState === State.Text ||
- this.baseState === State.InSpecialTag)
- );
- }
-
/**
* Remove data that has already been consumed from the buffer.
*/
@@ -918,26 +763,10 @@ export default class Tokenizer {
this.stateInProcessingInstruction(c);
break;
}
- case State.InNamedEntity: {
- this.stateInNamedEntity(c);
- break;
- }
- case State.BeforeEntity: {
- this.stateBeforeEntity(c);
+ case State.InEntity: {
+ this.stateInEntity();
break;
}
- case State.InHexEntity: {
- this.stateInHexEntity(c);
- break;
- }
- case State.InNumericEntity: {
- this.stateInNumericEntity(c);
- break;
- }
- default: {
- // `this._state === State.BeforeNumericEntity`
- this.stateBeforeNumericEntity(c);
- }
}
this.index++;
}
@@ -945,8 +774,8 @@ export default class Tokenizer {
}
private finish() {
- if (this.state === State.InNamedEntity) {
- this.emitNamedEntity();
+ if (this.state === State.InEntity) {
+ this.entityDecoder.end();
}
// If there is remaining data, emit it in a reasonable way
@@ -965,18 +794,6 @@ export default class Tokenizer {
} else {
this.cbs.oncomment(this.sectionStart, endIndex, 0);
}
- } else if (
- this.state === State.InNumericEntity &&
- this.allowLegacyEntity()
- ) {
- this.emitNumericEntity(false);
- // All trailing data will have been consumed
- } else if (
- this.state === State.InHexEntity &&
- this.allowLegacyEntity()
- ) {
- this.emitNumericEntity(false);
- // All trailing data will have been consumed
} else if (
this.state === State.InTagName ||
this.state === State.BeforeAttributeName ||
@@ -997,23 +814,23 @@ export default class Tokenizer {
}
}
- private emitPartial(start: number, endIndex: number): void {
- if (
- this.baseState !== State.Text &&
- this.baseState !== State.InSpecialTag
- ) {
- this.cbs.onattribdata(start, endIndex);
- } else {
- this.cbs.ontext(start, endIndex);
- }
- }
private emitCodePoint(cp: number): void {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
) {
+ if (this.sectionStart < this.entityStart) {
+ this.cbs.onattribdata(this.sectionStart, this.entityStart);
+ this.sectionStart = this.entityStart;
+ }
+
this.cbs.onattribentity(cp);
} else {
+ if (this.sectionStart < this.entityStart) {
+ this.cbs.ontext(this.sectionStart, this.entityStart);
+ this.sectionStart = this.entityStart;
+ }
+
this.cbs.ontextentity(cp);
}
}
From 80647b52992286f34bcc79490ab9dfb512138006 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 09:58:01 +0100
Subject: [PATCH 02/12] Fix up indices
---
src/Tokenizer.ts | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 71538bc44..6276d9a79 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -617,7 +617,10 @@ export default class Tokenizer {
}
private stateInEntity(): void {
- const length = this.entityDecoder.write(this.buffer, this.index);
+ const length = this.entityDecoder.write(
+ this.buffer,
+ this.index - this.offset
+ );
// If `length` is negative, we need to wait for more data.
if (length >= 0) {
@@ -775,7 +778,8 @@ export default class Tokenizer {
private finish() {
if (this.state === State.InEntity) {
- this.entityDecoder.end();
+ this.index += this.entityDecoder.end();
+ this.state = this.baseState;
}
// If there is remaining data, emit it in a reasonable way
From e3843f225d418bf4fc9b1e5b40174dff83d3690c Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 11:56:27 +0100
Subject: [PATCH 03/12] Add consumed to decoder callback
---
src/Tokenizer.ts | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 6276d9a79..1e9ff0a5e 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -171,7 +171,7 @@ export default class Tokenizer {
this.decodeEntities = decodeEntities;
this.entityDecoder = new EntityDecoder(
xmlMode ? xmlDecodeTree : htmlDecodeTree,
- (cp) => this.emitCodePoint(cp)
+ (cp, consumed) => this.emitCodePoint(cp, consumed)
);
}
@@ -622,15 +622,9 @@ export default class Tokenizer {
this.index - this.offset
);
- // If `length` is negative, we need to wait for more data.
+ // If `length` is positive, we are done with the entity.
if (length >= 0) {
- this.index = this.entityStart + length;
this.state = this.baseState;
-
- // If we encountered an entity, we already emitted the current section.
- if (length > 0) {
- this.sectionStart = this.index;
- }
}
}
@@ -778,7 +772,7 @@ export default class Tokenizer {
private finish() {
if (this.state === State.InEntity) {
- this.index += this.entityDecoder.end();
+ this.entityDecoder.end();
this.state = this.baseState;
}
@@ -818,22 +812,24 @@ export default class Tokenizer {
}
}
- private emitCodePoint(cp: number): void {
+ private emitCodePoint(cp: number, consumed: number): void {
if (
this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag
) {
if (this.sectionStart < this.entityStart) {
this.cbs.onattribdata(this.sectionStart, this.entityStart);
- this.sectionStart = this.entityStart;
}
+ this.sectionStart = this.entityStart + consumed;
+ this.index = this.sectionStart - 1;
this.cbs.onattribentity(cp);
} else {
if (this.sectionStart < this.entityStart) {
this.cbs.ontext(this.sectionStart, this.entityStart);
- this.sectionStart = this.entityStart;
}
+ this.sectionStart = this.entityStart + consumed;
+ this.index = this.sectionStart - 1;
this.cbs.ontextentity(cp);
}
From 0b2e8b13fd44760a71d234a0721b243b81963e0c Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 11:56:42 +0100
Subject: [PATCH 04/12] Add xml mode entity tokenizer test
---
src/Tokenizer.spec.ts | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts
index 438e23bae..dc8d92297 100644
--- a/src/Tokenizer.spec.ts
+++ b/src/Tokenizer.spec.ts
@@ -1,10 +1,10 @@
import { Tokenizer } from "./index.js";
import type { Callbacks } from "./Tokenizer.js";
-function tokenize(data: string) {
+function tokenize(data: string, options = {}) {
const log: unknown[][] = [];
const tokenizer = new Tokenizer(
- {},
+ options,
new Proxy(
{},
{
@@ -56,6 +56,13 @@ describe("Tokenizer", () => {
});
});
+ it("should support XML entities", () =>
+ expect(
+ tokenize("&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#x65", {
+ xmlMode: true,
+ })
+ ).toMatchSnapshot());
+
it("should not lose data when pausing", () => {
const log: unknown[][] = [];
const tokenizer = new Tokenizer(
From 85d3056afda3436a77da55006030a898b221c818 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:44:10 +0100
Subject: [PATCH 05/12] Fix non-entity
---
src/Tokenizer.ts | 4 ++
src/__snapshots__/Tokenizer.spec.ts.snap | 48 ++++++++++++++++++++++++
2 files changed, 52 insertions(+)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 1e9ff0a5e..feebe346d 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -625,6 +625,10 @@ export default class Tokenizer {
// If `length` is positive, we are done with the entity.
if (length >= 0) {
this.state = this.baseState;
+
+ if (length === 0) {
+ this.index = this.entityStart + 1;
+ }
}
}
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index ec3de47da..2f3fe8ea2 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -87,6 +87,54 @@ exports[`Tokenizer should not lose data when pausing 1`] = `
]
`;
+exports[`Tokenizer should support XML entities 1`] = `
+[
+ [
+ "ontextentity",
+ 38,
+ ],
+ [
+ "ontextentity",
+ 62,
+ ],
+ [
+ "ontext",
+ 9,
+ 13,
+ ],
+ [
+ "ontextentity",
+ 60,
+ ],
+ [
+ "ontext",
+ 17,
+ 23,
+ ],
+ [
+ "ontextentity",
+ 97,
+ ],
+ [
+ "ontext",
+ 29,
+ 34,
+ ],
+ [
+ "ontextentity",
+ 99,
+ ],
+ [
+ "ontext",
+ 39,
+ 49,
+ ],
+ [
+ "onend",
+ ],
+]
+`;
+
exports[`Tokenizer should support self-closing special tags for self-closing script tag 1`] = `
[
[
From d2813b873ec138076f83b24bc9d79ff3a0a029c9 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:54:33 +0100
Subject: [PATCH 06/12] Remove most base state writes
---
src/Tokenizer.ts | 3 ---
1 file changed, 3 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index feebe346d..d42616b2a 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -444,7 +444,6 @@ export default class Tokenizer {
// Skip everything until ">"
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.state = State.Text;
- this.baseState = State.Text;
this.sectionStart = this.index + 1;
}
}
@@ -457,7 +456,6 @@ export default class Tokenizer {
} else {
this.state = State.Text;
}
- this.baseState = this.state;
this.sectionStart = this.index + 1;
} else if (c === CharCodes.Slash) {
this.state = State.InSelfClosingTag;
@@ -470,7 +468,6 @@ export default class Tokenizer {
if (c === CharCodes.Gt) {
this.cbs.onselfclosingtag(this.index);
this.state = State.Text;
- this.baseState = State.Text;
this.sectionStart = this.index + 1;
this.isSpecial = false; // Reset special state, in case of self-closing special tags
} else if (!isWhitespace(c)) {
From 8e48cf7b3275d51a32cf17fbb425366651c875c6 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:56:50 +0100
Subject: [PATCH 07/12] Add tokenizer tests
---
src/Tokenizer.spec.ts | 27 ++++-
src/__snapshots__/Tokenizer.spec.ts.snap | 140 +++++++++++++++++++++++
2 files changed, 161 insertions(+), 6 deletions(-)
diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts
index dc8d92297..df07b97aa 100644
--- a/src/Tokenizer.spec.ts
+++ b/src/Tokenizer.spec.ts
@@ -56,12 +56,27 @@ describe("Tokenizer", () => {
});
});
- it("should support XML entities", () =>
- expect(
- tokenize("&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#x65", {
- xmlMode: true,
- })
- ).toMatchSnapshot());
+ describe("should handle entities", () => {
+ it("for XML entities", () =>
+ expect(
+ tokenize("&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#x65", {
+ xmlMode: true,
+ })
+ ).toMatchSnapshot());
+
+ it("for entities in attributes (#276)", () =>
+ expect(
+ tokenize(
+ '<img src="?&image_uri=1&&image;=2&image=3"/>?&image_uri=1&&image;=2&image=3'
+ )
+ ).toMatchSnapshot());
+
+ it("for trailing legacy entity", () =>
+ expect(tokenize("&timesbar;&timesbar")).toMatchSnapshot());
+
+ it("for multi-byte entities", () =>
+ expect(tokenize("&NotGreaterFullEqual;")).toMatchSnapshot());
+ });
it("should not lose data when pausing", () => {
const log: unknown[][] = [];
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 2f3fe8ea2..02954ceaa 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -1,5 +1,145 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP
+exports[`Tokenizer should handle entities for XML entities 1`] = `
+[
+ [
+ "ontextentity",
+ 38,
+ ],
+ [
+ "ontextentity",
+ 62,
+ ],
+ [
+ "ontext",
+ 9,
+ 13,
+ ],
+ [
+ "ontextentity",
+ 60,
+ ],
+ [
+ "ontext",
+ 17,
+ 23,
+ ],
+ [
+ "ontextentity",
+ 97,
+ ],
+ [
+ "ontext",
+ 29,
+ 34,
+ ],
+ [
+ "ontextentity",
+ 99,
+ ],
+ [
+ "ontext",
+ 39,
+ 49,
+ ],
+ [
+ "onend",
+ ],
+]
+`;
+
+exports[`Tokenizer should handle entities for entities in attributes (#276) 1`] = `
+[
+ [
+ "onopentagname",
+ 1,
+ 4,
+ ],
+ [
+ "onattribname",
+ 5,
+ 8,
+ ],
+ [
+ "onattribdata",
+ 10,
+ 24,
+ ],
+ [
+ "onattribentity",
+ 8465,
+ ],
+ [
+ "onattribdata",
+ 31,
+ 41,
+ ],
+ [
+ "onattribend",
+ 3,
+ 41,
+ ],
+ [
+ "onselfclosingtag",
+ 43,
+ ],
+ [
+ "ontext",
+ 44,
+ 58,
+ ],
+ [
+ "ontextentity",
+ 8465,
+ ],
+ [
+ "ontext",
+ 65,
+ 75,
+ ],
+ [
+ "onend",
+ ],
+]
+`;
+
+exports[`Tokenizer should handle entities for multi-byte entities 1`] = `
+[
+ [
+ "ontextentity",
+ 8807,
+ ],
+ [
+ "ontextentity",
+ 824,
+ ],
+ [
+ "onend",
+ ],
+]
+`;
+
+exports[`Tokenizer should handle entities for trailing legacy entity 1`] = `
+[
+ [
+ "ontextentity",
+ 10801,
+ ],
+ [
+ "ontextentity",
+ 215,
+ ],
+ [
+ "ontext",
+ 16,
+ 19,
+ ],
+ [
+ "onend",
+ ],
+]
+`;
+
exports[`Tokenizer should not break after special tag followed by an entity for normal special tag 1`] = `
[
[
From 22364d7662e0f791d3427058d52ae12896790c76 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 22:25:14 +0100
Subject: [PATCH 08/12] Fix trailing data handling
---
src/Tokenizer.ts | 14 ++++---
src/__snapshots__/Tokenizer.spec.ts.snap | 48 ------------------------
2 files changed, 9 insertions(+), 53 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index d42616b2a..902e73c83 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -624,7 +624,7 @@ export default class Tokenizer {
this.state = this.baseState;
if (length === 0) {
- this.index = this.entityStart + 1;
+ this.index = this.entityStart;
}
}
}
@@ -777,16 +777,20 @@ export default class Tokenizer {
this.state = this.baseState;
}
- // If there is remaining data, emit it in a reasonable way
- if (this.sectionStart < this.index) {
- this.handleTrailingData();
- }
+ this.handleTrailingData();
+
this.cbs.onend();
}
/** Handle any trailing data. */
private handleTrailingData() {
const endIndex = this.buffer.length + this.offset;
+
+ // If there is no remaining data, we are done.
+ if (this.sectionStart >= endIndex) {
+ return;
+ }
+
if (this.state === State.InCommentLike) {
if (this.currentSequence === Sequences.CdataEnd) {
this.cbs.oncdata(this.sectionStart, endIndex, 0);
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 02954ceaa..70b3ce473 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -227,54 +227,6 @@ exports[`Tokenizer should not lose data when pausing 1`] = `
]
`;
-exports[`Tokenizer should support XML entities 1`] = `
-[
- [
- "ontextentity",
- 38,
- ],
- [
- "ontextentity",
- 62,
- ],
- [
- "ontext",
- 9,
- 13,
- ],
- [
- "ontextentity",
- 60,
- ],
- [
- "ontext",
- 17,
- 23,
- ],
- [
- "ontextentity",
- 97,
- ],
- [
- "ontext",
- 29,
- 34,
- ],
- [
- "ontextentity",
- 99,
- ],
- [
- "ontext",
- 39,
- 49,
- ],
- [
- "onend",
- ],
-]
-`;
-
exports[`Tokenizer should support self-closing special tags for self-closing script tag 1`] = `
[
[
From 7e969186d086ad5fa5d02d27fe0a1a52e9e9f9ef Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Wed, 5 Apr 2023 11:14:02 +0100
Subject: [PATCH 09/12] fix: Increase index to mark buffer as consumed
---
src/Tokenizer.spec.ts | 3 ++-
src/Tokenizer.ts | 3 +++
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts
index df07b97aa..4c635272c 100644
--- a/src/Tokenizer.spec.ts
+++ b/src/Tokenizer.spec.ts
@@ -97,7 +97,8 @@ describe("Tokenizer", () => {
) as Callbacks
);
- tokenizer.write("&amp; it up!");
+ tokenizer.write("&am");
+ tokenizer.write("p; it up!");
tokenizer.resume();
tokenizer.resume();
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 902e73c83..7a784c953 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -626,6 +626,9 @@ export default class Tokenizer {
if (length === 0) {
this.index = this.entityStart;
}
+ } else {
+ // Mark buffer as consumed.
+ this.index = this.offset + this.buffer.length - 1;
}
}
From e018a5c074743b96323fa3db930d6e7910db2738 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Wed, 5 Apr 2023 11:24:25 +0100
Subject: [PATCH 10/12] Add `endIndex` to `ontextentity`
And remove unused getter methods
---
src/Parser.ts | 11 +++--------
src/Tokenizer.ts | 18 ++----------------
src/__snapshots__/Tokenizer.spec.ts.snap | 13 +++++++++++++
3 files changed, 18 insertions(+), 24 deletions(-)
diff --git a/src/Parser.ts b/src/Parser.ts
index 710f44274..e0bb79abe 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -251,15 +251,10 @@ export class Parser implements Callbacks {
}
/** @internal */
- ontextentity(cp: number): void {
- /*
- * Entities can be emitted on the character, or directly after.
- * We use the section start here to get accurate indices.
- */
- const index = this.tokenizer.getSectionStart();
- this.endIndex = index - 1;
+ ontextentity(cp: number, endIndex: number): void {
+ this.endIndex = endIndex - 1;
this.cbs.ontext?.(fromCodePoint(cp));
- this.startIndex = index;
+ this.startIndex = endIndex;
}
protected isVoidElement(name: string): boolean {
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 7a784c953..31218aeae 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -118,7 +118,7 @@ export interface Callbacks {
onprocessinginstruction(start: number, endIndex: number): void;
onselfclosingtag(endIndex: number): void;
ontext(start: number, endIndex: number): void;
- ontextentity(codepoint: number): void;
+ ontextentity(codepoint: number, endIndex: number): void;
}
/**
@@ -207,20 +207,6 @@ export default class Tokenizer {
}
}
- /**
- * The current index within all of the written data.
- */
- public getIndex(): number {
- return this.index;
- }
-
- /**
- * The start of the current section.
- */
- public getSectionStart(): number {
- return this.sectionStart;
- }
-
private stateText(c: number): void {
if (
c === CharCodes.Lt ||
@@ -839,7 +825,7 @@ export default class Tokenizer {
this.sectionStart = this.entityStart + consumed;
this.index = this.sectionStart - 1;
- this.cbs.ontextentity(cp);
+ this.cbs.ontextentity(cp, this.sectionStart);
}
}
}
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 70b3ce473..36722cdc3 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -5,10 +5,12 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
[
"ontextentity",
38,
+ 5,
],
[
"ontextentity",
62,
+ 9,
],
[
"ontext",
@@ -18,6 +20,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
[
"ontextentity",
60,
+ 17,
],
[
"ontext",
@@ -27,6 +30,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
[
"ontextentity",
97,
+ 29,
],
[
"ontext",
@@ -36,6 +40,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
[
"ontextentity",
99,
+ 39,
],
[
"ontext",
@@ -91,6 +96,7 @@ exports[`Tokenizer should handle entities for entities in attributes (#276) 1`]
[
"ontextentity",
8465,
+ 65,
],
[
"ontext",
@@ -108,10 +114,12 @@ exports[`Tokenizer should handle entities for multi-byte entities 1`] = `
[
"ontextentity",
8807,
+ 21,
],
[
"ontextentity",
824,
+ 21,
],
[
"onend",
@@ -124,10 +132,12 @@ exports[`Tokenizer should handle entities for trailing legacy entity 1`] = `
[
"ontextentity",
10801,
+ 10,
],
[
"ontextentity",
215,
+ 16,
],
[
"ontext",
@@ -164,6 +174,7 @@ exports[`Tokenizer should not break after special tag followed by an entity for
[
"ontextentity",
39,
+ 24,
],
[
"onopentagname",
@@ -194,6 +205,7 @@ exports[`Tokenizer should not break after special tag followed by an entity for
[
"ontextentity",
39,
+ 15,
],
[
"onopentagname",
@@ -215,6 +227,7 @@ exports[`Tokenizer should not lose data when pausing 1`] = `
[
"ontextentity",
38,
+ 5,
],
[
"ontext",
From 202edccab236d0b112c859ac571ebcecf0a3c4cf Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Thu, 6 Apr 2023 11:55:43 +0100
Subject: [PATCH 11/12] Update enum name
---
src/Tokenizer.ts | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 31218aeae..40332be1d 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -1,6 +1,6 @@
import {
EntityDecoder,
- EntityDecoderMode,
+ DecodingMode,
htmlDecodeTree,
xmlDecodeTree,
} from "entities/lib/decode.js";
@@ -591,11 +591,11 @@ export default class Tokenizer {
this.entityStart = this.index;
this.entityDecoder.startEntity(
this.xmlMode
- ? EntityDecoderMode.Strict
+ ? DecodingMode.Strict
: this.baseState === State.Text ||
this.baseState === State.InSpecialTag
- ? EntityDecoderMode.Text
- : EntityDecoderMode.Attribute
+ ? DecodingMode.Legacy
+ : DecodingMode.Attribute
);
}
From 07946daf85f300b908f195b6f843e68193a7cf8f Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Thu, 13 Apr 2023 19:10:52 +0100
Subject: [PATCH 12/12] Bump `entities`
---
package-lock.json | 14 +++++++-------
package.json | 2 +-
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/package-lock.json b/package-lock.json
index af26f8034..6c53e67dc 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,7 +19,7 @@
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
- "entities": "^4.4.0"
+ "entities": "^4.5.0"
},
"devDependencies": {
"@types/jest": "^29.5.0",
@@ -2135,9 +2135,9 @@
"dev": true
},
"node_modules/entities": {
- "version": "4.4.0",
- "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz",
- "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==",
+ "version": "4.5.0",
+ "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
+ "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
"engines": {
"node": ">=0.12"
},
@@ -6752,9 +6752,9 @@
"dev": true
},
"entities": {
- "version": "4.4.0",
- "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz",
- "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA=="
+ "version": "4.5.0",
+ "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
+ "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="
},
"error-ex": {
"version": "1.3.2",
diff --git a/package.json b/package.json
index 2154dcd82..dce9f7d86 100644
--- a/package.json
+++ b/package.json
@@ -64,7 +64,7 @@
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
- "entities": "^4.4.0"
+ "entities": "^4.5.0"
},
"devDependencies": {
"@types/jest": "^29.5.0",