From 0db371849f1ab9bd67d45328a9e393ee619ff3ef Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sat, 1 Apr 2023 15:15:09 +0100
Subject: [PATCH 01/12] refactor(tokenizer): Use `EntityDecoder`

---
 src/Tokenizer.ts | 279 ++++++++---------------------------------------
 1 file changed, 48 insertions(+), 231 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index cf20dc688..71538bc44 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -1,9 +1,8 @@
 import {
+    EntityDecoder,
+    EntityDecoderMode,
     htmlDecodeTree,
     xmlDecodeTree,
-    BinTrieFlags,
-    determineBranch,
-    replaceCodePoint,
 } from "entities/lib/decode.js";
 
 const enum CharCodes {
@@ -73,11 +72,7 @@ const enum State {
     SpecialStartSequence,
     InSpecialTag,
 
-    BeforeEntity, // &
-    BeforeNumericEntity, // #
-    InNamedEntity,
-    InNumericEntity,
-    InHexEntity, // X
+    InEntity,
 }
 
 function isWhitespace(c: number): boolean {
@@ -94,10 +89,6 @@ function isEndOfTagSection(c: number): boolean {
     return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c);
 }
 
-function isNumber(c: number): boolean {
-    return c >= CharCodes.Zero && c <= CharCodes.Nine;
-}
-
 function isASCIIAlpha(c: number): boolean {
     return (
         (c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
@@ -105,13 +96,6 @@ function isASCIIAlpha(c: number): boolean {
     );
 }
 
-function isHexDigit(c: number): boolean {
-    return (
-        (c >= CharCodes.UpperA && c <= CharCodes.UpperF) ||
-        (c >= CharCodes.LowerA && c <= CharCodes.LowerF)
-    );
-}
-
 export enum QuoteType {
     NoValue = 0,
     Unquoted = 1,
@@ -161,6 +145,8 @@ export default class Tokenizer {
     private sectionStart = 0;
     /** The index within the buffer that we are currently looking at. */
     private index = 0;
+    /** The start of the last entity. */
+    private entityStart = 0;
     /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
     private baseState = State.Text;
     /** For special parsing behavior inside of script and style tags. */
@@ -172,7 +158,7 @@ export default class Tokenizer {
 
     private readonly xmlMode: boolean;
     private readonly decodeEntities: boolean;
-    private readonly entityTrie: Uint16Array;
+    private readonly entityDecoder: EntityDecoder;
 
     constructor(
         {
@@ -183,7 +169,10 @@ export default class Tokenizer {
     ) {
         this.xmlMode = xmlMode;
         this.decodeEntities = decodeEntities;
-        this.entityTrie = xmlMode ? xmlDecodeTree : htmlDecodeTree;
+        this.entityDecoder = new EntityDecoder(
+            xmlMode ? xmlDecodeTree : htmlDecodeTree,
+            (cp) => this.emitCodePoint(cp)
+        );
     }
 
     public reset(): void {
@@ -243,7 +232,7 @@ export default class Tokenizer {
             this.state = State.BeforeTagName;
             this.sectionStart = this.index;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this.state = State.BeforeEntity;
+            this.startEntity();
         }
     }
 
@@ -298,7 +287,7 @@ export default class Tokenizer {
             if (this.currentSequence === Sequences.TitleEnd) {
                 // We have to parse entities in <title> tags.
                 if (this.decodeEntities && c === CharCodes.Amp) {
-                    this.state = State.BeforeEntity;
+                    this.startEntity();
                 }
             } else if (this.fastForwardTo(CharCodes.Lt)) {
                 // Outside of <title> tags, we can fast-forward.
@@ -538,8 +527,7 @@ export default class Tokenizer {
             );
             this.state = State.BeforeAttributeName;
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this.baseState = this.state;
-            this.state = State.BeforeEntity;
+            this.startEntity();
         }
     }
     private stateInAttributeValueDoubleQuotes(c: number): void {
@@ -556,8 +544,7 @@ export default class Tokenizer {
             this.state = State.BeforeAttributeName;
             this.stateBeforeAttributeName(c);
         } else if (this.decodeEntities && c === CharCodes.Amp) {
-            this.baseState = this.state;
-            this.state = State.BeforeEntity;
+            this.startEntity();
         }
     }
     private stateBeforeDeclaration(c: number): void {
@@ -615,177 +602,35 @@ export default class Tokenizer {
         }
     }
 
-    private trieIndex = 0;
-    private trieCurrent = 0;
-    /** For named entities, the index of the value. For numeric entities, the code point. */
-    private entityResult = 0;
-    private entityExcess = 0;
-
-    private stateBeforeEntity(c: number): void {
-        // Start excess with 1 to include the '&'
-        this.entityExcess = 1;
-        this.entityResult = 0;
-
-        if (c === CharCodes.Number) {
-            this.state = State.BeforeNumericEntity;
-        } else if (c === CharCodes.Amp) {
-            // We have two `&` characters in a row. Stay in the current state.
-        } else {
-            this.trieIndex = 0;
-            this.trieCurrent = this.entityTrie[0];
-            this.state = State.InNamedEntity;
-            this.stateInNamedEntity(c);
-        }
-    }
-
-    private stateInNamedEntity(c: number): void {
-        this.entityExcess += 1;
-
-        this.trieIndex = determineBranch(
-            this.entityTrie,
-            this.trieCurrent,
-            this.trieIndex + 1,
-            c
+    private startEntity() {
+        this.baseState = this.state;
+        this.state = State.InEntity;
+        this.entityStart = this.index;
+        this.entityDecoder.startEntity(
+            this.xmlMode
+                ? EntityDecoderMode.Strict
+                : this.baseState === State.Text ||
+                  this.baseState === State.InSpecialTag
+                ? EntityDecoderMode.Text
+                : EntityDecoderMode.Attribute
         );
-
-        if (this.trieIndex < 0) {
-            this.emitNamedEntity();
-            this.index--;
-            return;
-        }
-
-        this.trieCurrent = this.entityTrie[this.trieIndex];
-
-        const masked = this.trieCurrent & BinTrieFlags.VALUE_LENGTH;
-
-        // If the branch is a value, store it and continue
-        if (masked) {
-            // The mask is the number of bytes of the value, including the current byte.
-            const valueLength = (masked >> 14) - 1;
-
-            // If we have a legacy entity while parsing strictly, just skip the number of bytes
-            if (!this.allowLegacyEntity() && c !== CharCodes.Semi) {
-                this.trieIndex += valueLength;
-            } else {
-                // Add 1 as we have already incremented the excess
-                const entityStart = this.index - this.entityExcess + 1;
-
-                if (entityStart > this.sectionStart) {
-                    this.emitPartial(this.sectionStart, entityStart);
-                }
-
-                // If this is a surrogate pair, consume the next two bytes
-                this.entityResult = this.trieIndex;
-                this.trieIndex += valueLength;
-                this.entityExcess = 0;
-                this.sectionStart = this.index + 1;
-
-                if (valueLength === 0) {
-                    this.emitNamedEntity();
-                }
-            }
-        }
     }
 
-    private emitNamedEntity(): void {
-        this.state = this.baseState;
-
-        if (this.entityResult === 0) {
-            return;
-        }
-
-        const valueLength =
-            (this.entityTrie[this.entityResult] & BinTrieFlags.VALUE_LENGTH) >>
-            14;
+    private stateInEntity(): void {
+        const length = this.entityDecoder.write(this.buffer, this.index);
 
-        switch (valueLength) {
-            case 1: {
-                this.emitCodePoint(
-                    this.entityTrie[this.entityResult] &
-                        ~BinTrieFlags.VALUE_LENGTH
-                );
-                break;
-            }
-            case 2: {
-                this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
-                break;
-            }
-            case 3: {
-                this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
-                this.emitCodePoint(this.entityTrie[this.entityResult + 2]);
-            }
-        }
-    }
+        // If `length` is negative, we need to wait for more data.
+        if (length >= 0) {
+            this.index = this.entityStart + length;
+            this.state = this.baseState;
 
-    private stateBeforeNumericEntity(c: number): void {
-        if ((c | 0x20) === CharCodes.LowerX) {
-            this.entityExcess++;
-            this.state = State.InHexEntity;
-        } else {
-            this.state = State.InNumericEntity;
-            this.stateInNumericEntity(c);
-        }
-    }
-
-    private emitNumericEntity(strict: boolean) {
-        const entityStart = this.index - this.entityExcess - 1;
-        const numberStart =
-            entityStart + 2 + Number(this.state === State.InHexEntity);
-
-        if (numberStart !== this.index) {
-            // Emit leading data if any
-            if (entityStart > this.sectionStart) {
-                this.emitPartial(this.sectionStart, entityStart);
-            }
-
-            this.sectionStart = this.index + Number(strict);
-            this.emitCodePoint(replaceCodePoint(this.entityResult));
-        }
-        this.state = this.baseState;
-    }
-    private stateInNumericEntity(c: number): void {
-        if (c === CharCodes.Semi) {
-            this.emitNumericEntity(true);
-        } else if (isNumber(c)) {
-            this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero);
-            this.entityExcess++;
-        } else {
-            if (this.allowLegacyEntity()) {
-                this.emitNumericEntity(false);
-            } else {
-                this.state = this.baseState;
-            }
-            this.index--;
-        }
-    }
-    private stateInHexEntity(c: number): void {
-        if (c === CharCodes.Semi) {
-            this.emitNumericEntity(true);
-        } else if (isNumber(c)) {
-            this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero);
-            this.entityExcess++;
-        } else if (isHexDigit(c)) {
-            this.entityResult =
-                this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10);
-            this.entityExcess++;
-        } else {
-            if (this.allowLegacyEntity()) {
-                this.emitNumericEntity(false);
-            } else {
-                this.state = this.baseState;
+            // If we encountered an entity, we already emitted the current section.
+            if (length > 0) {
+                this.sectionStart = this.index;
             }
-            this.index--;
         }
     }
 
-    private allowLegacyEntity() {
-        return (
-            !this.xmlMode &&
-            (this.baseState === State.Text ||
-                this.baseState === State.InSpecialTag)
-        );
-    }
-
     /**
      * Remove data that has already been consumed from the buffer.
      */
@@ -918,26 +763,10 @@ export default class Tokenizer {
                     this.stateInProcessingInstruction(c);
                     break;
                 }
-                case State.InNamedEntity: {
-                    this.stateInNamedEntity(c);
-                    break;
-                }
-                case State.BeforeEntity: {
-                    this.stateBeforeEntity(c);
+                case State.InEntity: {
+                    this.stateInEntity();
                     break;
                 }
-                case State.InHexEntity: {
-                    this.stateInHexEntity(c);
-                    break;
-                }
-                case State.InNumericEntity: {
-                    this.stateInNumericEntity(c);
-                    break;
-                }
-                default: {
-                    // `this._state === State.BeforeNumericEntity`
-                    this.stateBeforeNumericEntity(c);
-                }
             }
             this.index++;
         }
@@ -945,8 +774,8 @@ export default class Tokenizer {
     }
 
     private finish() {
-        if (this.state === State.InNamedEntity) {
-            this.emitNamedEntity();
+        if (this.state === State.InEntity) {
+            this.entityDecoder.end();
         }
 
         // If there is remaining data, emit it in a reasonable way
@@ -965,18 +794,6 @@ export default class Tokenizer {
             } else {
                 this.cbs.oncomment(this.sectionStart, endIndex, 0);
             }
-        } else if (
-            this.state === State.InNumericEntity &&
-            this.allowLegacyEntity()
-        ) {
-            this.emitNumericEntity(false);
-            // All trailing data will have been consumed
-        } else if (
-            this.state === State.InHexEntity &&
-            this.allowLegacyEntity()
-        ) {
-            this.emitNumericEntity(false);
-            // All trailing data will have been consumed
         } else if (
             this.state === State.InTagName ||
             this.state === State.BeforeAttributeName ||
@@ -997,23 +814,23 @@ export default class Tokenizer {
         }
     }
 
-    private emitPartial(start: number, endIndex: number): void {
-        if (
-            this.baseState !== State.Text &&
-            this.baseState !== State.InSpecialTag
-        ) {
-            this.cbs.onattribdata(start, endIndex);
-        } else {
-            this.cbs.ontext(start, endIndex);
-        }
-    }
     private emitCodePoint(cp: number): void {
         if (
             this.baseState !== State.Text &&
             this.baseState !== State.InSpecialTag
         ) {
+            if (this.sectionStart < this.entityStart) {
+                this.cbs.onattribdata(this.sectionStart, this.entityStart);
+                this.sectionStart = this.entityStart;
+            }
+
             this.cbs.onattribentity(cp);
         } else {
+            if (this.sectionStart < this.entityStart) {
+                this.cbs.ontext(this.sectionStart, this.entityStart);
+                this.sectionStart = this.entityStart;
+            }
+
             this.cbs.ontextentity(cp);
         }
     }

From 80647b52992286f34bcc79490ab9dfb512138006 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 09:58:01 +0100
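Subject: [PATCH 02/12] Fix up indices

Pass the buffer-relative position (`index - offset`) to
`EntityDecoder.write`. When finishing while inside an entity, advance
`index` by the value returned from `EntityDecoder.end` and restore the
base state.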

---
 src/Tokenizer.ts | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 71538bc44..6276d9a79 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -617,7 +617,10 @@ export default class Tokenizer {
     }
 
     private stateInEntity(): void {
-        const length = this.entityDecoder.write(this.buffer, this.index);
+        const length = this.entityDecoder.write(
+            this.buffer,
+            this.index - this.offset
+        );
 
         // If `length` is negative, we need to wait for more data.
         if (length >= 0) {
@@ -775,7 +778,8 @@ export default class Tokenizer {
 
     private finish() {
         if (this.state === State.InEntity) {
-            this.entityDecoder.end();
+            this.index += this.entityDecoder.end();
+            this.state = this.baseState;
         }
 
         // If there is remaining data, emit it in a reasonable way

From e3843f225d418bf4fc9b1e5b40174dff83d3690c Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 11:56:27 +0100
Subject: [PATCH 03/12] Add consumed to decoder callback

---
 src/Tokenizer.ts | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 6276d9a79..1e9ff0a5e 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -171,7 +171,7 @@ export default class Tokenizer {
         this.decodeEntities = decodeEntities;
         this.entityDecoder = new EntityDecoder(
             xmlMode ? xmlDecodeTree : htmlDecodeTree,
-            (cp) => this.emitCodePoint(cp)
+            (cp, consumed) => this.emitCodePoint(cp, consumed)
         );
     }
 
@@ -622,15 +622,9 @@ export default class Tokenizer {
             this.index - this.offset
         );
 
-        // If `length` is negative, we need to wait for more data.
+        // If `length` is positive, we are done with the entity.
         if (length >= 0) {
-            this.index = this.entityStart + length;
             this.state = this.baseState;
-
-            // If we encountered an entity, we already emitted the current section.
-            if (length > 0) {
-                this.sectionStart = this.index;
-            }
         }
     }
 
@@ -778,7 +772,7 @@ export default class Tokenizer {
 
     private finish() {
         if (this.state === State.InEntity) {
-            this.index += this.entityDecoder.end();
+            this.entityDecoder.end();
             this.state = this.baseState;
         }
 
@@ -818,22 +812,24 @@ export default class Tokenizer {
         }
     }
 
-    private emitCodePoint(cp: number): void {
+    private emitCodePoint(cp: number, consumed: number): void {
         if (
             this.baseState !== State.Text &&
             this.baseState !== State.InSpecialTag
         ) {
             if (this.sectionStart < this.entityStart) {
                 this.cbs.onattribdata(this.sectionStart, this.entityStart);
-                this.sectionStart = this.entityStart;
             }
+            this.sectionStart = this.entityStart + consumed;
+            this.index = this.sectionStart - 1;
 
             this.cbs.onattribentity(cp);
         } else {
             if (this.sectionStart < this.entityStart) {
                 this.cbs.ontext(this.sectionStart, this.entityStart);
-                this.sectionStart = this.entityStart;
             }
+            this.sectionStart = this.entityStart + consumed;
+            this.index = this.sectionStart - 1;
 
             this.cbs.ontextentity(cp);
         }

From 0b2e8b13fd44760a71d234a0721b243b81963e0c Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 11:56:42 +0100
Subject: [PATCH 04/12] Add xml mode entity tokenizer test

---
 src/Tokenizer.spec.ts | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts
index 438e23bae..dc8d92297 100644
--- a/src/Tokenizer.spec.ts
+++ b/src/Tokenizer.spec.ts
@@ -1,10 +1,10 @@
 import { Tokenizer } from "./index.js";
 import type { Callbacks } from "./Tokenizer.js";
 
-function tokenize(data: string) {
+function tokenize(data: string, options = {}) {
     const log: unknown[][] = [];
     const tokenizer = new Tokenizer(
-        {},
+        options,
         new Proxy(
             {},
             {
@@ -56,6 +56,13 @@ describe("Tokenizer", () => {
         });
     });
 
+    it("should support XML entities", () =>
+        expect(
+            tokenize("&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#101", {
+                xmlMode: true,
+            })
+        ).toMatchSnapshot());
+
     it("should not lose data when pausing", () => {
         const log: unknown[][] = [];
         const tokenizer = new Tokenizer(

From 85d3056afda3436a77da55006030a898b221c818 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:44:10 +0100
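Subject: [PATCH 05/12] Fix non-entity

When `EntityDecoder.write` returns 0 (no entity was decoded), set
`index` to `entityStart + 1` before returning to the base state. Also
adds the snapshot for the XML entities tokenizer test.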

---
 src/Tokenizer.ts                         |  4 ++
 src/__snapshots__/Tokenizer.spec.ts.snap | 48 ++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 1e9ff0a5e..feebe346d 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -625,6 +625,10 @@ export default class Tokenizer {
         // If `length` is positive, we are done with the entity.
         if (length >= 0) {
             this.state = this.baseState;
+
+            if (length === 0) {
+                this.index = this.entityStart + 1;
+            }
         }
     }
 
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index ec3de47da..2f3fe8ea2 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -87,6 +87,54 @@ exports[`Tokenizer should not lose data when pausing 1`] = `
 ]
 `;
 
+exports[`Tokenizer should support XML entities 1`] = `
+[
+  [
+    "ontextentity",
+    38,
+  ],
+  [
+    "ontextentity",
+    62,
+  ],
+  [
+    "ontext",
+    9,
+    13,
+  ],
+  [
+    "ontextentity",
+    60,
+  ],
+  [
+    "ontext",
+    17,
+    23,
+  ],
+  [
+    "ontextentity",
+    97,
+  ],
+  [
+    "ontext",
+    29,
+    34,
+  ],
+  [
+    "ontextentity",
+    99,
+  ],
+  [
+    "ontext",
+    39,
+    49,
+  ],
+  [
+    "onend",
+  ],
+]
+`;
+
 exports[`Tokenizer should support self-closing special tags for self-closing script tag 1`] = `
 [
   [

From d2813b873ec138076f83b24bc9d79ff3a0a029c9 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:54:33 +0100
Subject: [PATCH 06/12] Remove most base state writes

---
 src/Tokenizer.ts | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index feebe346d..d42616b2a 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -444,7 +444,6 @@ export default class Tokenizer {
         // Skip everything until ">"
         if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
             this.state = State.Text;
-            this.baseState = State.Text;
             this.sectionStart = this.index + 1;
         }
     }
@@ -457,7 +456,6 @@ export default class Tokenizer {
             } else {
                 this.state = State.Text;
             }
-            this.baseState = this.state;
             this.sectionStart = this.index + 1;
         } else if (c === CharCodes.Slash) {
             this.state = State.InSelfClosingTag;
@@ -470,7 +468,6 @@ export default class Tokenizer {
         if (c === CharCodes.Gt) {
             this.cbs.onselfclosingtag(this.index);
             this.state = State.Text;
-            this.baseState = State.Text;
             this.sectionStart = this.index + 1;
             this.isSpecial = false; // Reset special state, in case of self-closing special tags
         } else if (!isWhitespace(c)) {

From 8e48cf7b3275d51a32cf17fbb425366651c875c6 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 21:56:50 +0100
Subject: [PATCH 07/12] Add tokenizer tests

---
 src/Tokenizer.spec.ts                    |  27 ++++-
 src/__snapshots__/Tokenizer.spec.ts.snap | 140 +++++++++++++++++++++++
 2 files changed, 161 insertions(+), 6 deletions(-)

diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts
index dc8d92297..df07b97aa 100644
--- a/src/Tokenizer.spec.ts
+++ b/src/Tokenizer.spec.ts
@@ -56,12 +56,27 @@ describe("Tokenizer", () => {
         });
     });
 
-    it("should support XML entities", () =>
-        expect(
-            tokenize("&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#101", {
-                xmlMode: true,
-            })
-        ).toMatchSnapshot());
+    describe("should handle entities", () => {
+        it("for XML entities", () =>
+            expect(
+                tokenize("&amp;&gt;&amp&lt;&uuml;&#x61;&#x62&#99;&#100&#101", {
+                    xmlMode: true,
+                })
+            ).toMatchSnapshot());
+
+        it("for entities in attributes (#276)", () =>
+            expect(
+                tokenize(
+                    '<img src="?&image_uri=1&&image;=2&image=3"/>?&image_uri=1&&image;=2&image=3'
+                )
+            ).toMatchSnapshot());
+
+        it("for trailing legacy entity", () =>
+            expect(tokenize("&timesbar;&timesbar")).toMatchSnapshot());
+
+        it("for multi-byte entities", () =>
+            expect(tokenize("&NotGreaterFullEqual;")).toMatchSnapshot());
+    });
 
     it("should not lose data when pausing", () => {
         const log: unknown[][] = [];
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 2f3fe8ea2..02954ceaa 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -1,5 +1,145 @@
 // Jest Snapshot v1, https://goo.gl/fbAQLP
 
+exports[`Tokenizer should handle entities for XML entities 1`] = `
+[
+  [
+    "ontextentity",
+    38,
+  ],
+  [
+    "ontextentity",
+    62,
+  ],
+  [
+    "ontext",
+    9,
+    13,
+  ],
+  [
+    "ontextentity",
+    60,
+  ],
+  [
+    "ontext",
+    17,
+    23,
+  ],
+  [
+    "ontextentity",
+    97,
+  ],
+  [
+    "ontext",
+    29,
+    34,
+  ],
+  [
+    "ontextentity",
+    99,
+  ],
+  [
+    "ontext",
+    39,
+    49,
+  ],
+  [
+    "onend",
+  ],
+]
+`;
+
+exports[`Tokenizer should handle entities for entities in attributes (#276) 1`] = `
+[
+  [
+    "onopentagname",
+    1,
+    4,
+  ],
+  [
+    "onattribname",
+    5,
+    8,
+  ],
+  [
+    "onattribdata",
+    10,
+    24,
+  ],
+  [
+    "onattribentity",
+    8465,
+  ],
+  [
+    "onattribdata",
+    31,
+    41,
+  ],
+  [
+    "onattribend",
+    3,
+    41,
+  ],
+  [
+    "onselfclosingtag",
+    43,
+  ],
+  [
+    "ontext",
+    44,
+    58,
+  ],
+  [
+    "ontextentity",
+    8465,
+  ],
+  [
+    "ontext",
+    65,
+    75,
+  ],
+  [
+    "onend",
+  ],
+]
+`;
+
+exports[`Tokenizer should handle entities for multi-byte entities 1`] = `
+[
+  [
+    "ontextentity",
+    8807,
+  ],
+  [
+    "ontextentity",
+    824,
+  ],
+  [
+    "onend",
+  ],
+]
+`;
+
+exports[`Tokenizer should handle entities for trailing legacy entity 1`] = `
+[
+  [
+    "ontextentity",
+    10801,
+  ],
+  [
+    "ontextentity",
+    215,
+  ],
+  [
+    "ontext",
+    16,
+    19,
+  ],
+  [
+    "onend",
+  ],
+]
+`;
+
 exports[`Tokenizer should not break after special tag followed by an entity for normal special tag 1`] = `
 [
   [

From 22364d7662e0f791d3427058d52ae12896790c76 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Sun, 2 Apr 2023 22:25:14 +0100
Subject: [PATCH 08/12] Fix trailing data handling

---
 src/Tokenizer.ts                         | 14 ++++---
 src/__snapshots__/Tokenizer.spec.ts.snap | 48 ------------------------
 2 files changed, 9 insertions(+), 53 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index d42616b2a..902e73c83 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -624,7 +624,7 @@ export default class Tokenizer {
             this.state = this.baseState;
 
             if (length === 0) {
-                this.index = this.entityStart + 1;
+                this.index = this.entityStart;
             }
         }
     }
@@ -777,16 +777,20 @@ export default class Tokenizer {
             this.state = this.baseState;
         }
 
-        // If there is remaining data, emit it in a reasonable way
-        if (this.sectionStart < this.index) {
-            this.handleTrailingData();
-        }
+        this.handleTrailingData();
+
         this.cbs.onend();
     }
 
     /** Handle any trailing data. */
     private handleTrailingData() {
         const endIndex = this.buffer.length + this.offset;
+
+        // If there is no remaining data, we are done.
+        if (this.sectionStart >= endIndex) {
+            return;
+        }
+
         if (this.state === State.InCommentLike) {
             if (this.currentSequence === Sequences.CdataEnd) {
                 this.cbs.oncdata(this.sectionStart, endIndex, 0);
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 02954ceaa..70b3ce473 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -227,54 +227,6 @@ exports[`Tokenizer should not lose data when pausing 1`] = `
 ]
 `;
 
-exports[`Tokenizer should support XML entities 1`] = `
-[
-  [
-    "ontextentity",
-    38,
-  ],
-  [
-    "ontextentity",
-    62,
-  ],
-  [
-    "ontext",
-    9,
-    13,
-  ],
-  [
-    "ontextentity",
-    60,
-  ],
-  [
-    "ontext",
-    17,
-    23,
-  ],
-  [
-    "ontextentity",
-    97,
-  ],
-  [
-    "ontext",
-    29,
-    34,
-  ],
-  [
-    "ontextentity",
-    99,
-  ],
-  [
-    "ontext",
-    39,
-    49,
-  ],
-  [
-    "onend",
-  ],
-]
-`;
-
 exports[`Tokenizer should support self-closing special tags for self-closing script tag 1`] = `
 [
   [

From 7e969186d086ad5fa5d02d27fe0a1a52e9e9f9ef Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Wed, 5 Apr 2023 11:14:02 +0100
Subject: [PATCH 09/12] fix: Increase index to mark buffer as consumed

---
 src/Tokenizer.spec.ts | 3 ++-
 src/Tokenizer.ts      | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/Tokenizer.spec.ts b/src/Tokenizer.spec.ts
index df07b97aa..4c635272c 100644
--- a/src/Tokenizer.spec.ts
+++ b/src/Tokenizer.spec.ts
@@ -97,7 +97,8 @@ describe("Tokenizer", () => {
             ) as Callbacks
         );
 
-        tokenizer.write("&amp; it up!");
+        tokenizer.write("&am");
+        tokenizer.write("p; it up!");
         tokenizer.resume();
         tokenizer.resume();
 
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 902e73c83..7a784c953 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -626,6 +626,9 @@ export default class Tokenizer {
             if (length === 0) {
                 this.index = this.entityStart;
             }
+        } else {
+            // Mark buffer as consumed.
+            this.index = this.offset + this.buffer.length - 1;
         }
     }
 

From e018a5c074743b96323fa3db930d6e7910db2738 Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Wed, 5 Apr 2023 11:24:25 +0100
Subject: [PATCH 10/12] Add `endIndex` to `ontextentity`

And remove unused getter methods
---
 src/Parser.ts                            | 11 +++--------
 src/Tokenizer.ts                         | 18 ++----------------
 src/__snapshots__/Tokenizer.spec.ts.snap | 13 +++++++++++++
 3 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/src/Parser.ts b/src/Parser.ts
index 710f44274..e0bb79abe 100644
--- a/src/Parser.ts
+++ b/src/Parser.ts
@@ -251,15 +251,10 @@ export class Parser implements Callbacks {
     }
 
     /** @internal */
-    ontextentity(cp: number): void {
-        /*
-         * Entities can be emitted on the character, or directly after.
-         * We use the section start here to get accurate indices.
-         */
-        const index = this.tokenizer.getSectionStart();
-        this.endIndex = index - 1;
+    ontextentity(cp: number, endIndex: number): void {
+        this.endIndex = endIndex - 1;
         this.cbs.ontext?.(fromCodePoint(cp));
-        this.startIndex = index;
+        this.startIndex = endIndex;
     }
 
     protected isVoidElement(name: string): boolean {
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 7a784c953..31218aeae 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -118,7 +118,7 @@ export interface Callbacks {
     onprocessinginstruction(start: number, endIndex: number): void;
     onselfclosingtag(endIndex: number): void;
     ontext(start: number, endIndex: number): void;
-    ontextentity(codepoint: number): void;
+    ontextentity(codepoint: number, endIndex: number): void;
 }
 
 /**
@@ -207,20 +207,6 @@ export default class Tokenizer {
         }
     }
 
-    /**
-     * The current index within all of the written data.
-     */
-    public getIndex(): number {
-        return this.index;
-    }
-
-    /**
-     * The start of the current section.
-     */
-    public getSectionStart(): number {
-        return this.sectionStart;
-    }
-
     private stateText(c: number): void {
         if (
             c === CharCodes.Lt ||
@@ -839,7 +825,7 @@ export default class Tokenizer {
             this.sectionStart = this.entityStart + consumed;
             this.index = this.sectionStart - 1;
 
-            this.cbs.ontextentity(cp);
+            this.cbs.ontextentity(cp, this.sectionStart);
         }
     }
 }
diff --git a/src/__snapshots__/Tokenizer.spec.ts.snap b/src/__snapshots__/Tokenizer.spec.ts.snap
index 70b3ce473..36722cdc3 100644
--- a/src/__snapshots__/Tokenizer.spec.ts.snap
+++ b/src/__snapshots__/Tokenizer.spec.ts.snap
@@ -5,10 +5,12 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
   [
     "ontextentity",
     38,
+    5,
   ],
   [
     "ontextentity",
     62,
+    9,
   ],
   [
     "ontext",
@@ -18,6 +20,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
   [
     "ontextentity",
     60,
+    17,
   ],
   [
     "ontext",
@@ -27,6 +30,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
   [
     "ontextentity",
     97,
+    29,
   ],
   [
     "ontext",
@@ -36,6 +40,7 @@ exports[`Tokenizer should handle entities for XML entities 1`] = `
   [
     "ontextentity",
     99,
+    39,
   ],
   [
     "ontext",
@@ -91,6 +96,7 @@ exports[`Tokenizer should handle entities for entities in attributes (#276) 1`]
   [
     "ontextentity",
     8465,
+    65,
   ],
   [
     "ontext",
@@ -108,10 +114,12 @@ exports[`Tokenizer should handle entities for multi-byte entities 1`] = `
   [
     "ontextentity",
     8807,
+    21,
   ],
   [
     "ontextentity",
     824,
+    21,
   ],
   [
     "onend",
@@ -124,10 +132,12 @@ exports[`Tokenizer should handle entities for trailing legacy entity 1`] = `
   [
     "ontextentity",
     10801,
+    10,
   ],
   [
     "ontextentity",
     215,
+    16,
   ],
   [
     "ontext",
@@ -164,6 +174,7 @@ exports[`Tokenizer should not break after special tag followed by an entity for
   [
     "ontextentity",
     39,
+    24,
   ],
   [
     "onopentagname",
@@ -194,6 +205,7 @@ exports[`Tokenizer should not break after special tag followed by an entity for
   [
     "ontextentity",
     39,
+    15,
   ],
   [
     "onopentagname",
@@ -215,6 +227,7 @@ exports[`Tokenizer should not lose data when pausing 1`] = `
   [
     "ontextentity",
     38,
+    5,
   ],
   [
     "ontext",

From 202edccab236d0b112c859ac571ebcecf0a3c4cf Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Thu, 6 Apr 2023 11:55:43 +0100
Subject: [PATCH 11/12] Update enum name: `EntityDecoderMode` -> `DecodingMode`

---
 src/Tokenizer.ts | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index 31218aeae..40332be1d 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -1,6 +1,6 @@
 import {
     EntityDecoder,
-    EntityDecoderMode,
+    DecodingMode,
     htmlDecodeTree,
     xmlDecodeTree,
 } from "entities/lib/decode.js";
@@ -591,11 +591,11 @@ export default class Tokenizer {
         this.entityStart = this.index;
         this.entityDecoder.startEntity(
             this.xmlMode
-                ? EntityDecoderMode.Strict
+                ? DecodingMode.Strict
                 : this.baseState === State.Text ||
                   this.baseState === State.InSpecialTag
-                ? EntityDecoderMode.Text
-                : EntityDecoderMode.Attribute
+                ? DecodingMode.Legacy
+                : DecodingMode.Attribute
         );
     }
 

From 07946daf85f300b908f195b6f843e68193a7cf8f Mon Sep 17 00:00:00 2001
From: Felix <188768+fb55@users.noreply.github.com>
Date: Thu, 13 Apr 2023 19:10:52 +0100
Subject: [PATCH 12/12] Bump `entities`

---
 package-lock.json | 14 +++++++-------
 package.json      |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index af26f8034..6c53e67dc 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,7 +19,7 @@
                 "domelementtype": "^2.3.0",
                 "domhandler": "^5.0.3",
                 "domutils": "^3.0.1",
-                "entities": "^4.4.0"
+                "entities": "^4.5.0"
             },
             "devDependencies": {
                 "@types/jest": "^29.5.0",
@@ -2135,9 +2135,9 @@
             "dev": true
         },
         "node_modules/entities": {
-            "version": "4.4.0",
-            "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz",
-            "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA==",
+            "version": "4.5.0",
+            "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
+            "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
             "engines": {
                 "node": ">=0.12"
             },
@@ -6752,9 +6752,9 @@
             "dev": true
         },
         "entities": {
-            "version": "4.4.0",
-            "resolved": "https://registry.npmjs.org/entities/-/entities-4.4.0.tgz",
-            "integrity": "sha512-oYp7156SP8LkeGD0GF85ad1X9Ai79WtRsZ2gxJqtBuzH+98YUV6jkHEKlZkMbcrjJjIVJNIDP/3WL9wQkoPbWA=="
+            "version": "4.5.0",
+            "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
+            "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="
         },
         "error-ex": {
             "version": "1.3.2",
diff --git a/package.json b/package.json
index 2154dcd82..dce9f7d86 100644
--- a/package.json
+++ b/package.json
@@ -64,7 +64,7 @@
         "domelementtype": "^2.3.0",
         "domhandler": "^5.0.3",
         "domutils": "^3.0.1",
-        "entities": "^4.4.0"
+        "entities": "^4.5.0"
     },
     "devDependencies": {
         "@types/jest": "^29.5.0",