From 0abb77182e901eee2b74bc6cabdefae0a62b3abf Mon Sep 17 00:00:00 2001
From: Paul Berberian
Date: Tue, 28 Jan 2025 16:56:27 +0100
Subject: [PATCH] Implement Fragmented WebVTT vtte/vttc

As #1638 reported, we do not support some (or all?) ways of communicating WebVTT subtitles in mp4 files. I initially assumed that WebVTT embedded in an mp4 file worked like it does for TTML (a format we're more used to): the plain subtitle format inserted directly into an `mdat` box. It turns out that some contents (like https://demo.unified-streaming.com/k8s/features/stable/video/tears-of-steel/tears-of-steel-wvtt.ism/.mpd) actually rely on metadata from other ISOBMFF boxes (mainly the tfdt and trun boxes) to provide timing information, with the `mdat` reserved for text, identifiers and style information in a new binary format that follows the ISOBMFF way of doing things (new WebVTT-specific boxes). Weirdly enough, that format does not look at all like the WEBVTT we're used to, besides the fact that it uses the same identifier and "settings" concepts.

---

Because our subtitle parser now needs the context of the whole mp4 file (and not just of the mp4 segment), and because that parser also has to rely on state (a `timescale` value) provided by an initialization segment, I had to update the architecture of how subtitles are communicated: they can now be communicated either as a string or as a `BufferSource` (the latter triggering text encoding detection), and a supplementary `timescale` argument (defaulting to `1`) is now always provided to parsers. The vast majority of parsers do not make use of that `timescale` value though, which is kind of ugly; we may want to find a better solution.

---
 .../implementations/text/text_segment_sink.ts | 50 +++-- .../text_displayer/html/html_parsers.ts | 7 +- .../html/html_text_displayer.ts | 23 +- .../text_displayer/native/native_parsers.ts | 9 +- .../native/native_text_displayer.ts | 19 +- src/parsers/containers/isobmff/utils.ts | 89 ++++++++ src/parsers/texttracks/sami/html.ts | 20 +- src/parsers/texttracks/sami/native.ts | 16 +- src/parsers/texttracks/srt/html.ts | 16 +- src/parsers/texttracks/srt/native.ts | 16 +- .../__global__/html_ttml_parser.test.ts | 2 +- .../texttracks/ttml/html/parse_ttml_to_div.ts | 19 +- .../ttml/native/parse_ttml_to_vtt.ts | 19 +- src/parsers/texttracks/types.ts | 6 +- .../__tests__/parse_webvtt_to_div.test.ts | 8 +- .../webvtt/html/parse_webvtt_to_div.ts | 196 +++++++++++++++++- .../webvtt/native/parse_vtt_to_cues.ts | 16 +- .../TextTrackRenderer/text_track_renderer.ts | 6 + src/transports/dash/text_parser.ts | 12 +- src/transports/local/text_parser.ts | 12 +- src/transports/smooth/pipelines.ts | 1 + src/transports/types.ts | 16 +- src/transports/utils/parse_text_track.ts | 37 +++- src/utils/buffer_source_to_uint8.ts | 14 ++ src/utils/byte_parsing.ts | 20 ++ 25 files changed, 585 insertions(+), 64 deletions(-) create mode 100644 src/utils/buffer_source_to_uint8.ts diff --git a/src/core/segment_sinks/implementations/text/text_segment_sink.ts b/src/core/segment_sinks/implementations/text/text_segment_sink.ts index 39b9366f9f..eb83277aa5 100644 --- a/src/core/segment_sinks/implementations/text/text_segment_sink.ts +++ b/src/core/segment_sinks/implementations/text/text_segment_sink.ts @@ -1,6 +1,7 @@ import log from "../../../../log"; import type { ITextDisplayer } from "../../../../main_thread/types"; import type { ITextTrackSegmentData } from "../../../../transports"; +import isNullOrUndefined from "../../../../utils/is_null_or_undefined"; import getMonotonicTimeStamp from 
"../../../../utils/monotonic_timestamp"; import type { IRange } from "../../../../utils/ranges"; import type { ICompleteSegmentInfo, IPushChunkInfos, ISBOperation } from "../types"; @@ -142,9 +143,9 @@ export default class TextSegmentSink extends SegmentSink { } /** Data of chunks that should be pushed to the HTMLTextSegmentSink. */ -export interface ITextTracksBufferSegmentData { +export interface ITextTracksBufferSegmentData { /** The text track data, in the format indicated in `type`. */ - data: string; + data: T; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** @@ -153,6 +154,11 @@ export interface ITextTracksBufferSegmentData { * be parsed. */ language?: string | undefined; + /** + * Optional timescale data context that is used to convert timing information + * into seconds. + */ + timescale: number | null; /** start time from which the segment apply, in seconds. */ start?: number | undefined; /** end time until which the segment apply, in seconds. */ @@ -167,7 +173,7 @@ export interface ITextTracksBufferSegmentData { */ function assertChunkIsTextTrackSegmentData( chunk: unknown, -): asserts chunk is ITextTracksBufferSegmentData { +): asserts chunk is ITextTracksBufferSegmentData { if ( (__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.PRODUCTION as number) ) { @@ -176,14 +182,30 @@ function assertChunkIsTextTrackSegmentData( if ( typeof chunk !== "object" || chunk === null || - typeof (chunk as ITextTracksBufferSegmentData).data !== "string" || - typeof (chunk as ITextTracksBufferSegmentData).type !== "string" || - ((chunk as ITextTracksBufferSegmentData).language !== undefined && - typeof (chunk as ITextTracksBufferSegmentData).language !== "string") || - ((chunk as ITextTracksBufferSegmentData).start !== undefined && - typeof (chunk as ITextTracksBufferSegmentData).start !== "number") || - ((chunk as ITextTracksBufferSegmentData).end !== undefined && - typeof (chunk as ITextTracksBufferSegmentData).end !== "number") + isNullOrUndefined((chunk as ITextTracksBufferSegmentData).data) + ) { + throw new Error("Invalid format given to a TextSegmentSink"); + } + if ( + typeof (chunk as ITextTracksBufferSegmentData).type !== + "string" || + ((chunk as ITextTracksBufferSegmentData).language !== + undefined && + typeof (chunk as ITextTracksBufferSegmentData).language !== + "string") || + ((chunk as ITextTracksBufferSegmentData).start !== undefined && + typeof (chunk as ITextTracksBufferSegmentData).start !== + "number") || + ((chunk as ITextTracksBufferSegmentData).end !== undefined && + typeof (chunk as ITextTracksBufferSegmentData).end !== + "number") + ) { + throw new Error("Invalid format given to a TextSegmentSink"); + } + if ( + typeof (chunk as ITextTracksBufferSegmentData).data !== "string" && + typeof (chunk as ITextTracksBufferSegmentData).data.byteLength !== + "number" ) { throw new Error("Invalid format given to a TextSegmentSink"); } @@ -229,8 +251,10 @@ export interface ITextDisplayerInterface { */ if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) { // @ts-expect-error: unused function for type checking - function _checkType(input: ITextTrackSegmentData): void { - function checkEqual(_arg: ITextTracksBufferSegmentData): void { + function _checkType( + input: ITextTrackSegmentData, + ): void { + function checkEqual(_arg: ITextTracksBufferSegmentData): void { /* nothing */ } checkEqual(input); diff --git a/src/main_thread/text_displayer/html/html_parsers.ts 
b/src/main_thread/text_displayer/html/html_parsers.ts index 1d77d69b4a..13ca4de0bd 100644 --- a/src/main_thread/text_displayer/html/html_parsers.ts +++ b/src/main_thread/text_displayer/html/html_parsers.ts @@ -11,6 +11,8 @@ export interface IHTMLCue { * Convert text track data into timed HTML Cues. * @param {string} type - Text track format wanted * @param {string} data - Text track data + * @param {Number} timescale - Potential external timescale to convert timing + * information into seconds. * @param {Number} timestampOffset - offset to apply to every timed text * @param {string} [language] - language of the text tracks * @returns {Array.} @@ -18,7 +20,8 @@ export interface IHTMLCue { */ export default function parseTextTrackToElements( type: string, - data: string, + data: string | BufferSource, + timescale: number, timestampOffset: number, language?: string, ): IHTMLCue[] { @@ -29,7 +32,7 @@ export default function parseTextTrackToElements( throw new Error("no parser found for the given text track"); } log.debug("HTSB: Parser found, parsing..."); - const parsed = parser(data, timestampOffset, language); + const parsed = parser(data, timescale, timestampOffset, language); log.debug("HTTB: Parsed successfully!", parsed.length); return parsed; } diff --git a/src/main_thread/text_displayer/html/html_text_displayer.ts b/src/main_thread/text_displayer/html/html_text_displayer.ts index 91b1b2858f..dfc8c10e4a 100644 --- a/src/main_thread/text_displayer/html/html_text_displayer.ts +++ b/src/main_thread/text_displayer/html/html_text_displayer.ts @@ -126,12 +126,25 @@ export default class HTMLTextDisplayer implements ITextDisplayer { return convertToRanges(this._buffered); } - const { start: startTime, end: endTime, data: dataString, type, language } = chunk; + const { + start: startTime, + end: endTime, + data: dataRaw, + type, + language, + timescale, + } = chunk; const appendWindowStart = appendWindow[0] ?? 0; const appendWindowEnd = appendWindow[1] ?? Infinity; - const cues = parseTextTrackToElements(type, dataString, timestampOffset, language); + const cues = parseTextTrackToElements( + type, + dataRaw, + timescale ?? 1, + timestampOffset, + language, + ); if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) { // Removing before window start @@ -398,7 +411,7 @@ export default class HTMLTextDisplayer implements ITextDisplayer { /** Data of chunks that should be pushed to the `HTMLTextDisplayer`. */ export interface ITextTracksBufferSegmentData { /** The text track data, in the format indicated in `type`. */ - data: string; + data: string | BufferSource; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** @@ -424,7 +437,9 @@ export interface ITextTracksBufferSegmentData { */ if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) { // @ts-expect-error: uncalled function just for type-checking - function _checkType(input: ITextTrackSegmentData): void { + function _checkType( + input: ITextTrackSegmentData, + ): void { function checkEqual(_arg: ITextTracksBufferSegmentData): void { /* nothing */ } diff --git a/src/main_thread/text_displayer/native/native_parsers.ts b/src/main_thread/text_displayer/native/native_parsers.ts index 9aec18dcc5..6fb16f6c9e 100644 --- a/src/main_thread/text_displayer/native/native_parsers.ts +++ b/src/main_thread/text_displayer/native/native_parsers.ts @@ -5,7 +5,9 @@ import log from "../../../log"; /** * Convert text track data into timed VTT Cues. 
* @param {string} type - Text track format wanted - * @param {string} data - Text track data + * @param {string|BufferSource} data - Text track data + * @param {Number} timescale - Potential external timescale to convert timing + * information into seconds. * @param {Number} timestampOffset - offset to apply to every timed text * @param {string} [language] - language of the text tracks * @returns {Array.} @@ -13,7 +15,8 @@ import log from "../../../log"; */ export default function parseTextTrackToCues( type: string, - data: string, + data: string | BufferSource, + timescale: number, timestampOffset: number, language?: string, ): Array { @@ -25,7 +28,7 @@ export default function parseTextTrackToCues( } log.debug("NTSB: Parser found, parsing..."); - const parsed = parser(data, timestampOffset, language); + const parsed = parser(data, timescale, timestampOffset, language); log.debug("NTSB: Parsed successfully!", parsed.length); return parsed; } diff --git a/src/main_thread/text_displayer/native/native_text_displayer.ts b/src/main_thread/text_displayer/native/native_text_displayer.ts index b768f62cdf..147f75e8b7 100644 --- a/src/main_thread/text_displayer/native/native_text_displayer.ts +++ b/src/main_thread/text_displayer/native/native_text_displayer.ts @@ -48,10 +48,23 @@ export default class NativeTextDisplayer implements ITextDisplayer { return convertToRanges(this._buffered); } const { timestampOffset, appendWindow, chunk } = infos; - const { start: startTime, end: endTime, data: dataString, type, language } = chunk; + const { + start: startTime, + end: endTime, + data: dataString, + type, + language, + timescale, + } = chunk; const appendWindowStart = appendWindow[0] ?? 0; const appendWindowEnd = appendWindow[1] ?? Infinity; - const cues = parseTextTrackToCues(type, dataString, timestampOffset, language); + const cues = parseTextTrackToCues( + type, + dataString, + timescale ?? 1, + timestampOffset, + language, + ); if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) { // Removing before window start @@ -222,7 +235,7 @@ export default class NativeTextDisplayer implements ITextDisplayer { /** Data of chunks that should be pushed to the NativeTextDisplayer. */ export interface INativeTextTracksBufferSegmentData { /** The text track data, in the format indicated in `type`. 
*/ - data: string; + data: string | BufferSource; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** diff --git a/src/parsers/containers/isobmff/utils.ts b/src/parsers/containers/isobmff/utils.ts index 2f21752d45..ce7737c3ff 100644 --- a/src/parsers/containers/isobmff/utils.ts +++ b/src/parsers/containers/isobmff/utils.ts @@ -20,6 +20,7 @@ import { be2toi, be3toi, be4toi, + be4toiSigned, be8toi, concat, itobe4, @@ -233,6 +234,92 @@ function getDefaultDurationFromTFHDInTRAF(traf: Uint8Array): number | undefined return defaultDuration; } +interface ITrunSampleInfo { + duration: number; + compositionTimeOffset: number | undefined; + size: number | undefined; + flags: number | undefined; +} + +function getTrunSamples(buffer: Uint8Array): ITrunSampleInfo[] { + const trafs = getTRAFs(buffer); + const samples: ITrunSampleInfo[] = []; + for (const traf of trafs) { + const trun = getBoxContent(traf, 0x7472756e /* trun */); + if (trun === null) { + continue; + } + let cursor = 0; + const version = trun[cursor]; + cursor += 1; + if (version > 1) { + return []; + } + + const flags = be3toi(trun, cursor); + cursor += 3; + const hasSampleDuration = (flags & 0x000100) > 0; + + let defaultDuration: number | undefined = 0; + if (!hasSampleDuration) { + defaultDuration = getDefaultDurationFromTFHDInTRAF(traf); + if (defaultDuration === undefined) { + return []; + } + } + + const hasDataOffset = (flags & 0x000001) > 0; + const hasFirstSampleFlags = (flags & 0x000004) > 0; + const hasSampleSize = (flags & 0x000200) > 0; + const hasSampleFlags = (flags & 0x000400) > 0; + const hasSampleCompositionOffset = (flags & 0x000800) > 0; + + const sampleCounts = be4toi(trun, cursor); + cursor += 4; + + if (hasDataOffset) { + cursor += 4; + } + if (hasFirstSampleFlags) { + cursor += 4; + } + + let i = sampleCounts; + while (i-- > 0) { + let duration; + let size; + let sampleFlags; + let compositionTimeOffset; + if (hasSampleDuration) { + duration = be4toi(trun, cursor); + cursor += 4; + } else { + duration = defaultDuration; + } + if (hasSampleSize) { + size = be4toi(trun, cursor); + cursor += 4; + } + if (hasSampleFlags) { + sampleFlags = be4toi(trun, cursor); + cursor += 4; + } + if (hasSampleCompositionOffset) { + compositionTimeOffset = + version === 0 ? be4toi(trun, cursor) : be4toiSigned(trun, cursor); + cursor += 4; + } + samples.push({ + duration, + compositionTimeOffset, + size, + flags: sampleFlags, + }); + } + } + return samples; +} + /** * Calculate segment duration approximation by additioning the duration from * every samples in a trun ISOBMFF box. @@ -563,6 +650,7 @@ function getKeyIdFromInitSegment(segment: Uint8Array): Uint8Array | null { return keyId.every((b) => b === 0) ? null : keyId; } +export type { ITrunSampleInfo }; export { getKeyIdFromInitSegment, getMDHDTimescale, @@ -573,4 +661,5 @@ export { patchPssh, updateBoxLength, parseEmsgBoxes, + getTrunSamples, }; diff --git a/src/parsers/texttracks/sami/html.ts b/src/parsers/texttracks/sami/html.ts index c73ed84fbb..dbac926cfd 100644 --- a/src/parsers/texttracks/sami/html.ts +++ b/src/parsers/texttracks/sami/html.ts @@ -29,8 +29,10 @@ * It always should be imported through the `features` object. 
*/ +import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8"; import isNonEmptyString from "../../../utils/is_non_empty_string"; import isNullOrUndefined from "../../../utils/is_null_or_undefined"; +import { utf8ToStr } from "../../../utils/string_parsing"; import type { IHTMLCue } from "../types"; const HTML_ENTITIES = /&#([0-9]+);/g; @@ -99,11 +101,25 @@ function decodeEntities(text: string): string { * The specification being quite clunky, this parser * may not work for every sami input. * - * @param {string} smi + * @param {string|BufferSource} input + * @param {Number} _timescale * @param {Number} timeOffset * @param {string} lang */ -function parseSami(smi: string, timeOffset: number, lang?: string): IHTMLCue[] { +function parseSami( + input: string | BufferSource, + _timescale: number, + timeOffset: number, + lang?: string, +): IHTMLCue[] { + let smi: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? + smi = utf8ToStr(bufferSourceToUint8(input)); + } else { + smi = input; + } const syncOpen = /]/gi; const syncClose = /]|<\/body>/gi; diff --git a/src/parsers/texttracks/sami/native.ts b/src/parsers/texttracks/sami/native.ts index 441007da30..bb1c850686 100644 --- a/src/parsers/texttracks/sami/native.ts +++ b/src/parsers/texttracks/sami/native.ts @@ -21,8 +21,10 @@ import type { ICompatVTTCue } from "../../../compat/browser_compatibility_types"; import makeVTTCue from "../../../compat/make_vtt_cue"; +import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8"; import isNonEmptyString from "../../../utils/is_non_empty_string"; import isNullOrUndefined from "../../../utils/is_null_or_undefined"; +import { utf8ToStr } from "../../../utils/string_parsing"; const HTML_ENTITIES = /&#([0-9]+);/g; const BR = /
/gi; @@ -104,16 +106,26 @@ function decodeEntities(text: string): string { * The specification being quite clunky, this parser * may not work for every sami input. * - * @param {string} smi + * @param {string|BufferSource} input + * @param {Number} _timescale * @param {Number} timeOffset * @param {string} lang * @returns {Array.} */ function parseSami( - smi: string, + input: string | BufferSource, + _timescale: number, timeOffset: number, lang?: string, ): Array { + let smi: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? + smi = utf8ToStr(bufferSourceToUint8(input)); + } else { + smi = input; + } const syncOpen = /]/gi; const syncClose = /]|<\/body>/gi; diff --git a/src/parsers/texttracks/srt/html.ts b/src/parsers/texttracks/srt/html.ts index d123106b9b..43b0178cb1 100644 --- a/src/parsers/texttracks/srt/html.ts +++ b/src/parsers/texttracks/srt/html.ts @@ -24,6 +24,8 @@ // Done for fun. Understand , , and type // of tags. +import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../utils/string_parsing"; import getCueBlocks from "./get_cue_blocks"; import parseCueBlock from "./parse_cue"; @@ -34,14 +36,24 @@ export interface ISRTHTMLCue { } /** - * @param {string} srtStr + * @param {string|BufferSource} input + * @param {Number} _timescale * @param {Number} timeOffset * @returns {Array.} */ export default function parseSRTStringToHTML( - srtStr: string, + input: string | BufferSource, + _timescale: number, timeOffset: number, ): ISRTHTMLCue[] { + let srtStr: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? + srtStr = utf8ToStr(bufferSourceToUint8(input)); + } else { + srtStr = input; + } // Even if srt only authorize CRLF, we will also take LF or CR as line // terminators for resilience const lines = srtStr.split(/\r\n|\n|\r/); diff --git a/src/parsers/texttracks/srt/native.ts b/src/parsers/texttracks/srt/native.ts index 80485059ff..b14f32e4e6 100644 --- a/src/parsers/texttracks/srt/native.ts +++ b/src/parsers/texttracks/srt/native.ts @@ -24,20 +24,32 @@ import type { ICompatVTTCue } from "../../../compat/browser_compatibility_types"; import makeVTTCue from "../../../compat/make_vtt_cue"; +import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../utils/string_parsing"; import getCueBlocks from "./get_cue_blocks"; import parseCueBlock from "./parse_cue"; /** * Parse whole srt file into an array of cues, to be inserted in a video's * TrackElement. - * @param {string} srtStr + * @param {string|bufferSource} input + * @param {Number} _timescale * @param {Number} timeOffset * @returns {Array.} */ export default function parseSRTStringToVTTCues( - srtStr: string, + input: string | BufferSource, + _timescale: number, timeOffset: number, ): Array { + let srtStr: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? 
+ srtStr = utf8ToStr(bufferSourceToUint8(input)); + } else { + srtStr = input; + } // Even if srt only authorize CRLF, we will also take LF or CR as line // terminators for resilience const lines = srtStr.split(/\r\n|\n|\r/); diff --git a/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts b/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts index 5716ea18e7..943d757080 100644 --- a/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts +++ b/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts @@ -79,7 +79,7 @@ const testingText = ` `; describe("Global TTML HTML parsing tests", () => { - const res = parseTTMLToDiv(testingText, 0); + const res = parseTTMLToDiv(testingText, 1, 0); it("should parse the right amount of cues at the right time", () => { expect(res).toHaveLength(11); expect(res[0].start).toEqual(0.76); diff --git a/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts b/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts index d6b87561e9..ff0dbe8c97 100644 --- a/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts +++ b/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts @@ -14,6 +14,8 @@ * limitations under the License. */ +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../../utils/string_parsing"; import parseTtml from "../parse_ttml"; import { applyDefaultTTMLStyle, @@ -36,10 +38,23 @@ import parseCue from "./parse_cue"; * TODO TTML parsing is still pretty heavy on the CPU. * Optimizations have been done, principally to avoid using too much XML APIs, * but we can still do better. - * @param {string} str + * @param {string|BufferSource} input + * @param {number} _timescale * @param {number} timeOffset */ -export default function parseTTMLToDiv(str: string, timeOffset: number): ITTMLHTMLCue[] { +export default function parseTTMLToDiv( + input: string | BufferSource, + _timescale: number, + timeOffset: number, +): ITTMLHTMLCue[] { + let str: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? + str = utf8ToStr(bufferSourceToUint8(input)); + } else { + str = input; + } const parsedCues = parseTtml(str, timeOffset); const cues: ITTMLHTMLCue[] = []; for (const parsedCue of parsedCues) { diff --git a/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts b/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts index d2c11be02b..64f49d98b7 100644 --- a/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts +++ b/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts @@ -15,17 +15,30 @@ */ import type { ICompatVTTCue } from "../../../../compat/browser_compatibility_types"; +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../../utils/string_parsing"; import parseTtml from "../parse_ttml"; import parseCue from "./parse_cue"; /** - * @param str - * @param timeOffset + * @param {string|BufferSource} input + * @param {number} _timescale + * @param {number} timeOffset + * @returns {Array.} */ export default function parseTtmlToNative( - str: string, + input: string | BufferSource, + _timescale: number, timeOffset: number, ): Array { + let str: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? 
+ str = utf8ToStr(bufferSourceToUint8(input)); + } else { + str = input; + } const parsedCues = parseTtml(str, timeOffset); const cues: Array = []; for (const parsedCue of parsedCues) { diff --git a/src/parsers/texttracks/types.ts b/src/parsers/texttracks/types.ts index 529e8ca32f..d13c900a04 100644 --- a/src/parsers/texttracks/types.ts +++ b/src/parsers/texttracks/types.ts @@ -25,14 +25,16 @@ export interface IHTMLCue { // Function to parse texttracks into native VTT cues export type INativeTextTracksParserFn = ( - texttrack: string, + texttrack: string | BufferSource, + timescale: number, timeOffset: number, language?: string, ) => Array; // Function to parse texttracks into HTML cues export type IHTMLTextTracksParserFn = ( - texttrack: string, + texttrack: string | BufferSource, + timescale: number, timeOffset: number, language?: string, ) => IHTMLCue[]; diff --git a/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts b/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts index f221f0d220..cfc77df075 100644 --- a/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts +++ b/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts @@ -9,13 +9,13 @@ describe("parsers - webvtt - parseWebVTT", () => { it("should throw if text is empty", async () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(() => parseWebVTT("", 0)).toThrowError("Can't parse WebVTT: Invalid File."); + expect(() => parseWebVTT("", 1, 0)).toThrowError("Can't parse WebVTT: Invalid File."); }); it("should throw if file seems to be invalid", async () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(() => parseWebVTT("WEBWTT\n", 0)).toThrowError( + expect(() => parseWebVTT("WEBWTT\n", 1, 0)).toThrowError( "Can't parse WebVTT: Invalid File.", ); }); @@ -73,7 +73,7 @@ describe("parsers - webvtt - parseWebVTT", () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(parseWebVTT("WEBVTT\n", 0)).toEqual([ + expect(parseWebVTT("WEBVTT\n", 1, 0)).toEqual([ { element: document.createElement("div"), end: 100, @@ -140,7 +140,7 @@ describe("parsers - webvtt - parseWebVTT", () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(parseWebVTT("WEBVTT\n", 0)).toEqual([]); + expect(parseWebVTT("WEBVTT\n", 1, 0)).toEqual([]); expect(spyGetFirstLineAfterHeader).toHaveBeenCalledTimes(1); expect(spyGetStyleBlock).toHaveBeenCalledTimes(1); expect(spyGetCueBlock).toHaveBeenCalledTimes(1); diff --git a/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts b/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts index f48fb82bb6..b6bd6c457c 100644 --- a/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts +++ b/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts @@ -14,6 +14,16 @@ * limitations under the License. 
*/ +import log from "../../../../log"; +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { be4toi } from "../../../../utils/byte_parsing"; +import { strToUtf8, utf8ToStr } from "../../../../utils/string_parsing"; +import { + getBoxContent, + getMDAT, + getTrackFragmentDecodeTime, +} from "../../../containers/isobmff"; +import { getTrunSamples } from "../../../containers/isobmff/utils"; import getCueBlocks from "../get_cue_blocks"; import getStyleBlocks from "../get_style_blocks"; import parseCueBlock from "../parse_cue_block"; @@ -32,13 +42,27 @@ import toHTML from "./to_html"; * Specific style is parsed and applied to class element. * * @throws Error - Throws if the given WebVTT string is invalid. - * @param {string} text - The whole webvtt subtitles to parse + * @param {string | BufferSource} text - The whole webvtt subtitles to parse + * @param {Number} timescale * @param {Number} timeOffset - Offset to add to start and end times, in seconds * @return {Array.} */ -export default function parseWebVTT(text: string, timeOffset: number): IVTTHTMLCue[] { +export default function parseWebVTT( + text: string | BufferSource, + timescale: number, + timeOffset: number, +): IVTTHTMLCue[] { + let textStr: string; + if (typeof text !== "string") { + // Assume UTF-8 + // XXX TODO: + // textStr = utf8ToStr(bufferSourceToUint8(text)); + return parseWebVTTInMp4(text, timescale, timeOffset); + } else { + textStr = text; + } const newLineChar = /\r\n|\n|\r/g; // CRLF|LF|CR - const linified = text.split(newLineChar); + const linified = textStr.split(newLineChar); const cuesArray: IVTTHTMLCue[] = []; if (/^WEBVTT( |\t|\n|\r|$)/.exec(linified[0]) === null) { @@ -61,3 +85,169 @@ export default function parseWebVTT(text: string, timeOffset: number): IVTTHTMLC } return cuesArray; } + +function parseWebVTTInMp4( + segment: BufferSource | string, + timescale: number, + timeOffset: number, +): IVTTHTMLCue[] { + let buffer: Uint8Array; + if (typeof segment === "string") { + buffer = strToUtf8(segment); + } else { + buffer = bufferSourceToUint8(segment); + } + if (buffer.length === 0) { + return []; + } + + const cuesArray = []; + const trackDecodeTime = getTrackFragmentDecodeTime(buffer); + if (trackDecodeTime === undefined) { + return []; + } + const trunSamples = getTrunSamples(buffer); + const mdat = getMDAT(buffer); + if (mdat === null) { + return []; + } + let mdatOffset = 0; + let lastTime = trackDecodeTime; + /** @type {!shaka.util.DataViewReader} */ + // const reader = new shaka.util.DataViewReader( + // rawPayload, shaka.util.DataViewReader.Endianness.BIG_ENDIAN); + + for (const sample of trunSamples) { + const duration = sample.duration ?? 0; + const startTime = + sample.compositionTimeOffset !== undefined + ? lastTime + sample.compositionTimeOffset + : lastTime; + lastTime = startTime + duration; + + // Read samples until it adds up to the given size. + let totalSize = 0; + // No sample size == a single sample + while (totalSize < (sample.size ?? 0)) { + // Read the payload size. 
+ const payloadSize = be4toi(mdat, mdatOffset); + mdatOffset += 4; + totalSize += payloadSize; + + const currentBoxName = utf8ToStr(mdat.slice(mdatOffset, mdatOffset + 4)); + mdatOffset += 4; + + let currentBoxData: Uint8Array | null = null; + if (currentBoxName === "vttc") { + if (payloadSize > 8) { + currentBoxData = mdat.slice(mdatOffset, mdatOffset + (payloadSize - 8)); + mdatOffset += payloadSize - 8; + } + } else if (currentBoxName === "vtte") { + if (payloadSize > 8) { + mdatOffset += payloadSize - 8; + } + } else { + log.error("webvtt: encountered unknown fragmented vtt box: ", currentBoxName); + mdatOffset += Math.max(payloadSize - 8, 1); + } + + if (duration > 0) { + if (currentBoxData !== null) { + const cue = parseVttC( + currentBoxData, + timeOffset + startTime / timescale, + timeOffset + lastTime / timescale, + ); + if (cue !== null) { + cuesArray.push(cue); + } + } + } else { + log.error("webvtt: cue duration missing"); + } + // + // goog.asserts.assert( + // !sample.sampleSize || totalSize <= sample.sampleSize, + // 'The samples do not fit evenly into the sample sizes given in ' + + // 'the TRUN box!'); + // + // If no sampleSize was specified, it's assumed that this sample + // corresponds to only a single cue. + } + } + + // goog.asserts.assert( + // !reader.hasMoreData(), + // "MDAT which contain VTT cues and non-VTT data are not currently " + "supported!", + // ); + + return cuesArray; +} + +function parseVttC( + data: Uint8Array, + startTime: number, + endTime: number, +): IVTTHTMLCue | null { + const payload = getPayl(data); + // const iden = getIden(data); + // const settings = getSttg(data); + if (payload === null) { + return null; + } + const cueHtml = toHTML( + { + start: startTime, + end: endTime, + settings: {}, + header: undefined, + payload: [utf8ToStr(payload)], + }, + { + classes: {}, + global: undefined, + }, + ); + + // XXX TODO: + // if (settings) { + // const parser = new shaka.util.TextParser(settings); + // + // let word = parser.readWord(); + // + // while (word) { + // // TODO: Check WebVTTConfigurationBox for region info. + // if ( + // !shaka.text.VttTextParser.parseCueSetting(cue, word, /* VTTRegions= */ []) + // ) { + // shaka.log.warning( + // "VTT parser encountered an invalid VTT setting: ", + // word, + // " The setting will be ignored.", + // ); + // } + // + // parser.skipWhitespace(); + // word = parser.readWord(); + // } + // } + + return cueHtml; +} +/** + * Returns the content of the first "payl" box encountered in the given ISOBMFF + * data. + * Returns null if not found. 
+ * @param {Uint8Array} buffer + * @returns {Uint8Array|null} + */ +function getPayl(buf: Uint8Array): Uint8Array | null { + return getBoxContent(buf, 0x7061796c /* "payl" */); +} +// function getIden(buf: Uint8Array): Uint8Array | null { +// return getBoxContent(buf, 0x6964656e /* "iden" */); +// } +// function getSttg(buf: Uint8Array): Uint8Array | null { +// return getBoxContent(buf, 0x73747467 /* "sttg" */); +// } diff --git a/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts b/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts index d531fedc58..e6af28476c 100644 --- a/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts +++ b/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts @@ -21,6 +21,8 @@ import type { ICompatVTTCue } from "../../../../compat/browser_compatibility_types"; import isVTTCue from "../../../../compat/is_vtt_cue"; +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../../utils/string_parsing"; import getCueBlocks from "../get_cue_blocks"; import parseCueBlock from "../parse_cue_block"; import { getFirstLineAfterHeader } from "../utils"; @@ -34,14 +36,24 @@ import toNativeCue from "./to_native_cue"; /** * Parse whole WEBVTT file into an array of cues, to be inserted in a video's * TrackElement. - * @param {string} vttStr + * @param {string|BufferSource} input + * @param {Number} _timescale * @param {Number} timeOffset * @returns {Array.} */ export default function parseVTTStringToVTTCues( - vttStr: string, + input: string | BufferSource, + _timescale: number, timeOffset: number, ): Array { + let vttStr: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? + vttStr = utf8ToStr(bufferSourceToUint8(input)); + } else { + vttStr = input; + } // WEBVTT authorize CRLF, LF or CR as line terminators const lines = vttStr.split(/\r\n|\n|\r/); diff --git a/src/tools/TextTrackRenderer/text_track_renderer.ts b/src/tools/TextTrackRenderer/text_track_renderer.ts index 4376a5f442..119585f366 100644 --- a/src/tools/TextTrackRenderer/text_track_renderer.ts +++ b/src/tools/TextTrackRenderer/text_track_renderer.ts @@ -24,6 +24,11 @@ export interface ISetTextTrackArguments { data: string; /** The format the text track is in (e.g. "ttml" or "vtt") */ type: string; + /** + * Optional timescale data context that is used to convert timing information + * into seconds. + */ + timescale: number | null; /** Offset, in seconds, that will be added to each subtitle's start and end time. */ timeOffset?: number; /** @@ -81,6 +86,7 @@ export default class TextTrackRenderer { chunk: { start: 0, end: Number.MAX_VALUE, + timescale: args.timescale, data: args.data, language: args.language, type: args.type, diff --git a/src/transports/dash/text_parser.ts b/src/transports/dash/text_parser.ts index ce08275bb1..b849989d28 100644 --- a/src/transports/dash/text_parser.ts +++ b/src/transports/dash/text_parser.ts @@ -108,6 +108,7 @@ function parseISOBMFFEmbeddedTextTrack( const chunkData = getISOBMFFEmbeddedTextTrackData( context, chunkBytes, + initTimescale, chunkInfos, isChunked, ); @@ -127,6 +128,7 @@ function parseISOBMFFEmbeddedTextTrack( * Parse TextTrack data when it is in plain text form. * * @param {ArrayBuffer|Uint8Array|string} data - The segment data. + * @param {number|undefined} initTimescale * @param {boolean} isChunked - If `true`, the `data` may contain only a * decodable subpart of the full data in the linked segment. 
* @param {Object} context - Object describing the context of the given @@ -136,6 +138,7 @@ function parseISOBMFFEmbeddedTextTrack( */ function parsePlainTextTrack( data: ArrayBuffer | Uint8Array | string, + initTimescale: number | undefined, isChunked: boolean, context: ISegmentContext, ): @@ -162,7 +165,12 @@ function parsePlainTextTrack( } else { textTrackData = data; } - const chunkData = getPlainTextTrackData(context, textTrackData, isChunked); + const chunkData = getPlainTextTrackData( + context, + textTrackData, + initTimescale, + isChunked, + ); return { segmentType: "media", chunkData, @@ -244,7 +252,7 @@ export default function generateTextTrackParser({ __priv_patchLastSegmentInSidx, ); } else { - return parsePlainTextTrack(data, isChunked, context); + return parsePlainTextTrack(data, initTimescale, isChunked, context); } }; } diff --git a/src/transports/local/text_parser.ts b/src/transports/local/text_parser.ts index 2949a9c6bc..f8b7dd6247 100644 --- a/src/transports/local/text_parser.ts +++ b/src/transports/local/text_parser.ts @@ -78,6 +78,7 @@ function parseISOBMFFEmbeddedTextTrack( const chunkData = getISOBMFFEmbeddedTextTrackData( context, chunkBytes, + initTimescale, chunkInfos, isChunked, ); @@ -96,6 +97,7 @@ function parseISOBMFFEmbeddedTextTrack( /** * Parse TextTrack data when it is in plain text form. * @param {ArrayBuffer|Uint8Array|string} data - The segment data. + * @param {number|undefined} initTimescale * @param {boolean} isChunked - If `true`, the `data` may contain only a * decodable subpart of the full data in the linked segment. * @param {Object} context - Object describing the context of the given @@ -105,6 +107,7 @@ function parseISOBMFFEmbeddedTextTrack( */ function parsePlainTextTrack( data: string | Uint8Array | ArrayBuffer, + initTimescale: number | undefined, isChunked: boolean, context: ISegmentContext, ): @@ -130,7 +133,12 @@ function parsePlainTextTrack( } else { textTrackData = data; } - const chunkData = getPlainTextTrackData(context, textTrackData, isChunked); + const chunkData = getPlainTextTrackData( + context, + textTrackData, + initTimescale, + isChunked, + ); const chunkOffset = segment.timestampOffset ?? 0; return { segmentType: "media", @@ -192,6 +200,6 @@ export default function textTrackParser( } else if (containerType === "mp4") { return parseISOBMFFEmbeddedTextTrack(data, isChunked, context, initTimescale); } else { - return parsePlainTextTrack(data, isChunked, context); + return parsePlainTextTrack(data, initTimescale, isChunked, context); } } diff --git a/src/transports/smooth/pipelines.ts b/src/transports/smooth/pipelines.ts index 3378391d5e..2948c1639f 100644 --- a/src/transports/smooth/pipelines.ts +++ b/src/transports/smooth/pipelines.ts @@ -411,6 +411,7 @@ export default function (transportOptions: ITransportOptions): ITransportPipelin data: _sdData, start: segmentStart, end: segmentEnd, + timescale: initTimescale ?? null, language, }, chunkSize, diff --git a/src/transports/types.ts b/src/transports/types.ts index 2b7a137ca3..fbac565b4a 100644 --- a/src/transports/types.ts +++ b/src/transports/types.ts @@ -419,9 +419,11 @@ export interface IChunkTimeInfo { } /** Text track segment data, once parsed. */ -export interface ITextTrackSegmentData { +export interface ITextTrackSegmentData< + T extends string | BufferSource = string | BufferSource, +> { /** The text track data, in the format indicated in `type`. 
*/ - data: string; + data: T; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** @@ -430,6 +432,11 @@ export interface ITextTrackSegmentData { * be parsed. */ language?: string | undefined; + /** + * Optional timescale data context that is used to convert timing information + * found inside the segment into seconds. + */ + timescale: number | null; /** start time from which the segment apply, in seconds. */ start?: number | undefined; /** end time until which the segment apply, in seconds. */ @@ -469,7 +476,10 @@ export interface ITransportAudioVideoSegmentPipeline { export interface ITransportTextSegmentPipeline { loadSegment: ISegmentLoader; - parseSegment: ISegmentParser; + parseSegment: ISegmentParser< + ILoadedTextSegmentFormat, + ITextTrackSegmentData | null + >; } export type ITransportSegmentPipeline = diff --git a/src/transports/utils/parse_text_track.ts b/src/transports/utils/parse_text_track.ts index 96e0db180b..83902e177d 100644 --- a/src/transports/utils/parse_text_track.ts +++ b/src/transports/utils/parse_text_track.ts @@ -83,7 +83,8 @@ export function getPlainTextTrackFormat( /** * @param {Object} content - * @param {ArrayBuffer|UInt8Array|null} chunkData + * @param {ArrayBuffer|UInt8Array|null} chunkBytes + * @param {number|undefined} initTimescale * @param {Object|null} chunkInfos * @param {boolean} isChunked * @returns {Object|null} @@ -99,6 +100,7 @@ export function getISOBMFFEmbeddedTextTrackData( language?: string | undefined; }, chunkBytes: Uint8Array, + initTimescale: number | undefined, chunkInfos: IChunkTimeInfo | null, isChunked: boolean, ): ITextTrackSegmentData | null { @@ -124,20 +126,34 @@ export function getISOBMFFEmbeddedTextTrackData( } const type = getISOBMFFTextTrackFormat(codecs); - const textData = extractTextTrackFromISOBMFF(chunkBytes); - return { data: textData, type, language, start: startTime, end: endTime }; + let textData: string | BufferSource; + if (codecs === "wvtt") { + // XXX TODO: check if WEBVTT header first? + textData = chunkBytes; + } else { + textData = extractTextTrackFromISOBMFF(chunkBytes); + } + return { + data: textData, + type, + language, + start: startTime, + end: endTime, + timescale: initTimescale ?? null, + }; } /** - * @param {Object} content - * @param {ArrayBuffer|UInt8Array|null} chunkData - * @param {Object|null} chunkInfos + * @param {Object} context + * @param {ArrayBuffer|UInt8Array|null} textTrackData + * @param {number|undefined} initTimescale * @param {boolean} isChunked * @returns {Object|null} */ export function getPlainTextTrackData( context: ISegmentContext, textTrackData: string, + initTimescale: number | undefined, isChunked: boolean, ): ITextTrackSegmentData | null { const { segment } = context; @@ -157,5 +173,12 @@ export function getPlainTextTrackData( } const type = getPlainTextTrackFormat(context.codecs, context.mimeType); - return { data: textTrackData, type, language: context.language, start, end }; + return { + data: textTrackData, + type, + language: context.language, + start, + end, + timescale: initTimescale ?? null, + }; } diff --git a/src/utils/buffer_source_to_uint8.ts b/src/utils/buffer_source_to_uint8.ts new file mode 100644 index 0000000000..b03b8adec8 --- /dev/null +++ b/src/utils/buffer_source_to_uint8.ts @@ -0,0 +1,14 @@ +/** + * Convert a vague "BufferSource" binary data into a more exploitable and known + * `Uint8Array`. 
+ * @param {BufferSource} bs + * @returns {Uint8Array} + */ +export default function bufferSourceToUint8(bs: BufferSource): Uint8Array { + if (bs instanceof Uint8Array) { + return bs; + } else if (bs instanceof ArrayBuffer) { + return new Uint8Array(bs); + } + return new Uint8Array(bs.buffer, bs.byteOffset, bs.byteLength); +} diff --git a/src/utils/byte_parsing.ts index 0d3bff9a6e..2f2ce9465f 100644 --- a/src/utils/byte_parsing.ts +++ b/src/utils/byte_parsing.ts @@ -82,6 +82,25 @@ function be4toi(bytes: Uint8Array, offset: number): number { ); } +/** + * Translate groups of 4 big-endian bytes representing a two's complement signed + * integer directly to that value. + * @param {Uint8Array} bytes + * @param {Number} offset - The offset (from the start of the given array) + * @returns {Number} + */ +function be4toiSigned(bytes: Uint8Array, offset: number): number { + // Re-interpret the unsigned big-endian value as a two's complement signed integer + const val = be4toi(bytes, offset); + return val >= 0x80000000 ? val - 0x100000000 : val; +} + /** * Translate groups of 8 big-endian bytes to Integer. * @param {Uint8Array} bytes @@ -260,6 +279,7 @@ export { be2toi, be3toi, be4toi, + be4toiSigned, be8toi, le2toi, le4toi,