Implement Fragmented WebVTT vtte/vttc

As #1638 reported, we do not support some (or all?) ways of communicating WebVTT subtitles in mp4 files. I initially assumed that WebVTT embedded in an mp4 file worked the same way as TTML (a format we're more used to): the plain subtitle format directly inserted in an `mdat` box. It turns out that some content (like https://demo.unified-streaming.com/k8s/features/stable/video/tears-of-steel/tears-of-steel-wvtt.ism/.mpd) actually relies on the metadata from other ISOBMFF boxes (mainly the tfdt and trun boxes) to provide timing information, with the `mdat` reserved for text, identifiers and style information in a new binary format following the ISOBMFF way of doing things (new WebVTT-specific boxes). Weirdly enough, that format does not look at all like the WebVTT we're used to, besides the fact that it uses the same identifier and "settings" concepts. --- As our subtitle parser now has to have the context of the whole mp4 file (and not of the mp4 segment), and as that parser also has to rely on state (a `timescale` value) provided by an initialization segment, I had to update the architecture of how subtitles are communicated: they can now be communicated as either string or `BufferSource` types (the latter leaving text encoding detection to the parser), and a supplementary `timescale` argument (defaulting to `1`) is now always provided to parsers. The vast majority of parsers do not make use of that `timescale` value, which is kind of ugly; we may want to find a better solution.
canalplus · Jan 28, 2025 · 0abb771 · 0abb771
1 parent 169e6d6
commit 0abb771
Show file tree

Hide file tree

Showing 25 changed files with 585 additions and 64 deletions.
diff --git a/src/core/segment_sinks/implementations/text/text_segment_sink.ts b/src/core/segment_sinks/implementations/text/text_segment_sink.ts
@@ -1,6 +1,7 @@
 import log from "../../../../log";
 import type { ITextDisplayer } from "../../../../main_thread/types";
 import type { ITextTrackSegmentData } from "../../../../transports";
+import isNullOrUndefined from "../../../../utils/is_null_or_undefined";
 import getMonotonicTimeStamp from "../../../../utils/monotonic_timestamp";
 import type { IRange } from "../../../../utils/ranges";
 import type { ICompleteSegmentInfo, IPushChunkInfos, ISBOperation } from "../types";
@@ -142,9 +143,9 @@ export default class TextSegmentSink extends SegmentSink {
 }
 
 /** Data of chunks that should be pushed to the HTMLTextSegmentSink. */
-export interface ITextTracksBufferSegmentData {
+export interface ITextTracksBufferSegmentData<T extends string | BufferSource> {
   /** The text track data, in the format indicated in `type`. */
-  data: string;
+  data: T;
   /** The format of `data` (examples: "ttml", "srt" or "vtt") */
   type: string;
   /**
@@ -153,6 +154,11 @@ export interface ITextTracksBufferSegmentData {
    * be parsed.
    */
   language?: string | undefined;
+  /**
+   * Timescale used to convert timing information into seconds, or `null` when
+   * none is provided (text displayers then fall back to `1`).
+   */
+  timescale: number | null;
   /** start time from which the segment apply, in seconds. */
   start?: number | undefined;
   /** end time until which the segment apply, in seconds. */
@@ -167,7 +173,7 @@ export interface ITextTracksBufferSegmentData {
  */
 function assertChunkIsTextTrackSegmentData(
   chunk: unknown,
-): asserts chunk is ITextTracksBufferSegmentData {
+): asserts chunk is ITextTracksBufferSegmentData<string | BufferSource> {
   if (
     (__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.PRODUCTION as number)
   ) {
@@ -176,14 +182,30 @@ function assertChunkIsTextTrackSegmentData(
   if (
     typeof chunk !== "object" ||
     chunk === null ||
-    typeof (chunk as ITextTracksBufferSegmentData).data !== "string" ||
-    typeof (chunk as ITextTracksBufferSegmentData).type !== "string" ||
-    ((chunk as ITextTracksBufferSegmentData).language !== undefined &&
-      typeof (chunk as ITextTracksBufferSegmentData).language !== "string") ||
-    ((chunk as ITextTracksBufferSegmentData).start !== undefined &&
-      typeof (chunk as ITextTracksBufferSegmentData).start !== "number") ||
-    ((chunk as ITextTracksBufferSegmentData).end !== undefined &&
-      typeof (chunk as ITextTracksBufferSegmentData).end !== "number")
+    isNullOrUndefined((chunk as ITextTracksBufferSegmentData<string | BufferSource>).data)
+  ) {
+    throw new Error("Invalid format given to a TextSegmentSink");
+  }
+  if (
+    typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).type !==
+      "string" ||
+    ((chunk as ITextTracksBufferSegmentData<string | BufferSource>).language !==
+      undefined &&
+      typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).language !==
+        "string") ||
+    ((chunk as ITextTracksBufferSegmentData<string | BufferSource>).start !== undefined &&
+      typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).start !==
+        "number") ||
+    ((chunk as ITextTracksBufferSegmentData<string | BufferSource>).end !== undefined &&
+      typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).end !==
+        "number")
+  ) {
+    throw new Error("Invalid format given to a TextSegmentSink");
+  }
+  if (
+    typeof (chunk as ITextTracksBufferSegmentData<string>).data !== "string" &&
+    typeof (chunk as ITextTracksBufferSegmentData<BufferSource>).data.byteLength !==
+      "number"
   ) {
     throw new Error("Invalid format given to a TextSegmentSink");
   }
@@ -229,8 +251,10 @@ export interface ITextDisplayerInterface {
  */
 if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) {
   // @ts-expect-error: unused function for type checking
-  function _checkType(input: ITextTrackSegmentData): void {
-    function checkEqual(_arg: ITextTracksBufferSegmentData): void {
+  function _checkType<T extends string | BufferSource>(
+    input: ITextTrackSegmentData<T>,
+  ): void {
+    function checkEqual(_arg: ITextTracksBufferSegmentData<T>): void {
       /* nothing */
     }
     checkEqual(input);

diff --git a/src/main_thread/text_displayer/html/html_parsers.ts b/src/main_thread/text_displayer/html/html_parsers.ts
@@ -11,14 +11,17 @@ export interface IHTMLCue {
  * Convert text track data into timed HTML Cues.
  * @param {string} type - Text track format wanted
  * @param {string} data - Text track data
+ * @param {Number} timescale - Potential external timescale to convert timing
+ * information into seconds.
  * @param {Number} timestampOffset - offset to apply to every timed text
  * @param {string} [language] - language of the text tracks
  * @returns {Array.<Object>}
  * @throws Error - Throw if no parser is found for the given type
  */
 export default function parseTextTrackToElements(
   type: string,
-  data: string,
+  data: string | BufferSource,
+  timescale: number,
   timestampOffset: number,
   language?: string,
 ): IHTMLCue[] {
@@ -29,7 +32,7 @@ export default function parseTextTrackToElements(
     throw new Error("no parser found for the given text track");
   }
   log.debug("HTSB: Parser found, parsing...");
-  const parsed = parser(data, timestampOffset, language);
+  const parsed = parser(data, timescale, timestampOffset, language);
   log.debug("HTTB: Parsed successfully!", parsed.length);
   return parsed;
 }
diff --git a/src/main_thread/text_displayer/html/html_text_displayer.ts b/src/main_thread/text_displayer/html/html_text_displayer.ts
@@ -126,12 +126,25 @@ export default class HTMLTextDisplayer implements ITextDisplayer {
       return convertToRanges(this._buffered);
     }
 
-    const { start: startTime, end: endTime, data: dataString, type, language } = chunk;
+    const {
+      start: startTime,
+      end: endTime,
+      data: dataRaw,
+      type,
+      language,
+      timescale,
+    } = chunk;
 
     const appendWindowStart = appendWindow[0] ?? 0;
     const appendWindowEnd = appendWindow[1] ?? Infinity;
 
-    const cues = parseTextTrackToElements(type, dataString, timestampOffset, language);
+    const cues = parseTextTrackToElements(
+      type,
+      dataRaw,
+      timescale ?? 1,
+      timestampOffset,
+      language,
+    );
 
     if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) {
       // Removing before window start
@@ -398,7 +411,7 @@ export default class HTMLTextDisplayer implements ITextDisplayer {
 /** Data of chunks that should be pushed to the `HTMLTextDisplayer`. */
 export interface ITextTracksBufferSegmentData {
   /** The text track data, in the format indicated in `type`. */
-  data: string;
+  data: string | BufferSource;
   /** The format of `data` (examples: "ttml", "srt" or "vtt") */
   type: string;
   /**
@@ -424,7 +437,9 @@ export interface ITextTracksBufferSegmentData {
  */
 if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) {
   // @ts-expect-error: uncalled function just for type-checking
-  function _checkType(input: ITextTrackSegmentData): void {
+  function _checkType<T extends string | BufferSource>(
+    input: ITextTrackSegmentData<T>,
+  ): void {
     function checkEqual(_arg: ITextTracksBufferSegmentData): void {
       /* nothing */
     }

diff --git a/src/main_thread/text_displayer/native/native_parsers.ts b/src/main_thread/text_displayer/native/native_parsers.ts
@@ -5,15 +5,18 @@ import log from "../../../log";
 /**
  * Convert text track data into timed VTT Cues.
  * @param {string} type - Text track format wanted
- * @param {string} data - Text track data
+ * @param {string|BufferSource} data - Text track data
+ * @param {Number} timescale - Potential external timescale to convert timing
+ * information into seconds.
  * @param {Number} timestampOffset - offset to apply to every timed text
  * @param {string} [language] - language of the text tracks
  * @returns {Array.<VTTCue>}
  * @throws Error - Throw if no parser is found for the given type
  */
 export default function parseTextTrackToCues(
   type: string,
-  data: string,
+  data: string | BufferSource,
+  timescale: number,
   timestampOffset: number,
   language?: string,
 ): Array<ICompatVTTCue | TextTrackCue> {
@@ -25,7 +28,7 @@ export default function parseTextTrackToCues(
   }
 
   log.debug("NTSB: Parser found, parsing...");
-  const parsed = parser(data, timestampOffset, language);
+  const parsed = parser(data, timescale, timestampOffset, language);
   log.debug("NTSB: Parsed successfully!", parsed.length);
   return parsed;
 }
diff --git a/src/main_thread/text_displayer/native/native_text_displayer.ts b/src/main_thread/text_displayer/native/native_text_displayer.ts
@@ -48,10 +48,23 @@ export default class NativeTextDisplayer implements ITextDisplayer {
       return convertToRanges(this._buffered);
     }
     const { timestampOffset, appendWindow, chunk } = infos;
-    const { start: startTime, end: endTime, data: dataString, type, language } = chunk;
+    const {
+      start: startTime,
+      end: endTime,
+      data: dataString,
+      type,
+      language,
+      timescale,
+    } = chunk;
     const appendWindowStart = appendWindow[0] ?? 0;
     const appendWindowEnd = appendWindow[1] ?? Infinity;
-    const cues = parseTextTrackToCues(type, dataString, timestampOffset, language);
+    const cues = parseTextTrackToCues(
+      type,
+      dataString,
+      timescale ?? 1,
+      timestampOffset,
+      language,
+    );
 
     if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) {
       // Removing before window start
@@ -222,7 +235,7 @@ export default class NativeTextDisplayer implements ITextDisplayer {
 /** Data of chunks that should be pushed to the NativeTextDisplayer. */
 export interface INativeTextTracksBufferSegmentData {
   /** The text track data, in the format indicated in `type`. */
-  data: string;
+  data: string | BufferSource;
   /** The format of `data` (examples: "ttml", "srt" or "vtt") */
   type: string;
   /**

diff --git a/src/parsers/containers/isobmff/utils.ts b/src/parsers/containers/isobmff/utils.ts
@@ -20,6 +20,7 @@ import {
   be2toi,
   be3toi,
   be4toi,
+  be4toiSigned,
   be8toi,
   concat,
   itobe4,
@@ -233,6 +234,92 @@ function getDefaultDurationFromTFHDInTRAF(traf: Uint8Array): number | undefined
   return defaultDuration;
 }
 
+interface ITrunSampleInfo { // Per-sample fields collected by `getTrunSamples`.
+  duration: number; // Read from the trun entry, else the default duration found through the tfhd.
+  compositionTimeOffset: number | undefined; // `undefined` when the trun flags do not announce it.
+  size: number | undefined; // `undefined` when the trun flags do not announce it.
+  flags: number | undefined; // Per-sample flags; `undefined` when the trun flags do not announce them.
+}
+
+function getTrunSamples(buffer: Uint8Array): ITrunSampleInfo[] { // Lists the samples described by the `trun` box of each traf found in `buffer`.
+  const trafs = getTRAFs(buffer);
+  const samples: ITrunSampleInfo[] = []; // Accumulated across all trafs, in parsing order.
+  for (const traf of trafs) {
+    const trun = getBoxContent(traf, 0x7472756e /* trun */);
+    if (trun === null) {
+      continue; // No trun in this traf: nothing to read, move on to the next traf.
+    }
+    let cursor = 0; // Byte offset of the next unread field inside `trun`.
+    const version = trun[cursor];
+    cursor += 1;
+    if (version > 1) {
+      return []; // Unhandled trun version. NOTE(review): this also drops samples from previous trafs - confirm intended.
+    }
+
+    const flags = be3toi(trun, cursor); // 3-byte flags announcing which optional fields are present.
+    cursor += 3;
+    const hasSampleDuration = (flags & 0x000100) > 0;
+
+    let defaultDuration: number | undefined = 0; // Only meaningful when samples carry no duration of their own.
+    if (!hasSampleDuration) {
+      defaultDuration = getDefaultDurationFromTFHDInTRAF(traf);
+      if (defaultDuration === undefined) {
+        return []; // No duration available from either box: give up (previous trafs' samples are dropped too).
+      }
+    }
+
+    const hasDataOffset = (flags & 0x000001) > 0;
+    const hasFirstSampleFlags = (flags & 0x000004) > 0;
+    const hasSampleSize = (flags & 0x000200) > 0;
+    const hasSampleFlags = (flags & 0x000400) > 0;
+    const hasSampleCompositionOffset = (flags & 0x000800) > 0;
+
+    const sampleCounts = be4toi(trun, cursor); // NOTE(review): not checked against the bytes remaining in `trun`.
+    cursor += 4;
+
+    if (hasDataOffset) {
+      cursor += 4; // Skipped without being read.
+    }
+    if (hasFirstSampleFlags) {
+      cursor += 4; // Skipped without being read.
+    }
+
+    let i = sampleCounts;
+    while (i-- > 0) { // One iteration per announced sample; fields are read in this exact order.
+      let duration;
+      let size;
+      let sampleFlags;
+      let compositionTimeOffset;
+      if (hasSampleDuration) {
+        duration = be4toi(trun, cursor);
+        cursor += 4;
+      } else {
+        duration = defaultDuration; // Same fallback value for every sample of this trun.
+      }
+      if (hasSampleSize) {
+        size = be4toi(trun, cursor);
+        cursor += 4;
+      }
+      if (hasSampleFlags) {
+        sampleFlags = be4toi(trun, cursor);
+        cursor += 4;
+      }
+      if (hasSampleCompositionOffset) {
+        compositionTimeOffset =
+          version === 0 ? be4toi(trun, cursor) : be4toiSigned(trun, cursor); // Version 1 goes through the signed reader.
+        cursor += 4;
+      }
+      samples.push({
+        duration,
+        compositionTimeOffset,
+        size,
+        flags: sampleFlags,
+      });
+    }
+  }
+  return samples;
+}
+
 /**
  * Calculate segment duration approximation by additioning the duration from
  * every samples in a trun ISOBMFF box.
@@ -563,6 +650,7 @@ function getKeyIdFromInitSegment(segment: Uint8Array): Uint8Array | null {
   return keyId.every((b) => b === 0) ? null : keyId;
 }
 
+export type { ITrunSampleInfo };
 export {
   getKeyIdFromInitSegment,
   getMDHDTimescale,
@@ -573,4 +661,5 @@ export {
   patchPssh,
   updateBoxLength,
   parseEmsgBoxes,
+  getTrunSamples,
 };
diff --git a/src/parsers/texttracks/sami/html.ts b/src/parsers/texttracks/sami/html.ts
@@ -29,8 +29,10 @@
  * It always should be imported through the `features` object.
  */
 
+import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8";
 import isNonEmptyString from "../../../utils/is_non_empty_string";
 import isNullOrUndefined from "../../../utils/is_null_or_undefined";
+import { utf8ToStr } from "../../../utils/string_parsing";
 import type { IHTMLCue } from "../types";
 
 const HTML_ENTITIES = /&#([0-9]+);/g;
@@ -99,11 +101,25 @@ function decodeEntities(text: string): string {
  * The specification being quite clunky, this parser
  * may not work for every sami input.
  *
- * @param {string} smi
+ * @param {string|BufferSource} input
+ * @param {Number} _timescale
  * @param {Number} timeOffset
  * @param {string} lang
  */
-function parseSami(smi: string, timeOffset: number, lang?: string): IHTMLCue[] {
+function parseSami(
+  input: string | BufferSource,
+  _timescale: number,
+  timeOffset: number,
+  lang?: string,
+): IHTMLCue[] {
+  let smi: string;
+  if (typeof input !== "string") {
+    // Assume UTF-8
+    // TODO: detection?
+    smi = utf8ToStr(bufferSourceToUint8(input));
+  } else {
+    smi = input;
+  }
   const syncOpen = /<sync[ >]/gi;
   const syncClose = /<sync[ >]|<\/body>/gi;