Skip to content

Commit

Permalink
Implement Fragmented WebVTT vtte/vttc
Browse files Browse the repository at this point in the history
As #1638 reported, we do not support some (or all?) ways of
communicating webvtt subtitles in mp4 files.

I initially assumed that webvtt embedded in an mp4 file worked like for
TTML (a format we're more used to): the plain subtitle format directly
inserted in an `mdat` box.

Turns out, some contents (like https://demo.unified-streaming.com/k8s/features/stable/video/tears-of-steel/tears-of-steel-wvtt.ism/.mpd),
actually rely on the metadata from other ISOBMFF boxes (mainly the
tfdt and trun boxes) to provide timing information with the `mdat`
reserved for text, identifiers and style information in a new binary
format following the ISOBMFF way of doing things (new webvtt-specific
boxes).

Weirdly enough, that format does not look at all like the WEBVTT we're
used to, apart from the fact that it uses the same identifier and
"settings" concepts.

---

As our subtitle parser now has to have the context of the whole mp4 file
(and not of the mp4 segment), and as that parser also has to rely on
state (a `timescale` value) provided by an initialization segment, I had
to update the architecture of how subtitles are communicated: they can
now be communicated as string or `BufferSource` types (the latter
leaving the text encoding detection to the parser), and a supplementary
`timescale` argument (defaulting to `1`) is now always provided to parsers.

The vast majority of parsers do not make use of that `timescale` value,
which is kind of ugly; we may want to find a better solution.
  • Loading branch information
peaBerberian committed Jan 28, 2025
1 parent 169e6d6 commit 0abb771
Show file tree
Hide file tree
Showing 25 changed files with 585 additions and 64 deletions.
50 changes: 37 additions & 13 deletions src/core/segment_sinks/implementations/text/text_segment_sink.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import log from "../../../../log";
import type { ITextDisplayer } from "../../../../main_thread/types";
import type { ITextTrackSegmentData } from "../../../../transports";
import isNullOrUndefined from "../../../../utils/is_null_or_undefined";
import getMonotonicTimeStamp from "../../../../utils/monotonic_timestamp";
import type { IRange } from "../../../../utils/ranges";
import type { ICompleteSegmentInfo, IPushChunkInfos, ISBOperation } from "../types";
Expand Down Expand Up @@ -142,9 +143,9 @@ export default class TextSegmentSink extends SegmentSink {
}

/** Data of chunks that should be pushed to the HTMLTextSegmentSink. */
export interface ITextTracksBufferSegmentData {
export interface ITextTracksBufferSegmentData<T extends string | BufferSource> {
/** The text track data, in the format indicated in `type`. */
data: string;
data: T;
/** The format of `data` (examples: "ttml", "srt" or "vtt") */
type: string;
/**
Expand All @@ -153,6 +154,11 @@ export interface ITextTracksBufferSegmentData {
* be parsed.
*/
language?: string | undefined;
/**
* Optional timescale data context that is used to convert timing information
* into seconds.
*/
timescale: number | null;
/** start time from which the segment apply, in seconds. */
start?: number | undefined;
/** end time until which the segment apply, in seconds. */
Expand All @@ -167,7 +173,7 @@ export interface ITextTracksBufferSegmentData {
*/
function assertChunkIsTextTrackSegmentData(
chunk: unknown,
): asserts chunk is ITextTracksBufferSegmentData {
): asserts chunk is ITextTracksBufferSegmentData<string | BufferSource> {
if (
(__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.PRODUCTION as number)
) {
Expand All @@ -176,14 +182,30 @@ function assertChunkIsTextTrackSegmentData(
if (
typeof chunk !== "object" ||
chunk === null ||
typeof (chunk as ITextTracksBufferSegmentData).data !== "string" ||
typeof (chunk as ITextTracksBufferSegmentData).type !== "string" ||
((chunk as ITextTracksBufferSegmentData).language !== undefined &&
typeof (chunk as ITextTracksBufferSegmentData).language !== "string") ||
((chunk as ITextTracksBufferSegmentData).start !== undefined &&
typeof (chunk as ITextTracksBufferSegmentData).start !== "number") ||
((chunk as ITextTracksBufferSegmentData).end !== undefined &&
typeof (chunk as ITextTracksBufferSegmentData).end !== "number")
isNullOrUndefined((chunk as ITextTracksBufferSegmentData<string | BufferSource>).data)
) {
throw new Error("Invalid format given to a TextSegmentSink");
}
if (
typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).type !==
"string" ||
((chunk as ITextTracksBufferSegmentData<string | BufferSource>).language !==
undefined &&
typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).language !==
"string") ||
((chunk as ITextTracksBufferSegmentData<string | BufferSource>).start !== undefined &&
typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).start !==
"number") ||
((chunk as ITextTracksBufferSegmentData<string | BufferSource>).end !== undefined &&
typeof (chunk as ITextTracksBufferSegmentData<string | BufferSource>).end !==
"number")
) {
throw new Error("Invalid format given to a TextSegmentSink");
}
if (
typeof (chunk as ITextTracksBufferSegmentData<string>).data !== "string" &&
typeof (chunk as ITextTracksBufferSegmentData<BufferSource>).data.byteLength !==
"number"
) {
throw new Error("Invalid format given to a TextSegmentSink");
}
Expand Down Expand Up @@ -229,8 +251,10 @@ export interface ITextDisplayerInterface {
*/
if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) {
// @ts-expect-error: unused function for type checking
function _checkType(input: ITextTrackSegmentData): void {
function checkEqual(_arg: ITextTracksBufferSegmentData): void {
function _checkType<T extends string | BufferSource>(
input: ITextTrackSegmentData<T>,
): void {
function checkEqual(_arg: ITextTracksBufferSegmentData<T>): void {
/* nothing */
}
checkEqual(input);
Expand Down
7 changes: 5 additions & 2 deletions src/main_thread/text_displayer/html/html_parsers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,17 @@ export interface IHTMLCue {
* Convert text track data into timed HTML Cues.
* @param {string} type - Text track format wanted
* @param {string|BufferSource} data - Text track data
* @param {Number} timescale - Potential external timescale to convert timing
* information into seconds.
* @param {Number} timestampOffset - offset to apply to every timed text
* @param {string} [language] - language of the text tracks
* @returns {Array.<Object>}
* @throws Error - Throw if no parser is found for the given type
*/
export default function parseTextTrackToElements(
type: string,
data: string,
data: string | BufferSource,
timescale: number,
timestampOffset: number,
language?: string,
): IHTMLCue[] {
Expand All @@ -29,7 +32,7 @@ export default function parseTextTrackToElements(
throw new Error("no parser found for the given text track");
}
log.debug("HTSB: Parser found, parsing...");
const parsed = parser(data, timestampOffset, language);
const parsed = parser(data, timescale, timestampOffset, language);
log.debug("HTTB: Parsed successfully!", parsed.length);
return parsed;
}
23 changes: 19 additions & 4 deletions src/main_thread/text_displayer/html/html_text_displayer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,25 @@ export default class HTMLTextDisplayer implements ITextDisplayer {
return convertToRanges(this._buffered);
}

const { start: startTime, end: endTime, data: dataString, type, language } = chunk;
const {
start: startTime,
end: endTime,
data: dataRaw,
type,
language,
timescale,
} = chunk;

const appendWindowStart = appendWindow[0] ?? 0;
const appendWindowEnd = appendWindow[1] ?? Infinity;

const cues = parseTextTrackToElements(type, dataString, timestampOffset, language);
const cues = parseTextTrackToElements(
type,
dataRaw,
timescale ?? 1,
timestampOffset,
language,
);

if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) {
// Removing before window start
Expand Down Expand Up @@ -398,7 +411,7 @@ export default class HTMLTextDisplayer implements ITextDisplayer {
/** Data of chunks that should be pushed to the `HTMLTextDisplayer`. */
export interface ITextTracksBufferSegmentData {
/** The text track data, in the format indicated in `type`. */
data: string;
data: string | BufferSource;
/** The format of `data` (examples: "ttml", "srt" or "vtt") */
type: string;
/**
Expand All @@ -424,7 +437,9 @@ export interface ITextTracksBufferSegmentData {
*/
if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) {
// @ts-expect-error: uncalled function just for type-checking
function _checkType(input: ITextTrackSegmentData): void {
function _checkType<T extends string | BufferSource>(
input: ITextTrackSegmentData<T>,
): void {
function checkEqual(_arg: ITextTracksBufferSegmentData): void {
/* nothing */
}
Expand Down
9 changes: 6 additions & 3 deletions src/main_thread/text_displayer/native/native_parsers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@ import log from "../../../log";
/**
* Convert text track data into timed VTT Cues.
* @param {string} type - Text track format wanted
* @param {string} data - Text track data
* @param {string|BufferSource} data - Text track data
* @param {Number} timescale - Potential external timescale to convert timing
* information into seconds.
* @param {Number} timestampOffset - offset to apply to every timed text
* @param {string} [language] - language of the text tracks
* @returns {Array.<VTTCue>}
* @throws Error - Throw if no parser is found for the given type
*/
export default function parseTextTrackToCues(
type: string,
data: string,
data: string | BufferSource,
timescale: number,
timestampOffset: number,
language?: string,
): Array<ICompatVTTCue | TextTrackCue> {
Expand All @@ -25,7 +28,7 @@ export default function parseTextTrackToCues(
}

log.debug("NTSB: Parser found, parsing...");
const parsed = parser(data, timestampOffset, language);
const parsed = parser(data, timescale, timestampOffset, language);
log.debug("NTSB: Parsed successfully!", parsed.length);
return parsed;
}
19 changes: 16 additions & 3 deletions src/main_thread/text_displayer/native/native_text_displayer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,23 @@ export default class NativeTextDisplayer implements ITextDisplayer {
return convertToRanges(this._buffered);
}
const { timestampOffset, appendWindow, chunk } = infos;
const { start: startTime, end: endTime, data: dataString, type, language } = chunk;
const {
start: startTime,
end: endTime,
data: dataString,
type,
language,
timescale,
} = chunk;
const appendWindowStart = appendWindow[0] ?? 0;
const appendWindowEnd = appendWindow[1] ?? Infinity;
const cues = parseTextTrackToCues(type, dataString, timestampOffset, language);
const cues = parseTextTrackToCues(
type,
dataString,
timescale ?? 1,
timestampOffset,
language,
);

if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) {
// Removing before window start
Expand Down Expand Up @@ -222,7 +235,7 @@ export default class NativeTextDisplayer implements ITextDisplayer {
/** Data of chunks that should be pushed to the NativeTextDisplayer. */
export interface INativeTextTracksBufferSegmentData {
/** The text track data, in the format indicated in `type`. */
data: string;
data: string | BufferSource;
/** The format of `data` (examples: "ttml", "srt" or "vtt") */
type: string;
/**
Expand Down
89 changes: 89 additions & 0 deletions src/parsers/containers/isobmff/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
be2toi,
be3toi,
be4toi,
be4toiSigned,
be8toi,
concat,
itobe4,
Expand Down Expand Up @@ -233,6 +234,92 @@ function getDefaultDurationFromTFHDInTRAF(traf: Uint8Array): number | undefined
return defaultDuration;
}

/**
 * Information about a single sample, as read from a `trun` ISOBMFF box by
 * `getTrunSamples`.
 */
interface ITrunSampleInfo {
  /**
   * Duration of the sample.
   * Read from the sample's `trun` entry when the box declares per-sample
   * durations, and from the default duration of the `tfhd` box otherwise.
   * NOTE(review): presumably expressed in the track's timescale units, not
   * in seconds - confirm against callers.
   */
  duration: number;
  /**
   * Size declared for the sample.
   * `undefined` when the `trun` box declares no per-sample size.
   */
  size: number | undefined;
  /**
   * Flags declared for the sample.
   * `undefined` when the `trun` box declares no per-sample flags.
   */
  flags: number | undefined;
  /**
   * Composition time offset declared for the sample: read as an unsigned
   * integer for a version 0 `trun` box and as a signed one otherwise.
   * `undefined` when the `trun` box declares no per-sample offset.
   */
  compositionTimeOffset: number | undefined;
}

/**
 * Parse the per-sample information declared in the `trun` box of every `traf`
 * box found in the given ISOBMFF data.
 *
 * `traf` boxes without a `trun` box are skipped.
 * NOTE(review): only the box returned by `getBoxContent` is read for each
 * `traf`; presumably this is the first `trun` it contains - confirm.
 *
 * An empty array is returned (even if samples were already parsed from
 * previous `traf` boxes) when the samples cannot be reliably parsed, that is
 * when a `trun` box:
 *   - is too short to contain its own header,
 *   - has a version higher than `1`,
 *   - declares no per-sample duration while no default duration can be
 *     obtained from the `tfhd` box,
 *   - declares more sample data than what it actually contains.
 *
 * @param {Uint8Array} buffer - ISOBMFF data in which `traf` boxes are searched.
 * @returns {Array.<Object>} - Parsed samples, in the order in which they are
 * declared.
 */
function getTrunSamples(buffer: Uint8Array): ITrunSampleInfo[] {
  const trafs = getTRAFs(buffer);
  const samples: ITrunSampleInfo[] = [];
  for (const traf of trafs) {
    const trun = getBoxContent(traf, 0x7472756e /* trun */);
    if (trun === null) {
      continue;
    }

    // A `trun` always starts with: version (1 byte), flags (3 bytes) and the
    // sample count (4 bytes). Anything shorter cannot be parsed.
    if (trun.length < 8) {
      return [];
    }

    let cursor = 0;
    const version = trun[cursor];
    cursor += 1;
    if (version > 1) {
      return [];
    }

    const flags = be3toi(trun, cursor);
    cursor += 3;
    const hasSampleDuration = (flags & 0x000100) > 0;

    let defaultDuration = 0;
    if (!hasSampleDuration) {
      // No per-sample duration: rely on the default one from the `tfhd` box
      const tfhdDefaultDuration = getDefaultDurationFromTFHDInTRAF(traf);
      if (tfhdDefaultDuration === undefined) {
        return [];
      }
      defaultDuration = tfhdDefaultDuration;
    }

    const hasDataOffset = (flags & 0x000001) > 0;
    const hasFirstSampleFlags = (flags & 0x000004) > 0;
    const hasSampleSize = (flags & 0x000200) > 0;
    const hasSampleFlags = (flags & 0x000400) > 0;
    const hasSampleCompositionOffset = (flags & 0x000800) > 0;

    const sampleCounts = be4toi(trun, cursor);
    cursor += 4;

    // Both of those optional fields are skipped: they are not exposed
    if (hasDataOffset) {
      cursor += 4;
    }
    if (hasFirstSampleFlags) {
      cursor += 4;
    }

    // Each optional per-sample field is stored on 4 bytes.
    // Check beforehand that every declared sample is actually present, so a
    // truncated box or a corrupted sample count never leads to reads past the
    // end of the box nor to an oversized loop.
    const bytesPerSample =
      (hasSampleDuration ? 4 : 0) +
      (hasSampleSize ? 4 : 0) +
      (hasSampleFlags ? 4 : 0) +
      (hasSampleCompositionOffset ? 4 : 0);
    if (cursor + sampleCounts * bytesPerSample > trun.length) {
      return [];
    }

    for (let i = 0; i < sampleCounts; i++) {
      let duration = defaultDuration;
      let size: number | undefined;
      let sampleFlags: number | undefined;
      let compositionTimeOffset: number | undefined;
      if (hasSampleDuration) {
        duration = be4toi(trun, cursor);
        cursor += 4;
      }
      if (hasSampleSize) {
        size = be4toi(trun, cursor);
        cursor += 4;
      }
      if (hasSampleFlags) {
        sampleFlags = be4toi(trun, cursor);
        cursor += 4;
      }
      if (hasSampleCompositionOffset) {
        // That offset is unsigned in version 0 and signed in version 1
        compositionTimeOffset =
          version === 0 ? be4toi(trun, cursor) : be4toiSigned(trun, cursor);
        cursor += 4;
      }
      samples.push({
        duration,
        compositionTimeOffset,
        size,
        flags: sampleFlags,
      });
    }
  }
  return samples;
}

/**
* Calculate segment duration approximation by additioning the duration from
* every samples in a trun ISOBMFF box.
Expand Down Expand Up @@ -563,6 +650,7 @@ function getKeyIdFromInitSegment(segment: Uint8Array): Uint8Array | null {
return keyId.every((b) => b === 0) ? null : keyId;
}

export type { ITrunSampleInfo };
export {
getKeyIdFromInitSegment,
getMDHDTimescale,
Expand All @@ -573,4 +661,5 @@ export {
patchPssh,
updateBoxLength,
parseEmsgBoxes,
getTrunSamples,
};
20 changes: 18 additions & 2 deletions src/parsers/texttracks/sami/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
* It always should be imported through the `features` object.
*/

import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8";
import isNonEmptyString from "../../../utils/is_non_empty_string";
import isNullOrUndefined from "../../../utils/is_null_or_undefined";
import { utf8ToStr } from "../../../utils/string_parsing";
import type { IHTMLCue } from "../types";

const HTML_ENTITIES = /&#([0-9]+);/g;
Expand Down Expand Up @@ -99,11 +101,25 @@ function decodeEntities(text: string): string {
* The specification being quite clunky, this parser
* may not work for every sami input.
*
* @param {string} smi
* @param {string|BufferSource} input
* @param {Number} _timescale
* @param {Number} timeOffset
* @param {string} lang
*/
function parseSami(smi: string, timeOffset: number, lang?: string): IHTMLCue[] {
function parseSami(
input: string | BufferSource,
_timescale: number,
timeOffset: number,
lang?: string,
): IHTMLCue[] {
let smi: string;
if (typeof input !== "string") {
// Assume UTF-8
// TODO: detection?
smi = utf8ToStr(bufferSourceToUint8(input));
} else {
smi = input;
}
const syncOpen = /<sync[ >]/gi;
const syncClose = /<sync[ >]|<\/body>/gi;

Expand Down
Loading

0 comments on commit 0abb771

Please sign in to comment.