diff --git a/src/core/segment_sinks/implementations/text/text_segment_sink.ts b/src/core/segment_sinks/implementations/text/text_segment_sink.ts index 39b9366f9f..eb83277aa5 100644 --- a/src/core/segment_sinks/implementations/text/text_segment_sink.ts +++ b/src/core/segment_sinks/implementations/text/text_segment_sink.ts @@ -1,6 +1,7 @@ import log from "../../../../log"; import type { ITextDisplayer } from "../../../../main_thread/types"; import type { ITextTrackSegmentData } from "../../../../transports"; +import isNullOrUndefined from "../../../../utils/is_null_or_undefined"; import getMonotonicTimeStamp from "../../../../utils/monotonic_timestamp"; import type { IRange } from "../../../../utils/ranges"; import type { ICompleteSegmentInfo, IPushChunkInfos, ISBOperation } from "../types"; @@ -142,9 +143,9 @@ export default class TextSegmentSink extends SegmentSink { } /** Data of chunks that should be pushed to the HTMLTextSegmentSink. */ -export interface ITextTracksBufferSegmentData { +export interface ITextTracksBufferSegmentData { /** The text track data, in the format indicated in `type`. */ - data: string; + data: T; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** @@ -153,6 +154,11 @@ export interface ITextTracksBufferSegmentData { * be parsed. */ language?: string | undefined; + /** + * Optional timescale data context that is used to convert timing information + * into seconds. + */ + timescale: number | null; /** start time from which the segment apply, in seconds. */ start?: number | undefined; /** end time until which the segment apply, in seconds. */ @@ -167,7 +173,7 @@ export interface ITextTracksBufferSegmentData { */ function assertChunkIsTextTrackSegmentData( chunk: unknown, -): asserts chunk is ITextTracksBufferSegmentData { +): asserts chunk is ITextTracksBufferSegmentData { if ( (__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.PRODUCTION as number) ) { @@ -176,14 +182,30 @@ function assertChunkIsTextTrackSegmentData( if ( typeof chunk !== "object" || chunk === null || - typeof (chunk as ITextTracksBufferSegmentData).data !== "string" || - typeof (chunk as ITextTracksBufferSegmentData).type !== "string" || - ((chunk as ITextTracksBufferSegmentData).language !== undefined && - typeof (chunk as ITextTracksBufferSegmentData).language !== "string") || - ((chunk as ITextTracksBufferSegmentData).start !== undefined && - typeof (chunk as ITextTracksBufferSegmentData).start !== "number") || - ((chunk as ITextTracksBufferSegmentData).end !== undefined && - typeof (chunk as ITextTracksBufferSegmentData).end !== "number") + isNullOrUndefined((chunk as ITextTracksBufferSegmentData).data) + ) { + throw new Error("Invalid format given to a TextSegmentSink"); + } + if ( + typeof (chunk as ITextTracksBufferSegmentData).type !== + "string" || + ((chunk as ITextTracksBufferSegmentData).language !== + undefined && + typeof (chunk as ITextTracksBufferSegmentData).language !== + "string") || + ((chunk as ITextTracksBufferSegmentData).start !== undefined && + typeof (chunk as ITextTracksBufferSegmentData).start !== + "number") || + ((chunk as ITextTracksBufferSegmentData).end !== undefined && + typeof (chunk as ITextTracksBufferSegmentData).end !== + "number") + ) { + throw new Error("Invalid format given to a TextSegmentSink"); + } + if ( + typeof (chunk as ITextTracksBufferSegmentData).data !== "string" && + typeof (chunk as ITextTracksBufferSegmentData).data.byteLength !== + "number" ) { throw new Error("Invalid format given to a 
TextSegmentSink"); } @@ -229,8 +251,10 @@ export interface ITextDisplayerInterface { */ if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) { // @ts-expect-error: unused function for type checking - function _checkType(input: ITextTrackSegmentData): void { - function checkEqual(_arg: ITextTracksBufferSegmentData): void { + function _checkType( + input: ITextTrackSegmentData, + ): void { + function checkEqual(_arg: ITextTracksBufferSegmentData): void { /* nothing */ } checkEqual(input); diff --git a/src/main_thread/text_displayer/html/html_parsers.ts b/src/main_thread/text_displayer/html/html_parsers.ts index 1d77d69b4a..13ca4de0bd 100644 --- a/src/main_thread/text_displayer/html/html_parsers.ts +++ b/src/main_thread/text_displayer/html/html_parsers.ts @@ -11,6 +11,8 @@ export interface IHTMLCue { * Convert text track data into timed HTML Cues. * @param {string} type - Text track format wanted * @param {string} data - Text track data + * @param {Number} timescale - Potential external timescale to convert timing + * information into seconds. * @param {Number} timestampOffset - offset to apply to every timed text * @param {string} [language] - language of the text tracks * @returns {Array.} @@ -18,7 +20,8 @@ export interface IHTMLCue { */ export default function parseTextTrackToElements( type: string, - data: string, + data: string | BufferSource, + timescale: number, timestampOffset: number, language?: string, ): IHTMLCue[] { @@ -29,7 +32,7 @@ export default function parseTextTrackToElements( throw new Error("no parser found for the given text track"); } log.debug("HTSB: Parser found, parsing..."); - const parsed = parser(data, timestampOffset, language); + const parsed = parser(data, timescale, timestampOffset, language); log.debug("HTTB: Parsed successfully!", parsed.length); return parsed; } diff --git a/src/main_thread/text_displayer/html/html_text_displayer.ts b/src/main_thread/text_displayer/html/html_text_displayer.ts index 91b1b2858f..dfc8c10e4a 100644 --- a/src/main_thread/text_displayer/html/html_text_displayer.ts +++ b/src/main_thread/text_displayer/html/html_text_displayer.ts @@ -126,12 +126,25 @@ export default class HTMLTextDisplayer implements ITextDisplayer { return convertToRanges(this._buffered); } - const { start: startTime, end: endTime, data: dataString, type, language } = chunk; + const { + start: startTime, + end: endTime, + data: dataRaw, + type, + language, + timescale, + } = chunk; const appendWindowStart = appendWindow[0] ?? 0; const appendWindowEnd = appendWindow[1] ?? Infinity; - const cues = parseTextTrackToElements(type, dataString, timestampOffset, language); + const cues = parseTextTrackToElements( + type, + dataRaw, + timescale ?? 1, + timestampOffset, + language, + ); if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) { // Removing before window start @@ -398,7 +411,7 @@ export default class HTMLTextDisplayer implements ITextDisplayer { /** Data of chunks that should be pushed to the `HTMLTextDisplayer`. */ export interface ITextTracksBufferSegmentData { /** The text track data, in the format indicated in `type`. 
*/ - data: string; + data: string | BufferSource; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** @@ -424,7 +437,9 @@ export interface ITextTracksBufferSegmentData { */ if ((__ENVIRONMENT__.CURRENT_ENV as number) === (__ENVIRONMENT__.DEV as number)) { // @ts-expect-error: uncalled function just for type-checking - function _checkType(input: ITextTrackSegmentData): void { + function _checkType( + input: ITextTrackSegmentData, + ): void { function checkEqual(_arg: ITextTracksBufferSegmentData): void { /* nothing */ } diff --git a/src/main_thread/text_displayer/native/native_parsers.ts b/src/main_thread/text_displayer/native/native_parsers.ts index 9aec18dcc5..6fb16f6c9e 100644 --- a/src/main_thread/text_displayer/native/native_parsers.ts +++ b/src/main_thread/text_displayer/native/native_parsers.ts @@ -5,7 +5,9 @@ import log from "../../../log"; /** * Convert text track data into timed VTT Cues. * @param {string} type - Text track format wanted - * @param {string} data - Text track data + * @param {string|BufferSource} data - Text track data + * @param {Number} timescale - Potential external timescale to convert timing + * information into seconds. * @param {Number} timestampOffset - offset to apply to every timed text * @param {string} [language] - language of the text tracks * @returns {Array.} @@ -13,7 +15,8 @@ import log from "../../../log"; */ export default function parseTextTrackToCues( type: string, - data: string, + data: string | BufferSource, + timescale: number, timestampOffset: number, language?: string, ): Array { @@ -25,7 +28,7 @@ export default function parseTextTrackToCues( } log.debug("NTSB: Parser found, parsing..."); - const parsed = parser(data, timestampOffset, language); + const parsed = parser(data, timescale, timestampOffset, language); log.debug("NTSB: Parsed successfully!", parsed.length); return parsed; } diff --git a/src/main_thread/text_displayer/native/native_text_displayer.ts b/src/main_thread/text_displayer/native/native_text_displayer.ts index b768f62cdf..147f75e8b7 100644 --- a/src/main_thread/text_displayer/native/native_text_displayer.ts +++ b/src/main_thread/text_displayer/native/native_text_displayer.ts @@ -48,10 +48,23 @@ export default class NativeTextDisplayer implements ITextDisplayer { return convertToRanges(this._buffered); } const { timestampOffset, appendWindow, chunk } = infos; - const { start: startTime, end: endTime, data: dataString, type, language } = chunk; + const { + start: startTime, + end: endTime, + data: dataString, + type, + language, + timescale, + } = chunk; const appendWindowStart = appendWindow[0] ?? 0; const appendWindowEnd = appendWindow[1] ?? Infinity; - const cues = parseTextTrackToCues(type, dataString, timestampOffset, language); + const cues = parseTextTrackToCues( + type, + dataString, + timescale ?? 1, + timestampOffset, + language, + ); if (appendWindowStart !== 0 && appendWindowEnd !== Infinity) { // Removing before window start @@ -222,7 +235,7 @@ export default class NativeTextDisplayer implements ITextDisplayer { /** Data of chunks that should be pushed to the NativeTextDisplayer. */ export interface INativeTextTracksBufferSegmentData { /** The text track data, in the format indicated in `type`. 
*/ - data: string; + data: string | BufferSource; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** diff --git a/src/parsers/containers/isobmff/utils.ts b/src/parsers/containers/isobmff/utils.ts index 2f21752d45..ce7737c3ff 100644 --- a/src/parsers/containers/isobmff/utils.ts +++ b/src/parsers/containers/isobmff/utils.ts @@ -20,6 +20,7 @@ import { be2toi, be3toi, be4toi, + be4toiSigned, be8toi, concat, itobe4, @@ -233,6 +234,92 @@ function getDefaultDurationFromTFHDInTRAF(traf: Uint8Array): number | undefined return defaultDuration; } +interface ITrunSampleInfo { + duration: number; + compositionTimeOffset: number | undefined; + size: number | undefined; + flags: number | undefined; +} + +function getTrunSamples(buffer: Uint8Array): ITrunSampleInfo[] { + const trafs = getTRAFs(buffer); + const samples: ITrunSampleInfo[] = []; + for (const traf of trafs) { + const trun = getBoxContent(traf, 0x7472756e /* trun */); + if (trun === null) { + continue; + } + let cursor = 0; + const version = trun[cursor]; + cursor += 1; + if (version > 1) { + return []; + } + + const flags = be3toi(trun, cursor); + cursor += 3; + const hasSampleDuration = (flags & 0x000100) > 0; + + let defaultDuration: number | undefined = 0; + if (!hasSampleDuration) { + defaultDuration = getDefaultDurationFromTFHDInTRAF(traf); + if (defaultDuration === undefined) { + return []; + } + } + + const hasDataOffset = (flags & 0x000001) > 0; + const hasFirstSampleFlags = (flags & 0x000004) > 0; + const hasSampleSize = (flags & 0x000200) > 0; + const hasSampleFlags = (flags & 0x000400) > 0; + const hasSampleCompositionOffset = (flags & 0x000800) > 0; + + const sampleCounts = be4toi(trun, cursor); + cursor += 4; + + if (hasDataOffset) { + cursor += 4; + } + if (hasFirstSampleFlags) { + cursor += 4; + } + + let i = sampleCounts; + while (i-- > 0) { + let duration; + let size; + let sampleFlags; + let compositionTimeOffset; + if (hasSampleDuration) { + duration = be4toi(trun, cursor); + cursor += 4; + } else { + duration = defaultDuration; + } + if (hasSampleSize) { + size = be4toi(trun, cursor); + cursor += 4; + } + if (hasSampleFlags) { + sampleFlags = be4toi(trun, cursor); + cursor += 4; + } + if (hasSampleCompositionOffset) { + compositionTimeOffset = + version === 0 ? be4toi(trun, cursor) : be4toiSigned(trun, cursor); + cursor += 4; + } + samples.push({ + duration, + compositionTimeOffset, + size, + flags: sampleFlags, + }); + } + } + return samples; +} + /** * Calculate segment duration approximation by additioning the duration from * every samples in a trun ISOBMFF box. @@ -563,6 +650,7 @@ function getKeyIdFromInitSegment(segment: Uint8Array): Uint8Array | null { return keyId.every((b) => b === 0) ? null : keyId; } +export type { ITrunSampleInfo }; export { getKeyIdFromInitSegment, getMDHDTimescale, @@ -573,4 +661,5 @@ export { patchPssh, updateBoxLength, parseEmsgBoxes, + getTrunSamples, }; diff --git a/src/parsers/texttracks/sami/html.ts b/src/parsers/texttracks/sami/html.ts index c73ed84fbb..dbac926cfd 100644 --- a/src/parsers/texttracks/sami/html.ts +++ b/src/parsers/texttracks/sami/html.ts @@ -29,8 +29,10 @@ * It always should be imported through the `features` object. 
  */

+import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8";
 import isNonEmptyString from "../../../utils/is_non_empty_string";
 import isNullOrUndefined from "../../../utils/is_null_or_undefined";
+import { utf8ToStr } from "../../../utils/string_parsing";
 import type { IHTMLCue } from "../types";
 const HTML_ENTITIES = /&#([0-9]+);/g;
@@ -99,11 +101,25 @@ function decodeEntities(text: string): string {
  * The specification being quite clunky, this parser
  * may not work for every sami input.
  *
- * @param {string} smi
+ * @param {string|BufferSource} input
+ * @param {Number} _timescale
  * @param {Number} timeOffset
  * @param {string} lang
  */
-function parseSami(smi: string, timeOffset: number, lang?: string): IHTMLCue[] {
+function parseSami(
+  input: string | BufferSource,
+  _timescale: number,
+  timeOffset: number,
+  lang?: string,
+): IHTMLCue[] {
+  let smi: string;
+  if (typeof input !== "string") {
+    // Assume UTF-8
+    // TODO: detection?
+    smi = utf8ToStr(bufferSourceToUint8(input));
+  } else {
+    smi = input;
+  }
   const syncOpen = /<sync[ >]/gi;
   const syncClose = /<sync[ >]|<\/body>/gi;
diff --git a/src/parsers/texttracks/sami/native.ts b/src/parsers/texttracks/sami/native.ts
index 441007da30..bb1c850686 100644
--- a/src/parsers/texttracks/sami/native.ts
+++ b/src/parsers/texttracks/sami/native.ts
@@ -21,8 +21,10 @@
 import type { ICompatVTTCue } from "../../../compat/browser_compatibility_types";
 import makeVTTCue from "../../../compat/make_vtt_cue";
+import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8";
 import isNonEmptyString from "../../../utils/is_non_empty_string";
 import isNullOrUndefined from "../../../utils/is_null_or_undefined";
+import { utf8ToStr } from "../../../utils/string_parsing";
 const HTML_ENTITIES = /&#([0-9]+);/g;
 const BR = /<br>/gi;
@@ -104,16 +106,26 @@ function decodeEntities(text: string): string {
  * The specification being quite clunky, this parser
  * may not work for every sami input.
  *
- * @param {string} smi
+ * @param {string|BufferSource} input
+ * @param {Number} _timescale
  * @param {Number} timeOffset
  * @param {string} lang
  * @returns {Array.<VTTCue>}
  */
 function parseSami(
-  smi: string,
+  input: string | BufferSource,
+  _timescale: number,
   timeOffset: number,
   lang?: string,
 ): Array<ICompatVTTCue | TextTrackCue> {
+  let smi: string;
+  if (typeof input !== "string") {
+    // Assume UTF-8
+    // TODO: detection?
+    smi = utf8ToStr(bufferSourceToUint8(input));
+  } else {
+    smi = input;
+  }
   const syncOpen = /<sync[ >]/gi;
   const syncClose = /<sync[ >]|<\/body>/gi;
diff --git a/src/parsers/texttracks/srt/html.ts b/src/parsers/texttracks/srt/html.ts
index d123106b9b..43b0178cb1 100644
--- a/src/parsers/texttracks/srt/html.ts
+++ b/src/parsers/texttracks/srt/html.ts
@@ -24,6 +24,8 @@
 // Done for fun. Understand <b>, <i>, <u> and <font> type
 // of tags.

+import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8";
+import { utf8ToStr } from "../../../utils/string_parsing";
 import getCueBlocks from "./get_cue_blocks";
 import parseCueBlock from "./parse_cue";
@@ -34,14 +36,24 @@ export interface ISRTHTMLCue {
 }
 /**
- * @param {string} srtStr
+ * @param {string|BufferSource} input
+ * @param {Number} _timescale
  * @param {Number} timeOffset
  * @returns {Array.<Object>}
  */
 export default function parseSRTStringToHTML(
-  srtStr: string,
+  input: string | BufferSource,
+  _timescale: number,
   timeOffset: number,
 ): ISRTHTMLCue[] {
+  let srtStr: string;
+  if (typeof input !== "string") {
+    // Assume UTF-8
+    // TODO: detection?
+    srtStr = utf8ToStr(bufferSourceToUint8(input));
+  } else {
+    srtStr = input;
+  }
   // Even if srt only authorize CRLF, we will also take LF or CR as line
   // terminators for resilience
   const lines = srtStr.split(/\r\n|\n|\r/);
diff --git a/src/parsers/texttracks/srt/native.ts b/src/parsers/texttracks/srt/native.ts
index 80485059ff..b14f32e4e6 100644
--- a/src/parsers/texttracks/srt/native.ts
+++ b/src/parsers/texttracks/srt/native.ts
@@ -24,20 +24,32 @@
 import type { ICompatVTTCue } from "../../../compat/browser_compatibility_types";
 import makeVTTCue from "../../../compat/make_vtt_cue";
+import bufferSourceToUint8 from "../../../utils/buffer_source_to_uint8";
+import { utf8ToStr } from "../../../utils/string_parsing";
 import getCueBlocks from "./get_cue_blocks";
 import parseCueBlock from "./parse_cue";
 /**
  * Parse whole srt file into an array of cues, to be inserted in a video's
  * TrackElement.
- * @param {string} srtStr
+ * @param {string|BufferSource} input
+ * @param {Number} _timescale
  * @param {Number} timeOffset
  * @returns {Array.<VTTCue>}
  */
 export default function parseSRTStringToVTTCues(
-  srtStr: string,
+  input: string | BufferSource,
+  _timescale: number,
   timeOffset: number,
 ): Array<ICompatVTTCue | TextTrackCue> {
+  let srtStr: string;
+  if (typeof input !== "string") {
+    // Assume UTF-8
+    // TODO: detection?
+ srtStr = utf8ToStr(bufferSourceToUint8(input)); + } else { + srtStr = input; + } // Even if srt only authorize CRLF, we will also take LF or CR as line // terminators for resilience const lines = srtStr.split(/\r\n|\n|\r/); diff --git a/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts b/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts index 5716ea18e7..943d757080 100644 --- a/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts +++ b/src/parsers/texttracks/ttml/html/__tests__/__global__/html_ttml_parser.test.ts @@ -79,7 +79,7 @@ const testingText = ` `; describe("Global TTML HTML parsing tests", () => { - const res = parseTTMLToDiv(testingText, 0); + const res = parseTTMLToDiv(testingText, 1, 0); it("should parse the right amount of cues at the right time", () => { expect(res).toHaveLength(11); expect(res[0].start).toEqual(0.76); diff --git a/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts b/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts index d6b87561e9..ff0dbe8c97 100644 --- a/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts +++ b/src/parsers/texttracks/ttml/html/parse_ttml_to_div.ts @@ -14,6 +14,8 @@ * limitations under the License. */ +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../../utils/string_parsing"; import parseTtml from "../parse_ttml"; import { applyDefaultTTMLStyle, @@ -36,10 +38,23 @@ import parseCue from "./parse_cue"; * TODO TTML parsing is still pretty heavy on the CPU. * Optimizations have been done, principally to avoid using too much XML APIs, * but we can still do better. - * @param {string} str + * @param {string|BufferSource} input + * @param {number} _timescale * @param {number} timeOffset */ -export default function parseTTMLToDiv(str: string, timeOffset: number): ITTMLHTMLCue[] { +export default function parseTTMLToDiv( + input: string | BufferSource, + _timescale: number, + timeOffset: number, +): ITTMLHTMLCue[] { + let str: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? + str = utf8ToStr(bufferSourceToUint8(input)); + } else { + str = input; + } const parsedCues = parseTtml(str, timeOffset); const cues: ITTMLHTMLCue[] = []; for (const parsedCue of parsedCues) { diff --git a/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts b/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts index d2c11be02b..64f49d98b7 100644 --- a/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts +++ b/src/parsers/texttracks/ttml/native/parse_ttml_to_vtt.ts @@ -15,17 +15,30 @@ */ import type { ICompatVTTCue } from "../../../../compat/browser_compatibility_types"; +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../../utils/string_parsing"; import parseTtml from "../parse_ttml"; import parseCue from "./parse_cue"; /** - * @param str - * @param timeOffset + * @param {string|BufferSource} input + * @param {number} _timescale + * @param {number} timeOffset + * @returns {Array.} */ export default function parseTtmlToNative( - str: string, + input: string | BufferSource, + _timescale: number, timeOffset: number, ): Array { + let str: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? 
+ str = utf8ToStr(bufferSourceToUint8(input)); + } else { + str = input; + } const parsedCues = parseTtml(str, timeOffset); const cues: Array = []; for (const parsedCue of parsedCues) { diff --git a/src/parsers/texttracks/types.ts b/src/parsers/texttracks/types.ts index 529e8ca32f..d13c900a04 100644 --- a/src/parsers/texttracks/types.ts +++ b/src/parsers/texttracks/types.ts @@ -25,14 +25,16 @@ export interface IHTMLCue { // Function to parse texttracks into native VTT cues export type INativeTextTracksParserFn = ( - texttrack: string, + texttrack: string | BufferSource, + timescale: number, timeOffset: number, language?: string, ) => Array; // Function to parse texttracks into HTML cues export type IHTMLTextTracksParserFn = ( - texttrack: string, + texttrack: string | BufferSource, + timescale: number, timeOffset: number, language?: string, ) => IHTMLCue[]; diff --git a/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts b/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts index f221f0d220..cfc77df075 100644 --- a/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts +++ b/src/parsers/texttracks/webvtt/html/__tests__/parse_webvtt_to_div.test.ts @@ -9,13 +9,13 @@ describe("parsers - webvtt - parseWebVTT", () => { it("should throw if text is empty", async () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(() => parseWebVTT("", 0)).toThrowError("Can't parse WebVTT: Invalid File."); + expect(() => parseWebVTT("", 1, 0)).toThrowError("Can't parse WebVTT: Invalid File."); }); it("should throw if file seems to be invalid", async () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(() => parseWebVTT("WEBWTT\n", 0)).toThrowError( + expect(() => parseWebVTT("WEBWTT\n", 1, 0)).toThrowError( "Can't parse WebVTT: Invalid File.", ); }); @@ -73,7 +73,7 @@ describe("parsers - webvtt - parseWebVTT", () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(parseWebVTT("WEBVTT\n", 0)).toEqual([ + expect(parseWebVTT("WEBVTT\n", 1, 0)).toEqual([ { element: document.createElement("div"), end: 100, @@ -140,7 +140,7 @@ describe("parsers - webvtt - parseWebVTT", () => { const parseWebVTT = (await vi.importActual("../parse_webvtt_to_div")) .default as typeof IParseWebVTT; - expect(parseWebVTT("WEBVTT\n", 0)).toEqual([]); + expect(parseWebVTT("WEBVTT\n", 1, 0)).toEqual([]); expect(spyGetFirstLineAfterHeader).toHaveBeenCalledTimes(1); expect(spyGetStyleBlock).toHaveBeenCalledTimes(1); expect(spyGetCueBlock).toHaveBeenCalledTimes(1); diff --git a/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts b/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts index f48fb82bb6..b6bd6c457c 100644 --- a/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts +++ b/src/parsers/texttracks/webvtt/html/parse_webvtt_to_div.ts @@ -14,6 +14,16 @@ * limitations under the License. 
*/ +import log from "../../../../log"; +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { be4toi } from "../../../../utils/byte_parsing"; +import { strToUtf8, utf8ToStr } from "../../../../utils/string_parsing"; +import { + getBoxContent, + getMDAT, + getTrackFragmentDecodeTime, +} from "../../../containers/isobmff"; +import { getTrunSamples } from "../../../containers/isobmff/utils"; import getCueBlocks from "../get_cue_blocks"; import getStyleBlocks from "../get_style_blocks"; import parseCueBlock from "../parse_cue_block"; @@ -32,13 +42,27 @@ import toHTML from "./to_html"; * Specific style is parsed and applied to class element. * * @throws Error - Throws if the given WebVTT string is invalid. - * @param {string} text - The whole webvtt subtitles to parse + * @param {string | BufferSource} text - The whole webvtt subtitles to parse + * @param {Number} timescale * @param {Number} timeOffset - Offset to add to start and end times, in seconds * @return {Array.} */ -export default function parseWebVTT(text: string, timeOffset: number): IVTTHTMLCue[] { +export default function parseWebVTT( + text: string | BufferSource, + timescale: number, + timeOffset: number, +): IVTTHTMLCue[] { + let textStr: string; + if (typeof text !== "string") { + // Assume UTF-8 + // XXX TODO: + // textStr = utf8ToStr(bufferSourceToUint8(text)); + return parseWebVTTInMp4(text, timescale, timeOffset); + } else { + textStr = text; + } const newLineChar = /\r\n|\n|\r/g; // CRLF|LF|CR - const linified = text.split(newLineChar); + const linified = textStr.split(newLineChar); const cuesArray: IVTTHTMLCue[] = []; if (/^WEBVTT( |\t|\n|\r|$)/.exec(linified[0]) === null) { @@ -61,3 +85,169 @@ export default function parseWebVTT(text: string, timeOffset: number): IVTTHTMLC } return cuesArray; } + +function parseWebVTTInMp4( + segment: BufferSource | string, + timescale: number, + timeOffset: number, +): IVTTHTMLCue[] { + let buffer: Uint8Array; + if (typeof segment === "string") { + buffer = strToUtf8(segment); + } else { + buffer = bufferSourceToUint8(segment); + } + if (buffer.length === 0) { + return []; + } + + const cuesArray = []; + const trackDecodeTime = getTrackFragmentDecodeTime(buffer); + if (trackDecodeTime === undefined) { + return []; + } + const trunSamples = getTrunSamples(buffer); + const mdat = getMDAT(buffer); + if (mdat === null) { + return []; + } + let mdatOffset = 0; + let lastTime = trackDecodeTime; + /** @type {!shaka.util.DataViewReader} */ + // const reader = new shaka.util.DataViewReader( + // rawPayload, shaka.util.DataViewReader.Endianness.BIG_ENDIAN); + + for (const sample of trunSamples) { + const duration = sample.duration ?? 0; + const startTime = + sample.compositionTimeOffset !== undefined + ? lastTime + sample.compositionTimeOffset + : lastTime; + lastTime = startTime + duration; + + // Read samples until it adds up to the given size. + let totalSize = 0; + // No sample size == a single sample + while (totalSize < (sample.size ?? 0)) { + // Read the payload size. 
+ const payloadSize = be4toi(mdat, mdatOffset); + mdatOffset += 4; + totalSize += payloadSize; + + const currentBoxName = utf8ToStr(mdat.slice(mdatOffset, mdatOffset + 4)); + mdatOffset += 4; + + let currentBoxData: Uint8Array | null = null; + if (currentBoxName === "vttc") { + if (payloadSize > 8) { + currentBoxData = mdat.slice(mdatOffset, mdatOffset + (payloadSize - 8)); + mdatOffset += payloadSize - 8; + } + } else if (currentBoxName === "vtte") { + if (payloadSize > 8) { + mdatOffset += payloadSize - 8; + } + } else { + log.error("webvtt: encountered unknown fragmented vtt box: ", currentBoxName); + mdatOffset += Math.min(payloadSize - 8, 1); + } + + if (duration > 0) { + if (currentBoxData !== null) { + const cue = parseVttC( + currentBoxData, + timeOffset + startTime / timescale, + timeOffset + lastTime / timescale, + ); + if (cue !== null) { + cuesArray.push(cue); + } + } + } else { + log.error("webvtt: cue duration missing"); + } + // + // goog.asserts.assert( + // !sample.sampleSize || totalSize <= sample.sampleSize, + // 'The samples do not fit evenly into the sample sizes given in ' + + // 'the TRUN box!'); + // + // If no sampleSize was specified, it's assumed that this sample + // corresponds to only a single cue. + } + } + + // goog.asserts.assert( + // !reader.hasMoreData(), + // "MDAT which contain VTT cues and non-VTT data are not currently " + "supported!", + // ); + + return cuesArray; +} + +function parseVttC( + data: Uint8Array, + startTime: number, + endTime: number, +): IVTTHTMLCue | null { + const payload = getPayl(data); + // const iden = getIden(data); + // const settings = getSttg(data); + if (payload === null) { + return null; + } + const cueHtml = toHTML( + { + start: startTime, + end: endTime, + settings: {}, + header: undefined, + payload: [utf8ToStr(payload)], + }, + { + classes: {}, + global: undefined, + }, + ); + + // XXX TODO: + // if (settings) { + // const parser = new shaka.util.TextParser(settings); + // + // let word = parser.readWord(); + // + // while (word) { + // // TODO: Check WebVTTConfigurationBox for region info. + // if ( + // !shaka.text.VttTextParser.parseCueSetting(cue, word, /* VTTRegions= */ []) + // ) { + // shaka.log.warning( + // "VTT parser encountered an invalid VTT setting: ", + // word, + // " The setting will be ignored.", + // ); + // } + // + // parser.skipWhitespace(); + // word = parser.readWord(); + // } + // } + + return cueHtml; +} +/** + * Returns the content of the first "payl" box encountered in the given ISOBMFF + * data. + * Returns null if not found. 
+ * @param {Uint8Array} buffer + * @returns {Uint8Array|null} + */ +function getPayl(buf: Uint8Array): Uint8Array | null { + return getBoxContent(buf, 0x7061796c /* "payl" */); +} +// function getIden(buf: Uint8Array): Uint8Array | null { +// return getBoxContent(buf, 0x6964656e /* "iden" */); +// } +// function getSttg(buf: Uint8Array): Uint8Array | null { +// return getBoxContent(buf, 0x73747467 /* "sttg" */); +// } diff --git a/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts b/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts index d531fedc58..e6af28476c 100644 --- a/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts +++ b/src/parsers/texttracks/webvtt/native/parse_vtt_to_cues.ts @@ -21,6 +21,8 @@ import type { ICompatVTTCue } from "../../../../compat/browser_compatibility_types"; import isVTTCue from "../../../../compat/is_vtt_cue"; +import bufferSourceToUint8 from "../../../../utils/buffer_source_to_uint8"; +import { utf8ToStr } from "../../../../utils/string_parsing"; import getCueBlocks from "../get_cue_blocks"; import parseCueBlock from "../parse_cue_block"; import { getFirstLineAfterHeader } from "../utils"; @@ -34,14 +36,24 @@ import toNativeCue from "./to_native_cue"; /** * Parse whole WEBVTT file into an array of cues, to be inserted in a video's * TrackElement. - * @param {string} vttStr + * @param {string|BufferSource} input + * @param {Number} _timescale * @param {Number} timeOffset * @returns {Array.} */ export default function parseVTTStringToVTTCues( - vttStr: string, + input: string | BufferSource, + _timescale: number, timeOffset: number, ): Array { + let vttStr: string; + if (typeof input !== "string") { + // Assume UTF-8 + // TODO: detection? + vttStr = utf8ToStr(bufferSourceToUint8(input)); + } else { + vttStr = input; + } // WEBVTT authorize CRLF, LF or CR as line terminators const lines = vttStr.split(/\r\n|\n|\r/); diff --git a/src/tools/TextTrackRenderer/text_track_renderer.ts b/src/tools/TextTrackRenderer/text_track_renderer.ts index 4376a5f442..119585f366 100644 --- a/src/tools/TextTrackRenderer/text_track_renderer.ts +++ b/src/tools/TextTrackRenderer/text_track_renderer.ts @@ -24,6 +24,11 @@ export interface ISetTextTrackArguments { data: string; /** The format the text track is in (e.g. "ttml" or "vtt") */ type: string; + /** + * Optional timescale data context that is used to convert timing information + * into seconds. + */ + timescale: number | null; /** Offset, in seconds, that will be added to each subtitle's start and end time. */ timeOffset?: number; /** @@ -81,6 +86,7 @@ export default class TextTrackRenderer { chunk: { start: 0, end: Number.MAX_VALUE, + timescale: args.timescale, data: args.data, language: args.language, type: args.type, diff --git a/src/transports/dash/text_parser.ts b/src/transports/dash/text_parser.ts index ce08275bb1..b849989d28 100644 --- a/src/transports/dash/text_parser.ts +++ b/src/transports/dash/text_parser.ts @@ -108,6 +108,7 @@ function parseISOBMFFEmbeddedTextTrack( const chunkData = getISOBMFFEmbeddedTextTrackData( context, chunkBytes, + initTimescale, chunkInfos, isChunked, ); @@ -127,6 +128,7 @@ function parseISOBMFFEmbeddedTextTrack( * Parse TextTrack data when it is in plain text form. * * @param {ArrayBuffer|Uint8Array|string} data - The segment data. + * @param {number|undefined} initTimescale * @param {boolean} isChunked - If `true`, the `data` may contain only a * decodable subpart of the full data in the linked segment. 
* @param {Object} context - Object describing the context of the given @@ -136,6 +138,7 @@ function parseISOBMFFEmbeddedTextTrack( */ function parsePlainTextTrack( data: ArrayBuffer | Uint8Array | string, + initTimescale: number | undefined, isChunked: boolean, context: ISegmentContext, ): @@ -162,7 +165,12 @@ function parsePlainTextTrack( } else { textTrackData = data; } - const chunkData = getPlainTextTrackData(context, textTrackData, isChunked); + const chunkData = getPlainTextTrackData( + context, + textTrackData, + initTimescale, + isChunked, + ); return { segmentType: "media", chunkData, @@ -244,7 +252,7 @@ export default function generateTextTrackParser({ __priv_patchLastSegmentInSidx, ); } else { - return parsePlainTextTrack(data, isChunked, context); + return parsePlainTextTrack(data, initTimescale, isChunked, context); } }; } diff --git a/src/transports/local/text_parser.ts b/src/transports/local/text_parser.ts index 2949a9c6bc..f8b7dd6247 100644 --- a/src/transports/local/text_parser.ts +++ b/src/transports/local/text_parser.ts @@ -78,6 +78,7 @@ function parseISOBMFFEmbeddedTextTrack( const chunkData = getISOBMFFEmbeddedTextTrackData( context, chunkBytes, + initTimescale, chunkInfos, isChunked, ); @@ -96,6 +97,7 @@ function parseISOBMFFEmbeddedTextTrack( /** * Parse TextTrack data when it is in plain text form. * @param {ArrayBuffer|Uint8Array|string} data - The segment data. + * @param {number|undefined} initTimescale * @param {boolean} isChunked - If `true`, the `data` may contain only a * decodable subpart of the full data in the linked segment. * @param {Object} context - Object describing the context of the given @@ -105,6 +107,7 @@ function parseISOBMFFEmbeddedTextTrack( */ function parsePlainTextTrack( data: string | Uint8Array | ArrayBuffer, + initTimescale: number | undefined, isChunked: boolean, context: ISegmentContext, ): @@ -130,7 +133,12 @@ function parsePlainTextTrack( } else { textTrackData = data; } - const chunkData = getPlainTextTrackData(context, textTrackData, isChunked); + const chunkData = getPlainTextTrackData( + context, + textTrackData, + initTimescale, + isChunked, + ); const chunkOffset = segment.timestampOffset ?? 0; return { segmentType: "media", @@ -192,6 +200,6 @@ export default function textTrackParser( } else if (containerType === "mp4") { return parseISOBMFFEmbeddedTextTrack(data, isChunked, context, initTimescale); } else { - return parsePlainTextTrack(data, isChunked, context); + return parsePlainTextTrack(data, initTimescale, isChunked, context); } } diff --git a/src/transports/smooth/pipelines.ts b/src/transports/smooth/pipelines.ts index 3378391d5e..2948c1639f 100644 --- a/src/transports/smooth/pipelines.ts +++ b/src/transports/smooth/pipelines.ts @@ -411,6 +411,7 @@ export default function (transportOptions: ITransportOptions): ITransportPipelin data: _sdData, start: segmentStart, end: segmentEnd, + timescale: initTimescale ?? null, language, }, chunkSize, diff --git a/src/transports/types.ts b/src/transports/types.ts index 2b7a137ca3..fbac565b4a 100644 --- a/src/transports/types.ts +++ b/src/transports/types.ts @@ -419,9 +419,11 @@ export interface IChunkTimeInfo { } /** Text track segment data, once parsed. */ -export interface ITextTrackSegmentData { +export interface ITextTrackSegmentData< + T extends string | BufferSource = string | BufferSource, +> { /** The text track data, in the format indicated in `type`. 
*/ - data: string; + data: T; /** The format of `data` (examples: "ttml", "srt" or "vtt") */ type: string; /** @@ -430,6 +432,11 @@ export interface ITextTrackSegmentData { * be parsed. */ language?: string | undefined; + /** + * Optional timescale data context that is used to convert timing information + * found inside the segment into seconds. + */ + timescale: number | null; /** start time from which the segment apply, in seconds. */ start?: number | undefined; /** end time until which the segment apply, in seconds. */ @@ -469,7 +476,10 @@ export interface ITransportAudioVideoSegmentPipeline { export interface ITransportTextSegmentPipeline { loadSegment: ISegmentLoader; - parseSegment: ISegmentParser; + parseSegment: ISegmentParser< + ILoadedTextSegmentFormat, + ITextTrackSegmentData | null + >; } export type ITransportSegmentPipeline = diff --git a/src/transports/utils/parse_text_track.ts b/src/transports/utils/parse_text_track.ts index 96e0db180b..83902e177d 100644 --- a/src/transports/utils/parse_text_track.ts +++ b/src/transports/utils/parse_text_track.ts @@ -83,7 +83,8 @@ export function getPlainTextTrackFormat( /** * @param {Object} content - * @param {ArrayBuffer|UInt8Array|null} chunkData + * @param {ArrayBuffer|UInt8Array|null} chunkBytes + * @param {number|undefined} initTimescale * @param {Object|null} chunkInfos * @param {boolean} isChunked * @returns {Object|null} @@ -99,6 +100,7 @@ export function getISOBMFFEmbeddedTextTrackData( language?: string | undefined; }, chunkBytes: Uint8Array, + initTimescale: number | undefined, chunkInfos: IChunkTimeInfo | null, isChunked: boolean, ): ITextTrackSegmentData | null { @@ -124,20 +126,34 @@ export function getISOBMFFEmbeddedTextTrackData( } const type = getISOBMFFTextTrackFormat(codecs); - const textData = extractTextTrackFromISOBMFF(chunkBytes); - return { data: textData, type, language, start: startTime, end: endTime }; + let textData: string | BufferSource; + if (codecs === "wvtt") { + // XXX TODO: check if WEBVTT header first? + textData = chunkBytes; + } else { + textData = extractTextTrackFromISOBMFF(chunkBytes); + } + return { + data: textData, + type, + language, + start: startTime, + end: endTime, + timescale: initTimescale ?? null, + }; } /** - * @param {Object} content - * @param {ArrayBuffer|UInt8Array|null} chunkData - * @param {Object|null} chunkInfos + * @param {Object} context + * @param {ArrayBuffer|UInt8Array|null} textTrackData + * @param {number|undefined} initTimescale * @param {boolean} isChunked * @returns {Object|null} */ export function getPlainTextTrackData( context: ISegmentContext, textTrackData: string, + initTimescale: number | undefined, isChunked: boolean, ): ITextTrackSegmentData | null { const { segment } = context; @@ -157,5 +173,12 @@ export function getPlainTextTrackData( } const type = getPlainTextTrackFormat(context.codecs, context.mimeType); - return { data: textTrackData, type, language: context.language, start, end }; + return { + data: textTrackData, + type, + language: context.language, + start, + end, + timescale: initTimescale ?? null, + }; } diff --git a/src/utils/buffer_source_to_uint8.ts b/src/utils/buffer_source_to_uint8.ts new file mode 100644 index 0000000000..b03b8adec8 --- /dev/null +++ b/src/utils/buffer_source_to_uint8.ts @@ -0,0 +1,14 @@ +/** + * Convert a vague "BufferSource" binary data into a more exploitable and known + * `Uint8Array`. 
+ * @param {BufferSource} bs
+ * @returns {Uint8Array}
+ */
+export default function bufferSourceToUint8(bs: BufferSource): Uint8Array {
+  if (bs instanceof Uint8Array) {
+    return bs;
+  } else if (bs instanceof ArrayBuffer) {
+    return new Uint8Array(bs);
+  }
+  return new Uint8Array(bs.buffer, bs.byteOffset, bs.byteLength);
+}
diff --git a/src/utils/byte_parsing.ts b/src/utils/byte_parsing.ts
index 0d3bff9a6e..2f2ce9465f 100644
--- a/src/utils/byte_parsing.ts
+++ b/src/utils/byte_parsing.ts
@@ -82,6 +82,19 @@ function be4toi(bytes: Uint8Array, offset: number): number {
   );
 }

+/**
+ * Translate groups of 4 big-endian bytes representing a two's complement
+ * signed integer into that value.
+ * @param {Uint8Array} bytes
+ * @param {Number} offset - The offset (from the start of the given array)
+ * @returns {Number}
+ */
+function be4toiSigned(bytes: Uint8Array, offset: number): number {
+  // `be4toi` reads the four big-endian bytes as an unsigned integer; `| 0`
+  // (ToInt32) then reinterprets that value as a two's complement signed one.
+  return be4toi(bytes, offset) | 0;
+}
+
 /**
  * Translate groups of 8 big-endian bytes to Integer.
  * @param {Uint8Array} bytes
@@ -260,6 +273,7 @@ export {
   be2toi,
   be3toi,
   be4toi,
+  be4toiSigned,
   be8toi,
   le2toi,
   le4toi,
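
The sketches below are illustrative only and not part of the patch; file paths, byte values and variable names are assumptions. First, why a signed 4-byte reader is introduced at all: version 1 `trun` boxes encode `sample_composition_time_offset` as a signed 32-bit integer, which the existing unsigned `be4toi` would misread.

```ts
import { be4toi, be4toiSigned } from "./src/utils/byte_parsing";

// Hypothetical version-1 `trun` composition offset of -2000, big-endian encoded.
const bytes = new Uint8Array([0xff, 0xff, 0xf8, 0x30]);

console.log(be4toi(bytes, 0));       // 4294965296 — unsigned reading
console.log(be4toiSigned(bytes, 0)); // -2000 — two's complement reading
```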
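Next, a rough sketch of how the new fragmented-WebVTT path could be exercised once the branch (still marked `XXX TODO` in the patch) settles. The segment bytes, the 1000-unit timescale taken from the init segment, and the import path are assumptions for illustration.

```ts
import parseWebVTT from "./src/parsers/texttracks/webvtt/html/parse_webvtt_to_div";

// `wvttSegment` stands for the raw ISOBMFF bytes of a "wvtt" media segment
// (a `moof` + `mdat` whose samples contain `vttc`/`vtte` boxes).
declare const wvttSegment: Uint8Array;

// Binary input is routed to the MP4 branch; the timescale (e.g. 1000, read
// from the init segment's mdhd) converts tfdt/trun timing into seconds.
const cues = parseWebVTT(wvttSegment, 1000, /* timeOffset */ 0);
for (const cue of cues) {
  console.log(cue.start, cue.end, cue.element.textContent);
}
```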
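Finally, a hedged sketch of how the new `timescale` field surfaces in the `TextTrackRenderer` tool, assuming its documented `addParsers`/`setTextTrack` API. Since the field is declared without `?`, existing `setTextTrack` callers would now have to pass `timescale: null` explicitly (or the field could be made optional).

```ts
import TextTrackRenderer, { SRT_PARSER } from "rx-player/tools/TextTrackRenderer";

const videoElement = document.querySelector("video") as HTMLVideoElement;
const textTrackElement = document.getElementById("subtitles") as HTMLElement;

TextTrackRenderer.addParsers([SRT_PARSER]);
const renderer = new TextTrackRenderer({ videoElement, textTrackElement });

renderer.setTextTrack({
  data: "1\n00:00:01,000 --> 00:00:03,000\nHello\n",
  type: "srt",
  // Plain-text formats already express timing in seconds, so no external
  // timescale is needed here.
  timescale: null,
  language: "en",
});
```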