@app.route('/iiif/vtt/streaming/<identifier>.vtt')
@cache.cached(timeout=cache_timeouts["long"], forced_update=cache_bust)
def vtt_stream(identifier):
    """
    Serve the captions of an item as a single WebVTT document.

    The route needs the ``<identifier>`` variable: the view takes
    ``identifier`` as its only argument, and the manifest links to
    ``vtt/streaming/{identifier}.vtt``.

    :param identifier: archive.org item identifier taken from the URL.
    :return: a Flask response whose body is the text produced by
        ``create_vtt_stream`` and whose Content-Type is ``text/vtt``.
    """
    response = make_response(create_vtt_stream(identifier))
    # make_response would otherwise label the string body as text/html
    response.headers['Content-Type'] = 'text/vtt'
    return response
vttfiles[sourceFilename].append(f) - - # create the canvases for each original - for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]: - normalised_id = file['name'].rsplit(".", 1)[0] + + if 'access-restricted-item' in metadata['metadata'] and metadata['metadata']['access-restricted-item']: + # this is a news item so has to be treated differently + # https://ia801803.us.archive.org/29/items/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.mp4?start=0&end=360&ignore=x.mp4&cnt=0 + mp4File = None + duration = 0.0 + filedata = None + for file in metadata['files']: + if file['name'].endswith('.mp4'): + mp4File = file['name'] + duration = float(file['length']) + filedata = file + + normalised_id = mp4File.rsplit(".", 1)[0] slugged_id = normalised_id.replace(" ", "-") c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas" - c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width'])) - - # Add vtt if present - if vttfiles and normalised_id in vttfiles: - vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt" - - vttNo = 1 - for vttFile in vttfiles[normalised_id]: - vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", - motivation="supplementing", - target=c.id, - anno_page_id=vttAPId, - body={"id": f"{domain}resource/{identifier}/{vttFile['name']}", - "type": "Text", - "format": "text/vtt", - }) - # add label and language - if vttFile['name'].endswith("autogenerated.vtt"): - vtAnno.body.label = { 'en': ['autogenerated']} - else: - # Assume language - splitName = vttFile['name'].split(".") - lang = splitName[-2] - vtAnno.body.add_label(lang, language="none") - vtAnno.body.language = lang - - vttNo += 1 - - # create intermediary objects + c = Canvas(id=c_id, 
label=normalised_id, duration=duration, height=int(filedata['height']), width=int(filedata['width'])) ap = AnnotationPage(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/page") - anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation", motivation="painting", target=c.id) - # create body based on whether there are derivatives or not: - if file['name'] in derivatives: - body = Choice(items=[]) - # add the choices in order per https://github.com/ArchiveLabs/iiif.archivelab.org/issues/77#issuecomment-1499672734 - for format in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']: - if format in derivatives[file['name']]: - r = ResourceItem(id=f"https://archive.org/download/{identifier}/{derivatives[file['name']][format]['name'].replace(' ', '%20')}", - type='Video', - format=to_mimetype(format), - label={"none": [format]}, - duration=float(file['length']), - height=int(file['height']), - width=int(file['width']), - ) - body.items.append(r) - elif file['format'] == format: - r = ResourceItem( - id=f"https://archive.org/download/{identifier}/{file['name'].replace(' ', '%20')}", - type='Video', - format=to_mimetype(format), - label={"none": [format]}, - duration=float(file['length']), - height=int(file['height']), - width=int(file['width'])) - body.items.append(r) - else: - # todo: deal with instances where there are no derivatives for whatever reason - pass - - anno.body = body - ap.add_item(anno) + vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt" + vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/streamed", + motivation="supplementing", + target=c.id, + anno_page_id=vttAPId, + body={"id": f"{domain}vtt/streaming/{identifier}.vtt", + "type": "Text", + "format": "text/vtt", + }) + + segments = math.floor(duration / 60) + for i in range(segments): + start = i * 60 + if i == segments - 1: + end = int(duration) + else: + 
end = (i + 1) * 60 + + #print (f"Start: {start} End: {end}, Duration: {float(end) - float(start)} full duration: {duration}") + anno = Annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/{i}", motivation="painting", target=f"{c.id}#t={start},{end}") + streamurl = f"https://{metadata['server']}{metadata['dir']}/{mp4File}?start={start}&end={end}&ignore=x.mp4&cnt=0" + body = ResourceItem(id=streamurl, + type='Video', + format="video/mp4", + label={"en": [f"Part {i + 1} of {segments}"]}, + duration=end - start, + height=int(filedata['height']), + width=int(filedata['width']), + ) + + anno.body = body + ap.add_item(anno) + c.add_item(ap) manifest.add_item(c) + else: + # create the canvases for each original + for file in [f for f in originals if f['format'] in ['MPEG4', 'h.264 MPEG4', '512Kb MPEG4', 'HiRes MPEG4', 'MPEG2', 'h.264', 'Matroska', 'Ogg Video', 'Ogg Theora', 'WebM', 'Windows Media', 'Cinepack']]: + normalised_id = file['name'].rsplit(".", 1)[0] + slugged_id = normalised_id.replace(" ", "-") + c_id = f"{URI_PRIFIX}/{identifier}/{slugged_id}/canvas" + c = Canvas(id=c_id, label=normalised_id, duration=float(file['length']), height=int(file['height']), width=int(file['width'])) + + # Add vtt if present + if vttfiles and normalised_id in vttfiles: + vttAPId = f"{URI_PRIFIX}/{identifier}/{slugged_id}/vtt" + + vttNo = 1 + for vttFile in vttfiles[normalised_id]: + vtAnno = c.make_annotation(id=f"{URI_PRIFIX}/{identifier}/{slugged_id}/annotation/vtt/{vttNo}", + motivation="supplementing", + target=c.id, + anno_page_id=vttAPId, + body={"id": f"{domain}resource/{identifier}/{vttFile['name']}", + "type": "Text", + "format": "text/vtt", + }) + # add label and language + if vttFile['name'].endswith("autogenerated.vtt"): + vtAnno.body.label = { 'en': ['autogenerated']} + else: + # Assume language + splitName = vttFile['name'].split(".") + lang = splitName[-2] + vtAnno.body.add_label(lang, language="none") + vtAnno.body.language = lang + + vttNo += 1 + + # 
def create_vtt_stream(identifier):
    """
    Build a WebVTT document from the SRT captions of a streamed item.

    The SRT file is read in time slices using URLs of the form:
    https://archive.org/download/CSPAN3_20180217_164800_Poplar_Forest_Archaeology/CSPAN3_20180217_164800_Poplar_Forest_Archaeology.cc5.srt?t=0/360
    where ``t`` is ``start/end`` in seconds. Each slice is converted to
    WebVTT and its cue times are shifted by the slice's start offset so
    that the concatenated result uses one continuous timeline.

    :param identifier: archive.org item identifier.
    :return: the WebVTT document as a single string. Only the ``WEBVTT``
        header is returned when the item has no ``.srt`` file or no
        usable duration.
    """
    metadata = requests.get('%s/metadata/%s' % (ARCHIVE, identifier)).json()

    filename = ""
    duration = 0.0
    fallback_duration = 0.0
    for file in metadata['files']:
        name = file.get('name', '')
        if 'length' in file:
            if name.endswith('.mpg') and file.get('source') == 'original':
                duration = float(file['length'])
            elif name.endswith('.mp4'):
                # create_manifest3 takes the duration from the .mp4 file;
                # use it when no original .mpg is listed
                fallback_duration = float(file['length'])
        # There seem to be multiple srt files but it is unclear how they
        # differ; the last one listed is used
        if name.endswith('.srt'):
            filename = name

    if duration <= 0:
        duration = fallback_duration

    # Initialize the vtt content with the WEBVTT header
    vtt_content = ["WEBVTT\n"]
    if not filename or duration <= 0:
        # nothing to fetch: without a caption file the URL below would
        # point at the item directory rather than an SRT file
        return "\n".join(vtt_content)

    # one-minute slices, the same length as the video parts in the manifest
    segment_length = 60
    # a clip shorter than one slice still needs a single request
    segments = max(1, math.floor(duration / segment_length))
    for i in range(segments):
        start = i * segment_length
        # the final slice absorbs whatever remains after the last full minute
        end = int(duration) if i == segments - 1 else start + segment_length

        response = requests.get(f"https://archive.org/download/{identifier}/{filename}?t={start}/{end}")
        if response.status_code != 200:
            # best effort: a slice that cannot be fetched is left out
            continue

        vtt_content.extend(convertSrtToVtt(response.text, start))
        # WebVTT separates cues with a truly empty line; a line holding
        # only a space would become part of the preceding cue
        vtt_content.append("")

    # Join the list into a single string
    return "\n".join(vtt_content)


def convertSrtToVtt(srt_content, offset_seconds=0):
    """
    Convert SRT text into a list of WebVTT lines.

    Timing lines (those containing ``-->``) are rewritten from
    ``00:00:00,000 --> 00:00:00,000`` to ``00:00:00.000 --> 00:00:00.000``
    with both times shifted by ``offset_seconds``. Every other line is
    passed through unchanged.

    :param srt_content: the SRT file content as a string.
    :param offset_seconds: seconds to add to every cue start and end time.
    :return: list of lines without line terminators.
    """
    offset = timedelta(seconds=offset_seconds)
    converted = []
    for line in srt_content.splitlines():
        if "-->" in line:
            cue_start, cue_end = line.split("-->", 1)
            starttime = timeToDelta(cue_start.strip()) + offset
            # the end time is the part AFTER the arrow
            endtime = timeToDelta(cue_end.strip()) + offset
            line = f"{formatTimeVTT(starttime)} --> {formatTimeVTT(endtime)}"
        converted.append(line)
    return converted


def formatTimeVTT(time):
    """
    Format a timedelta as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Hours are not wrapped at 24, and sub-millisecond precision is
    truncated.

    :param time: a non-negative ``datetime.timedelta``.
    :return: the formatted timestamp string.
    """
    # integer arithmetic avoids float rounding from total_seconds()
    total_ms = time // timedelta(milliseconds=1)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"


def timeToDelta(time):
    """
    Convert an SRT formatted time to a timedelta.

    Accepts ``HH:MM:SS,mmm``; a ``.`` separator or a missing millisecond
    part is tolerated as well.

    :param time: the timestamp string.
    :return: the equivalent ``datetime.timedelta``.
    :raises ValueError: if the string is not made of three ``:`` separated
        integer fields.
    """
    clock, _, fraction = time.strip().replace(".", ",").partition(",")
    hour, minute, second = (int(part) for part in clock.split(":"))
    milliseconds = int(fraction) if fraction else 0
    return timedelta(hours=hour, minutes=minute, seconds=second, milliseconds=milliseconds)