Parse exif general encoded date (#137) (#193)
* Update db schema

* Fix media-info output parsing

* Parse exif date-time

* Use date-time on backend

* Update tests

* Use numeric timestamps on frontend
stepan-anokhin authored Nov 19, 2020
1 parent 71af1bc commit 762244a
Showing 6 changed files with 85 additions and 57 deletions.
7 changes: 2 additions & 5 deletions db/access/files.py
@@ -68,9 +68,6 @@ class ListFilesResults:
class FilesDAO:
    """Data-access object for files."""

-    # Format in which Dates are currently stored in exif table.
-    _EXIF_DATE_FORMAT = " UTC %Y-%m-%d 00"
-
    # Label for related entities count (matches, scenes, etc.)
    _LABEL_COUNT = "hit_count"
    _countable_match = aliased(Matches)
@@ -197,11 +194,11 @@ def _filter_date(req: ListFilesRequest, query):
"""Filter by creation date."""
if req.date_from is not None:
query = query.filter(
Files.exif.has(Exif.General_Encoded_Date >= req.date_from.strftime(FilesDAO._EXIF_DATE_FORMAT)))
Files.exif.has(Exif.General_Encoded_Date >= req.date_from))

if req.date_to is not None:
query = query.filter(
Files.exif.has(Exif.General_Encoded_Date <= req.date_to.strftime(FilesDAO._EXIF_DATE_FORMAT)))
Files.exif.has(Exif.General_Encoded_Date <= req.date_to))

return query

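With General_Encoded_Date now stored as a native DateTime column (see the schema change below), the filter compares the column against datetime objects directly instead of round-tripping through the legacy " UTC %Y-%m-%d 00" string format. A minimal usage sketch, assuming ListFilesRequest is importable from db.access.files and accepts these fields at construction time (the diff only shows that date_from/date_to are consumed):

from datetime import datetime

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from db.access.files import FilesDAO, ListFilesRequest
from db.schema import Base, Files

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

# Datetime bounds are passed straight through to SQLAlchemy, which renders
# proper DATETIME comparisons instead of lexicographic string comparisons.
req = ListFilesRequest(date_from=datetime(2020, 1, 1), date_to=datetime(2020, 12, 31))
query = FilesDAO._filter_date(req, session.query(Files))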
27 changes: 13 additions & 14 deletions db/schema.py
@@ -1,9 +1,9 @@
import datetime

from sqlalchemy import Column, String, Integer, LargeBinary, Boolean, \
-    Float, JSON, ForeignKey, UniqueConstraint, DateTime,PrimaryKeyConstraint,event
+    Float, JSON, ForeignKey, UniqueConstraint, DateTime, event
from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import relationship,backref,object_session
+from sqlalchemy.orm import relationship, object_session

Base = declarative_base()

@@ -19,7 +19,7 @@ class Files(Base):
    signature = relationship("Signature", uselist=False, back_populates="file")
    meta = relationship("VideoMetadata", uselist=False, back_populates="file")
    scenes = relationship("Scene", back_populates="file")
-    templatematches = relationship("Templatematches", back_populates="file",cascade='all, delete-orphan')
+    templatematches = relationship("Templatematches", back_populates="file", cascade='all, delete-orphan')
    exif = relationship("Exif", uselist=False, back_populates="file")

    # TODO: find a way to merge these two relationships
@@ -74,16 +74,17 @@ class Scene(Base):
class Templatematches(Base):
    __tablename__ = 'templatematches'
    # __table_args__ = (UniqueConstraint('file_id', 'template_name'),)
-    id = Column(Integer,autoincrement=True,primary_key=True)
+    id = Column(Integer, autoincrement=True, primary_key=True)
    file_id = Column(Integer, ForeignKey('files.id'), nullable=True)
-    file = relationship("Files",back_populates='templatematches')
+    file = relationship("Files", back_populates='templatematches')
    template_name = Column(String)
    distance = Column(Float)
    closest_match = Column(Float)
    closest_match_time = Column(String)

-@event.listens_for(Files.templatematches,"remove")
-def rem(state,item,initiator):
+
+@event.listens_for(Files.templatematches, "remove")
+def rem(state, item, initiator):
    sess = object_session(item)

    # ensure we have a session
@@ -98,8 +99,6 @@ def rem(state, item, initiator):
        sess.flush([item])


-
-
class Matches(Base):
    __tablename__ = 'matches'
    __table_args__ = (UniqueConstraint('query_video_file_id', 'match_video_file_id', name='_matches_uc'),)
@@ -130,9 +129,9 @@ class Exif(Base):
    General_OverallBitRate = Column(Float)
    General_FrameRate = Column(Float)
    General_FrameCount = Column(Float)
-    General_Encoded_Date = Column(String)
-    General_File_Modified_Date = Column(String)
-    General_File_Modified_Date_Local = Column(String)
+    General_Encoded_Date = Column(DateTime)
+    General_File_Modified_Date = Column(DateTime)
+    General_File_Modified_Date_Local = Column(DateTime)
    General_Tagged_Date = Column(String)
    Video_Format = Column(String)
    Video_BitRate = Column(Float)
@@ -146,6 +145,6 @@ class Exif(Base):
    Audio_BitRate = Column(Float)
    Audio_Channels = Column(Float)
    Audio_Duration = Column(Float)
-    Audio_Encoded_Date = Column(String)
-    Audio_Tagged_Date = Column(String)
+    Audio_Encoded_Date = Column(DateTime)
+    Audio_Tagged_Date = Column(DateTime)
    Json_full_exif = Column(JSON)
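Switching these columns from String to DateTime changes the physical column type, so a database created with the old schema has to be migrated or rebuilt before the new code can run against it. A minimal sketch, assuming a throwaway SQLite database and no migration tooling (an existing deployment would want a real migration instead):

from datetime import datetime, timezone

from sqlalchemy import create_engine

from db.schema import Base, Exif

# Fresh schema: the date columns are created with the new DateTime type.
engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

# Model instances now carry datetime objects rather than strings.
exif = Exif(General_Encoded_Date=datetime(2020, 11, 19, tzinfo=timezone.utc))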
7 changes: 1 addition & 6 deletions server/tests/server/test_api.py
@@ -119,7 +119,7 @@ def make_file(prefix="", length=42, ext="flv", audio=True, date=datetime.date(20
    sha256 = f"hash-of-{path}"
    return Files(file_path=path, sha256=sha256,
                 exif=Exif(General_FileExtension=ext, Audio_Duration=float(audio),
-                           General_Encoded_Date=backend_date(date)),
+                           General_Encoded_Date=date),
                 meta=VideoMetadata(video_length=length),
                 scenes=[Scene(start_time=start, duration=duration) for start, duration in scenes])

@@ -132,11 +132,6 @@ def make_files(count, prefix="", length=42, ext="flv", audio=True, date=datetime.
    ]


-def backend_date(date):
-    """Convert date to format utilized in the backend."""
-    return date.strftime(" UTC %Y-%m-%d 00")
-
-
def param_date(date):
    """Convert date to REST API parameter format."""
    return date.strftime("%Y-%m-%d")
3 changes: 1 addition & 2 deletions web/src/server-api/Server/Transform.js
@@ -1,5 +1,4 @@
import { randomObjects } from "../MockServer/fake-data/objects";
-import { parse as parseDate } from "date-fns";

/**
* Data-transfer object and internal data format may evolve independently, the
@@ -48,7 +47,7 @@ export default class Transform {
    if (value == null) {
      return null;
    }
-    return parseDate(value, "'UTC' yyyy-MM-dd HH", new Date());
+    return new Date(value * 1000);
  }

fileMetadata(data) {
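The backend now ships this field as a numeric Unix timestamp in seconds, which is why the transform multiplies by 1000 before handing the value to JavaScript's millisecond-based Date constructor. A sketch of the server side of that contract, using a hypothetical helper (the actual serialization code is outside this diff):

from datetime import datetime, timezone

def serialize_date(value: datetime) -> float:
    """Seconds since the Unix epoch, as consumed by Transform.js."""
    if value.tzinfo is None:
        # Assumption: naive datetimes coming from the database are UTC.
        value = value.replace(tzinfo=timezone.utc)
    return value.timestamp()

# serialize_date(datetime(2020, 11, 19)) == 1605744000.0
# Frontend: new Date(1605744000 * 1000) -> 2020-11-19T00:00:00.000Z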
1 change: 0 additions & 1 deletion winnow/storage/db_result_storage.py
@@ -37,7 +37,6 @@ def wrapped(*args, **kwargs):
        logger.debug(f"{func.__name__}(...) took {end - start:5.3} seconds")
        return result

-    wrapped.__name__ = func.__name__
    return wrapped


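Deleting the manual __name__ copy means the wrapper no longer preserves the wrapped function's name unless something else restores it. If that was the intent (an assumption — the diff only shows the deletion), the idiomatic replacement is functools.wraps:

import functools
import logging
import time

logger = logging.getLogger(__name__)

def benchmark(func):  # hypothetical name; the decorator's real name is not shown in this diff
    @functools.wraps(func)  # copies __name__, __doc__, etc. from func onto wrapped
    def wrapped(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        logger.debug(f"{func.__name__}(...) took {end - start:5.3} seconds")
        return result
    return wrapped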
97 changes: 68 additions & 29 deletions winnow/utils/metadata_extraction.py
@@ -1,12 +1,15 @@
import logging
+import os
import shlex
import subprocess
from collections import defaultdict
-import pandas as pd
-from pandas.io.json import json_normalize
+from datetime import datetime, timezone

import cv2
-import os
import numpy as np
+import pandas as pd
+from pandas.io.json import json_normalize

+logger = logging.getLogger(__name__)


@@ -35,9 +38,9 @@ def process_media_info(info):
        if ':' not in line:
            section = line
        else:
-            key, val, *_ = line.split(':')
+            key, val = line.split(':', maxsplit=1)
            section, key = section.strip(), key.strip()
-            metadata[section][key] = val
+            metadata[section][key] = val.lstrip()
    return dict(metadata)
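The maxsplit=1 change matters because mediainfo values can themselves contain colons — exactly the case for encoded dates such as UTC 2020-11-19 10:30:00. The old key, val, *_ = line.split(':') kept only the text before the second colon, which is evidently where the odd legacy storage format " UTC %Y-%m-%d 00" came from. A small illustration (the input line is hypothetical but matches mediainfo's "key : value" layout):

line = "Encoded date                   : UTC 2020-11-19 10:30:00"

key, val, *_ = line.split(':')            # old behavior
print(repr(val))                          # ' UTC 2020-11-19 10' -- minutes and seconds are lost

key, val = line.split(':', maxsplit=1)    # new behavior
print(repr(val.lstrip()))                 # 'UTC 2020-11-19 10:30:00'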


@@ -172,33 +175,69 @@ def convert_to_df(video_metadata):
    'Audio_Channels',
    'Audio_Duration']

+# Date columns of interest
+DCI = [
+    'General_Encoded_Date',
+    'General_File_Modified_Date',
+    'General_File_Modified_Date_Local',
+    'Audio_Encoded_Date',
+    'Audio_Tagged_Date']
+
+# Exif date format
+_EXIF_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+
+def parse_timezone(str_value):
+    if str_value.startswith("UTC"):
+        return timezone.utc, str_value[3:].lstrip()
+    return None, str_value
+
-def parse_and_filter_metadata_df(metadata_df):
-
-    GCI = [
-        x for x in CI
-        if x in metadata_df.columns
-    ]
-    GNI = [
-        x for x in NCI
-        if x in metadata_df.columns
-    ]
-    GSI = [
-        x for x in GCI
-        if x not in GNI
-    ]
-
-    filtered = metadata_df.loc[:, GCI]
+
+def parse_date(str_value: str):
+    try:
+        time_zone, str_value = parse_timezone(str_value)
+        return datetime.strptime(str_value, _EXIF_DATE_FORMAT).replace(tzinfo=time_zone)
+    except ValueError:
+        logger.exception("Cannot parse exif date")
+
+
+def parse_and_filter_metadata_df(metadata_df):
+    all_columns = [
+        column_name for column_name in CI
+        if column_name in metadata_df.columns
+    ]
+    numeric_columns = [
+        column_name for column_name in NCI
+        if column_name in metadata_df.columns
+    ]
+    date_columns = [
+        column_name for column_name in DCI
+        if column_name in metadata_df.columns
+    ]
+    string_columns = [
+        column_name for column_name in all_columns
+        if column_name not in numeric_columns and column_name not in date_columns
+    ]
+
+    filtered = metadata_df.loc[:, all_columns]

    # Parsing numerical fields
-    filtered.loc[:, GNI] = (filtered.loc[:, GNI]
-                            .apply(lambda x: pd.to_numeric(x, errors='coerce'))
-                            )
-
-    filtered.loc[:, GSI] = (filtered
-                            .loc[:, GSI]
-                            .fillna('N/A')
-                            .apply(lambda x: x.str.strip())
-                            )
+    filtered.loc[:, numeric_columns] = (
+        filtered.loc[:, numeric_columns]
+        .apply(lambda x: pd.to_numeric(x, errors='coerce'))
+    )
+
+    # Parsing date fields
+    filtered.loc[:, date_columns] = (
+        filtered.loc[:, date_columns]
+        .apply(lambda column: column.apply(parse_date))
+    )
+
+    filtered.loc[:, string_columns] = (
+        filtered
+        .loc[:, string_columns]
+        .fillna('N/A')
+        .apply(lambda x: x.str.strip())
+    )

    return filtered
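A quick sanity check of the new parsing helpers, using input in the shape mediainfo emits (the example values are illustrative):

# parse_timezone splits off the leading "UTC" marker, if any.
parse_timezone("UTC 2020-11-19 10:30:00")
# -> (timezone.utc, '2020-11-19 10:30:00')

# parse_date combines that with strptime and attaches the timezone.
parse_date("UTC 2020-11-19 10:30:00")
# -> datetime(2020, 11, 19, 10, 30, tzinfo=timezone.utc)

# Unparseable values are logged via logger.exception and yield None.
parse_date("not a date")
# -> None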
