Parse exif general encoded date (#137) #193

Merged
7 changes: 2 additions & 5 deletions db/access/files.py
@@ -68,9 +68,6 @@ class ListFilesResults:
class FilesDAO:
"""Data-access object for files."""

# Format in which Dates are currently stored in exif table.
_EXIF_DATE_FORMAT = " UTC %Y-%m-%d 00"

# Label for related entities count (matches, scenes, etc.)
_LABEL_COUNT = "hit_count"
_countable_match = aliased(Matches)
@@ -197,11 +194,11 @@ def _filter_date(req: ListFilesRequest, query):
"""Filter by creation date."""
if req.date_from is not None:
query = query.filter(
Files.exif.has(Exif.General_Encoded_Date >= req.date_from.strftime(FilesDAO._EXIF_DATE_FORMAT)))
Files.exif.has(Exif.General_Encoded_Date >= req.date_from))

if req.date_to is not None:
query = query.filter(
Files.exif.has(Exif.General_Encoded_Date <= req.date_to.strftime(FilesDAO._EXIF_DATE_FORMAT)))
Files.exif.has(Exif.General_Encoded_Date <= req.date_to))

return query

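Since `Exif.General_Encoded_Date` becomes a `DateTime` column in this PR (see `db/schema.py` below), the filter can compare against the request dates directly, without the old `strftime(" UTC %Y-%m-%d 00")` round-trip. A minimal sketch of the simplified comparison, assuming an in-memory SQLite session and the `Files`/`Exif` models from `db/schema.py`; the sample row and values are invented for illustration:

```python
from datetime import datetime

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from db.schema import Base, Files, Exif

# Throwaway in-memory database, just for the sketch.
engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

# Hypothetical row: exif dates are now stored as real datetime values.
session.add(Files(file_path="videos/a.mp4", sha256="hash-of-videos/a.mp4",
                  exif=Exif(General_Encoded_Date=datetime(2020, 6, 10, 16, 47))))
session.commit()

# The bound is a plain datetime; no string formatting is involved.
date_from = datetime(2020, 1, 1)
recent = session.query(Files).filter(
    Files.exif.has(Exif.General_Encoded_Date >= date_from)
).all()
print([f.file_path for f in recent])  # ['videos/a.mp4']
```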
27 changes: 13 additions & 14 deletions db/schema.py
@@ -1,9 +1,9 @@
import datetime

from sqlalchemy import Column, String, Integer, LargeBinary, Boolean, \
Float, JSON, ForeignKey, UniqueConstraint, DateTime,PrimaryKeyConstraint,event
Float, JSON, ForeignKey, UniqueConstraint, DateTime, event
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship,backref,object_session
from sqlalchemy.orm import relationship, object_session

Base = declarative_base()

@@ -19,7 +19,7 @@ class Files(Base):
signature = relationship("Signature", uselist=False, back_populates="file")
meta = relationship("VideoMetadata", uselist=False, back_populates="file")
scenes = relationship("Scene", back_populates="file")
templatematches = relationship("Templatematches", back_populates="file",cascade='all, delete-orphan')
templatematches = relationship("Templatematches", back_populates="file", cascade='all, delete-orphan')
exif = relationship("Exif", uselist=False, back_populates="file")

# TODO: find a way to merge these two relationships
@@ -74,16 +74,17 @@ class Scene(Base):
class Templatematches(Base):
__tablename__ = 'templatematches'
# __table_args__ = (UniqueConstraint('file_id', 'template_name'),)
id = Column(Integer,autoincrement=True,primary_key=True)
id = Column(Integer, autoincrement=True, primary_key=True)
file_id = Column(Integer, ForeignKey('files.id'), nullable=True)
file = relationship("Files",back_populates='templatematches')
file = relationship("Files", back_populates='templatematches')
template_name = Column(String)
distance = Column(Float)
closest_match = Column(Float)
closest_match_time = Column(String)

@event.listens_for(Files.templatematches,"remove")
def rem(state,item,initiator):

@event.listens_for(Files.templatematches, "remove")
def rem(state, item, initiator):
sess = object_session(item)

# ensure we have a session
@@ -98,8 +99,6 @@ def rem(state,item,initiator):
sess.flush([item])




class Matches(Base):
__tablename__ = 'matches'
__table_args__ = (UniqueConstraint('query_video_file_id', 'match_video_file_id', name='_matches_uc'),)
@@ -130,9 +129,9 @@ class Exif(Base):
General_OverallBitRate = Column(Float)
General_FrameRate = Column(Float)
General_FrameCount = Column(Float)
General_Encoded_Date = Column(String)
General_File_Modified_Date = Column(String)
General_File_Modified_Date_Local = Column(String)
General_Encoded_Date = Column(DateTime)
General_File_Modified_Date = Column(DateTime)
General_File_Modified_Date_Local = Column(DateTime)
General_Tagged_Date = Column(String)
Video_Format = Column(String)
Video_BitRate = Column(Float)
@@ -146,6 +145,6 @@ class Exif(Base):
Audio_BitRate = Column(Float)
Audio_Channels = Column(Float)
Audio_Duration = Column(Float)
Audio_Encoded_Date = Column(String)
Audio_Tagged_Date = Column(String)
Audio_Encoded_Date = Column(DateTime)
Audio_Tagged_Date = Column(DateTime)
Json_full_exif = Column(JSON)
7 changes: 1 addition & 6 deletions server/tests/server/test_api.py
@@ -119,7 +119,7 @@ def make_file(prefix="", length=42, ext="flv", audio=True, date=datetime.date(20
sha256 = f"hash-of-{path}"
return Files(file_path=path, sha256=sha256,
exif=Exif(General_FileExtension=ext, Audio_Duration=float(audio),
General_Encoded_Date=backend_date(date)),
General_Encoded_Date=date),
meta=VideoMetadata(video_length=length),
scenes=[Scene(start_time=start, duration=duration) for start, duration in scenes])

@@ -132,11 +132,6 @@ def make_files(count, prefix="", length=42, ext="flv", audio=True, date=datetime
]


def backend_date(date):
"""Convert date to format utilized in the backend."""
return date.strftime(" UTC %Y-%m-%d 00")


def param_date(date):
"""Convert date to REST API parameter format."""
return date.strftime("%Y-%m-%d")
3 changes: 1 addition & 2 deletions web/src/server-api/Server/Transform.js
@@ -1,5 +1,4 @@
import { randomObjects } from "../MockServer/fake-data/objects";
import { parse as parseDate } from "date-fns";

/**
* Data-transfer object and internal data format may evolve independently, the
@@ -48,7 +47,7 @@ export default class Transform {
if (value == null) {
return null;
}
return parseDate(value, "'UTC' yyyy-MM-dd HH", new Date());
return new Date(value * 1000);
}

fileMetadata(data) {
1 change: 0 additions & 1 deletion winnow/storage/db_result_storage.py
@@ -37,7 +37,6 @@ def wrapped(*args, **kwargs):
logger.debug(f"{func.__name__}(...) took {end - start:5.3} seconds")
return result

wrapped.__name__ = func.__name__
return wrapped


97 changes: 68 additions & 29 deletions winnow/utils/metadata_extraction.py
@@ -1,12 +1,15 @@
import logging
import os
import shlex
import subprocess
from collections import defaultdict
import pandas as pd
from pandas.io.json import json_normalize
from datetime import datetime, timezone

import cv2
import os
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

logger = logging.getLogger(__name__)


@@ -35,9 +38,9 @@ def process_media_info(info):
if ':' not in line:
section = line
else:
key, val, *_ = line.split(':')
key, val = line.split(':', maxsplit=1)
section, key = section.strip(), key.strip()
metadata[section][key] = val
metadata[section][key] = val.lstrip()
return dict(metadata)
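The switch to `split(':', maxsplit=1)` matters precisely for date values, which contain colons themselves. A small standalone illustration that loosely follows the parsing loop above; the sample text mimics `mediainfo` output and is invented for the example:

```python
from collections import defaultdict

sample = """General
Encoded date      : UTC 2019-06-10 16:47:44
Frame rate        : 30.000 FPS"""

metadata = defaultdict(dict)
section = None
for line in sample.splitlines():
    if not line.strip():
        continue
    if ':' not in line:
        section = line
    else:
        # maxsplit=1 keeps everything after the first ':' together, so
        # "UTC 2019-06-10 16:47:44" is no longer truncated to "UTC 2019-06-10 16".
        key, val = line.split(':', maxsplit=1)
        section, key = section.strip(), key.strip()
        metadata[section][key] = val.lstrip()

print(dict(metadata))
# {'General': {'Encoded date': 'UTC 2019-06-10 16:47:44', 'Frame rate': '30.000 FPS'}}
```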


@@ -172,33 +175,69 @@ def convert_to_df(video_metadata):
'Audio_Channels',
'Audio_Duration']

# Date column of interest
DCI = [
'General_Encoded_Date',
'General_File_Modified_Date',
'General_File_Modified_Date_Local',
'Audio_Encoded_Date',
'Audio_Tagged_Date']

# Exif date format
_EXIF_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def parse_timezone(str_value):
if str_value.startswith("UTC"):
return timezone.utc, str_value[3:].lstrip()
return None, str_value

def parse_and_filter_metadata_df(metadata_df):

GCI = [
x for x in CI
if x in metadata_df.columns
]
GNI = [
x for x in NCI
if x in metadata_df.columns
]
GSI = [
x for x in GCI
if x not in GNI
]

filtered = metadata_df.loc[:, GCI]
def parse_date(str_value: str):
try:
time_zone, str_value = parse_timezone(str_value)
return datetime.strptime(str_value, _EXIF_DATE_FORMAT).replace(tzinfo=time_zone)
except ValueError:
logger.exception("Cannot parse exif date")
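For reference, this is roughly how the two helpers behave on typical mediainfo date strings; the sample values are invented and assume the "UTC " prefix convention handled by `parse_timezone`:

```python
# Assuming the helpers above are importable from this module.
from winnow.utils.metadata_extraction import parse_date, parse_timezone

print(parse_timezone("UTC 2019-06-10 16:47:44"))
# (datetime.timezone.utc, '2019-06-10 16:47:44')

print(parse_date("UTC 2019-06-10 16:47:44"))
# 2019-06-10 16:47:44+00:00  (timezone-aware)

print(parse_date("2019-06-10 16:47:44"))
# 2019-06-10 16:47:44  (naive: no timezone prefix present)

print(parse_date("not a date"))
# None, and the failure is logged via logger.exception
```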


def parse_and_filter_metadata_df(metadata_df):
all_columns = [
column_name for column_name in CI
if column_name in metadata_df.columns
]
numeric_columns = [
column_name for column_name in NCI
if column_name in metadata_df.columns
]
date_columns = [
column_name for column_name in DCI
if column_name in metadata_df.columns
]
string_columns = [
column_name for column_name in all_columns
if column_name not in numeric_columns and column_name not in date_columns
]

filtered = metadata_df.loc[:, all_columns]

# Parsing numerical fields
filtered.loc[:, GNI] = (filtered.loc[:, GNI]
.apply(lambda x: pd.to_numeric(x, errors='coerce'))
)

filtered.loc[:, GSI] = (filtered
.loc[:, GSI]
.fillna('N/A')
.apply(lambda x: x.str.strip())
)
filtered.loc[:, numeric_columns] = (
filtered.loc[:, numeric_columns]
.apply(lambda x: pd.to_numeric(x, errors='coerce'))
)

# Parsing date fields
filtered.loc[:, date_columns] = (
filtered.loc[:, date_columns]
.apply(lambda column: column.apply(parse_date))
)

filtered.loc[:, string_columns] = (
filtered
.loc[:, string_columns]
.fillna('N/A')
.apply(lambda x: x.str.strip())
)

return filtered
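A rough usage sketch of the reworked filtering, assuming `General_FrameRate`, `General_Encoded_Date` and `General_FileExtension` appear in `NCI`, `DCI` and `CI` respectively; the input values are invented:

```python
import pandas as pd

from winnow.utils.metadata_extraction import parse_and_filter_metadata_df

raw = pd.DataFrame({
    "General_FrameRate": ["30.000", None],
    "General_Encoded_Date": ["UTC 2019-06-10 16:47:44", "2020-01-01 00:00:00"],
    "General_FileExtension": [" mp4 ", None],
})

parsed = parse_and_filter_metadata_df(raw)
# General_FrameRate     -> 30.0, NaN                      (pd.to_numeric, errors='coerce')
# General_Encoded_Date  -> tz-aware / naive datetimes     (parse_date)
# General_FileExtension -> 'mp4', 'N/A'                   (fillna('N/A') + str.strip())
print(parsed.dtypes)
```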