Skip to content


making the evaluate tiou average across video id's rather than submis…
Browse files Browse the repository at this point in the history
  • Loading branch information
kenjihata committed Jun 30, 2017
1 parent ca56827 commit b8d9070
Showing 1 changed file with 105 additions and 59 deletions.
164 changes: 105 additions & 59 deletions
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from sets import Set
import numpy as np

def remove_nonascii(text):
    """Return ``text`` with each non-ASCII character replaced by a space.

    Every character whose ``ord`` value is 128 or above becomes a single
    ' '; all other characters are kept unchanged and in order.
    """
    cleaned = []
    for ch in text:
        cleaned.append(ch if ord(ch) < 128 else ' ')
    return ''.join(cleaned)

class ANETcaptions(object):
PREDICTION_FIELDS = ['results', 'version', 'external_data']
Expand All @@ -37,6 +41,19 @@ def __init__(self, ground_truth_filenames=None, prediction_filename=None,
self.pred_fields = prediction_fields
self.ground_truths = self.import_ground_truths(ground_truth_filenames)
self.prediction = self.import_prediction(prediction_filename)
self.tokenizer = PTBTokenizer()

# Set up scorers, if not verbose, we only use the one we're
# testing on: METEOR
if self.verbose:
self.scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Rouge(), "ROUGE_L"),
(Cider(), "CIDEr")
self.scorers = [(Meteor(), "METEOR")]

def import_prediction(self, prediction_filename):
if self.verbose:
Expand Down Expand Up @@ -75,6 +92,12 @@ def check_gt_exists(self, vid_id):
return True
return False

def get_gt_vid_ids(self):
vid_ids = set([])
for gt in self.ground_truths:
vid_ids |= set(gt.keys())
return list(vid_ids)

def evaluate(self):
aggregator = {}
self.scores = {}
Expand All @@ -93,91 +116,114 @@ def evaluate(self):

def evaluate_detection(self, tiou):
recall = [0] * len(self.prediction.keys())
precision = [0] * len(self.prediction.keys())
for vid_i, vid_id in enumerate(self.prediction):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = [0] * len(gt_vid_ids)
precision = [0] * len(gt_vid_ids)
for vid_i, vid_id in enumerate(gt_vid_ids):
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred_timestamp, ref_timestamp) > tiou:
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred_timestamp, ref_timestamp) > tiou:

new_precision = float(len(pred_set_covered)) / pred_i
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
new_precision = float(len(pred_set_covered)) / pred_i
best_recall = max(best_recall, new_recall)
best_precision = max(best_recall, best_precision)
recall[vid_i] = best_recall
precision[vid_i] = best_precision
return sum(recall) / len(recall), sum(precision) / len(recall)
return sum(recall) / len(recall), sum(precision) / len(precision)

def evaluate_tiou(self, tiou):
# For every prediction, find its respective references with tIoU > the passed in argument.
# This method averages the tIoU precision from METEOR, Bleu, etc. across videos
res = {}
gts = {}
unique_index = 0
for vid_id in self.prediction:
if not self.check_gt_exists(vid_id):
for pred in self.prediction[vid_id]:
res[unique_index] = [{'caption': pred['sentence']}]
matches = []
for gt in self.ground_truths:
if vid_id not in gt:
refs = gt[vid_id]
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred['timestamp'], ref_timestamp) > tiou:
if len(matches) == 0:
gts[unique_index] = [{'caption': 'abc123!@#'}]
gts[unique_index] = [{'caption': v} for v in matches]
unique_index += 1
gt_vid_ids = self.get_gt_vid_ids()
for vid_id in gt_vid_ids:

# Set up scorers
if self.verbose:
print '| Tokenizing ...'
# Suppressing tokenizer output
tokenizer = PTBTokenizer()
gts = tokenizer.tokenize(gts)
res = tokenizer.tokenize(res)
res[vid_id] = {}
gts[vid_id] = {}

# Set up scorers, if not verbose, we only use the one we're
# testing on: METEOR
if self.verbose:
print '| Setting up scorers ...'
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Rouge(), "ROUGE_L"),
(Cider(), "CIDEr")
scorers = [(Meteor(), "METEOR")]
# If the video does not have a prediction, then we give it no matches
# We set it to empty, and use this as a sanity check later on
if vid_id not in self.prediction:
gts[vid_id] = {}
res[vid_id] = {}

# Compute scores
# If we do have a prediction, then we find the scores based on all the
# valid tIoU overlaps
unique_index = 0
cur_res = res[vid_id]
cur_gts = gts[vid_id]

# For each prediction, we look at the tIoU with ground truth
for pred in self.prediction[vid_id]:
has_added = False
for gt in self.ground_truths:
if vid_id not in gt:
gt_captions = gt[vid_id]
for caption_idx, caption_timestamp in enumerate(gt_captions['timestamps']):
if self.iou(pred['timestamp'], caption_timestamp) >= tiou:

cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [{'caption': remove_nonascii(gt_captions['sentences'][caption_idx])}]
unique_index += 1
has_added = True

# If the predicted caption does not overlap with any ground truth,
# we should compare it with garbage
if not has_added:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [{'caption': 'abc123!@#'}]

# Each scorer will compute across all videos and take average score
output = {}
for scorer, method in scorers:
for scorer, method in self.scorers:
if self.verbose:
print 'computing %s score...'%(scorer.method())
score, scores = scorer.compute_score(gts, res)

# For each video, take all the valid pairs (based on tIoU) and compute the score
all_scores = {}
for vid_id in gt_vid_ids:

if len(res[vid_id]) == 0 or len(gts[vid_id]) == 0:
if type(method) == list:
score = [0] * len(method)
score = 0
cur_res = self.tokenizer.tokenize(res[vid_id])
cur_gts = self.tokenizer.tokenize(gts[vid_id])
score, scores = scorer.compute_score(cur_gts, cur_res)
all_scores[vid_id] = score

print all_scores.values()
if type(method) == list:
for sc, scs, m in zip(score, scores, method):
output[m] = sc
scores = np.mean(all_scores.values(), axis=0)
for m in xrange(len(method)):
output[method[m]] = scores[m]
if self.verbose:
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, m, sc)
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method[m], output[method[m]])
output[method] = score
output[method] = np.mean(all_scores.values())
if self.verbose:
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, score)
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, output[method])
return output

def main(args):
Expand Down

0 comments on commit b8d9070

Please sign in to comment.