making the evaluate tIoU average across video IDs rather than submissions
kenjihata committed Jun 30, 2017
1 parent ca56827 commit b8d9070
Showing 1 changed file with 105 additions and 59 deletions.
164 changes: 105 additions & 59 deletions evaluate.py
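The change in a nutshell: evaluate_tiou previously pooled every predicted/reference caption pair into one corpus-level score; after this commit it computes a score per ground-truth video ID and averages those. A minimal sketch of that averaging step, with a made-up per_video_scores dict standing in for the evaluator's all_scores (illustration only, not the evaluator itself):

import numpy as np

# Hypothetical per-video METEOR scores keyed by video ID (illustration only).
# A video with no prediction contributes 0, as in the updated evaluator.
per_video_scores = {'v_001': 0.21, 'v_002': 0.05, 'v_003': 0.0}

# New behaviour: average per-video scores so every video counts equally,
# instead of scoring all submissions pooled into one set of caption pairs.
average_score = np.mean(list(per_video_scores.values()))
print('averaged METEOR: %.3f' % average_score)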
@@ -16,6 +16,10 @@
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from sets import Set
import numpy as np

def remove_nonascii(text):
return ''.join([i if ord(i) < 128 else ' ' for i in text])

class ANETcaptions(object):
PREDICTION_FIELDS = ['results', 'version', 'external_data']
@@ -37,6 +41,19 @@ def __init__(self, ground_truth_filenames=None, prediction_filename=None,
self.pred_fields = prediction_fields
self.ground_truths = self.import_ground_truths(ground_truth_filenames)
self.prediction = self.import_prediction(prediction_filename)
self.tokenizer = PTBTokenizer()

# Set up scorers, if not verbose, we only use the one we're
# testing on: METEOR
if self.verbose:
self.scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(),"METEOR"),
(Rouge(), "ROUGE_L"),
(Cider(), "CIDEr")
]
else:
self.scorers = [(Meteor(), "METEOR")]

def import_prediction(self, prediction_filename):
if self.verbose:
@@ -75,6 +92,12 @@ def check_gt_exists(self, vid_id):
return True
return False

def get_gt_vid_ids(self):
vid_ids = set([])
for gt in self.ground_truths:
vid_ids |= set(gt.keys())
return list(vid_ids)

def evaluate(self):
aggregator = {}
self.scores = {}
@@ -93,91 +116,114 @@ def evaluate(self):
self.scores['Precision'].append(precision)

def evaluate_detection(self, tiou):
recall = [0] * len(self.prediction.keys())
precision = [0] * len(self.prediction.keys())
for vid_i, vid_id in enumerate(self.prediction):
gt_vid_ids = self.get_gt_vid_ids()
# Recall is the percentage of ground truth that is covered by the predictions
# Precision is the percentage of predictions that are valid
recall = [0] * len(gt_vid_ids)
precision = [0] * len(gt_vid_ids)
for vid_i, vid_id in enumerate(gt_vid_ids):
best_recall = 0
best_precision = 0
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
ref_set_covered = set([])
pred_set_covered = set([])
num_gt = 0
num_pred = 0
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred_timestamp, ref_timestamp) > tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)
if vid_id in self.prediction:
for pred_i, pred in enumerate(self.prediction[vid_id]):
pred_timestamp = pred['timestamp']
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred_timestamp, ref_timestamp) > tiou:
ref_set_covered.add(ref_i)
pred_set_covered.add(pred_i)

new_precision = float(len(pred_set_covered)) / pred_i
best_precision = max(best_precision, new_precision)
new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
new_precision = float(len(pred_set_covered)) / pred_i
best_recall = max(best_recall, new_recall)
best_precision = max(best_recall, best_precision)
recall[vid_i] = best_recall
precision[vid_i] = best_precision
return sum(recall) / len(recall), sum(precision) / len(recall)
return sum(recall) / len(recall), sum(precision) / len(precision)
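
evaluate_detection relies on self.iou, which is not part of this diff. A sketch of the temporal IoU it presumably computes, assuming timestamps are [start, end] pairs in seconds (the names below are illustrative, not the repository's own helper):

def temporal_iou(interval_a, interval_b):
    # Overlap over union of two [start, end] intervals; assumed to mirror self.iou.
    start_a, end_a = interval_a
    start_b, end_b = interval_b
    intersection = max(0.0, min(end_a, end_b) - max(start_a, start_b))
    union = (end_a - start_a) + (end_b - start_b) - intersection
    return intersection / union if union > 0 else 0.0

print(temporal_iou([1.0, 5.0], [3.0, 8.0]))  # 2 / 7, roughly 0.286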

def evaluate_tiou(self, tiou):
# For every prediction, find its respective references with tIoU > the passed-in argument.
# This method averages the tIoU precision from METEOR, Bleu, etc. across videos
res = {}
gts = {}
unique_index = 0
for vid_id in self.prediction:
if not self.check_gt_exists(vid_id):
continue
for pred in self.prediction[vid_id]:
res[unique_index] = [{'caption': pred['sentence']}]
matches = []
for gt in self.ground_truths:
if vid_id not in gt:
continue
refs = gt[vid_id]
for ref_i, ref_timestamp in enumerate(refs['timestamps']):
if self.iou(pred['timestamp'], ref_timestamp) > tiou:
matches.append(refs['sentences'][ref_i])
if len(matches) == 0:
gts[unique_index] = [{'caption': 'abc123!@#'}]
else:
gts[unique_index] = [{'caption': v} for v in matches]
unique_index += 1
gt_vid_ids = self.get_gt_vid_ids()
for vid_id in gt_vid_ids:

# Set up scorers
if self.verbose:
print '| Tokenizing ...'
# Suppressing tokenizer output
tokenizer = PTBTokenizer()
gts = tokenizer.tokenize(gts)
res = tokenizer.tokenize(res)
res[vid_id] = {}
gts[vid_id] = {}

# Set up scorers, if not verbose, we only use the one we're
# testing on: METEOR
if self.verbose:
print '| Setting up scorers ...'
scorers = [
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
(Meteor(),"METEOR"),
(Rouge(), "ROUGE_L"),
(Cider(), "CIDEr")
]
else:
scorers = [(Meteor(), "METEOR")]
# If the video does not have a prediction, then we give it no matches
# We set it to empty, and use this as a sanity check later on
if vid_id not in self.prediction:
gts[vid_id] = {}
res[vid_id] = {}

# Compute scores
# If we do have a prediction, then we find the scores based on all the
# valid tIoU overlaps
else:
unique_index = 0
cur_res = res[vid_id]
cur_gts = gts[vid_id]

# For each prediction, we look at the tIoU with ground truth
for pred in self.prediction[vid_id]:
has_added = False
for gt in self.ground_truths:
if vid_id not in gt:
continue
gt_captions = gt[vid_id]
for caption_idx, caption_timestamp in enumerate(gt_captions['timestamps']):
if self.iou(pred['timestamp'], caption_timestamp) >= tiou:

cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [{'caption': remove_nonascii(gt_captions['sentences'][caption_idx])}]
unique_index += 1
has_added = True

# If the predicted caption does not overlap with any ground truth,
# we should compare it with garbage
if not has_added:
cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
cur_gts[unique_index] = [{'caption': 'abc123!@#'}]

# Each scorer will compute across all videos and take average score
output = {}
for scorer, method in scorers:
for scorer, method in self.scorers:
if self.verbose:
print 'computing %s score...'%(scorer.method())
score, scores = scorer.compute_score(gts, res)

# For each video, take all the valid pairs (based on tIoU) and compute the score
all_scores = {}
for vid_id in gt_vid_ids:

if len(res[vid_id]) == 0 or len(gts[vid_id]) == 0:
if type(method) == list:
score = [0] * len(method)
else:
score = 0
else:
cur_res = self.tokenizer.tokenize(res[vid_id])
cur_gts = self.tokenizer.tokenize(gts[vid_id])
score, scores = scorer.compute_score(cur_gts, cur_res)
all_scores[vid_id] = score

print all_scores.values()
if type(method) == list:
for sc, scs, m in zip(score, scores, method):
output[m] = sc
scores = np.mean(all_scores.values(), axis=0)
for m in xrange(len(method)):
output[method[m]] = scores[m]
if self.verbose:
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, m, sc)
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method[m], output[method[m]])
else:
output[method] = score
output[method] = np.mean(all_scores.values())
if self.verbose:
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, score)
print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, output[method])
return output
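
The per-video scoring pattern introduced above, reduced to a skeleton. The real code tokenizes with PTBTokenizer and scores with the pycocoevalcap scorers; here a toy word-overlap score stands in for scorer.compute_score, and the captions are made up. Note how an unmatched prediction is scored against the garbage reference 'abc123!@#' so it still drags the average down:

import numpy as np

# Hypothetical captions for two videos (illustration only).
gts = {'v_001': {0: 'a man plays guitar'},
       'v_002': {0: 'abc123!@#'}}          # garbage reference for an unmatched prediction
res = {'v_001': {0: 'a man is playing a guitar'},
       'v_002': {0: 'a dog runs'}}

def toy_score(reference, candidate):
    # Stand-in for scorer.compute_score: word-overlap ratio (not METEOR).
    ref_words, cand_words = set(reference.split()), set(candidate.split())
    return len(ref_words & cand_words) / float(len(ref_words | cand_words))

# Score each video separately, then average across video IDs.
all_scores = {}
for vid_id in gts:
    all_scores[vid_id] = np.mean([toy_score(gts[vid_id][i], res[vid_id][i])
                                  for i in gts[vid_id]])
print('averaged over videos: %.3f' % np.mean(list(all_scores.values())))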

def main(args):
