From b8d90707984bf9c99454ba82b089006f14fb62b3 Mon Sep 17 00:00:00 2001
From: Kenji Hata
Date: Fri, 30 Jun 2017 10:06:54 +0100
Subject: [PATCH] Make evaluate_tiou average across video IDs rather than
 submissions

---
 evaluate.py | 164 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 105 insertions(+), 59 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index aa791ca..5d73eb8 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -16,6 +16,10 @@ from pycocoevalcap.rouge.rouge import Rouge
 from pycocoevalcap.cider.cider import Cider
 from sets import Set
 
+import numpy as np
+
+def remove_nonascii(text):
+    return ''.join([i if ord(i) < 128 else ' ' for i in text])
 
 class ANETcaptions(object):
     PREDICTION_FIELDS = ['results', 'version', 'external_data']
@@ -37,6 +41,19 @@ def __init__(self, ground_truth_filenames=None, prediction_filename=None,
         self.pred_fields = prediction_fields
         self.ground_truths = self.import_ground_truths(ground_truth_filenames)
         self.prediction = self.import_prediction(prediction_filename)
+        self.tokenizer = PTBTokenizer()
+
+        # Set up scorers. If not verbose, we only use the one we are
+        # testing on: METEOR
+        if self.verbose:
+            self.scorers = [
+                (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
+                (Meteor(), "METEOR"),
+                (Rouge(), "ROUGE_L"),
+                (Cider(), "CIDEr")
+            ]
+        else:
+            self.scorers = [(Meteor(), "METEOR")]
 
     def import_prediction(self, prediction_filename):
         if self.verbose:
@@ -75,6 +92,12 @@ def check_gt_exists(self, vid_id):
                 return True
         return False
 
+    def get_gt_vid_ids(self):
+        vid_ids = set([])
+        for gt in self.ground_truths:
+            vid_ids |= set(gt.keys())
+        return list(vid_ids)
+
     def evaluate(self):
         aggregator = {}
         self.scores = {}
@@ -93,91 +116,114 @@
             self.scores['Precision'].append(precision)
 
     def evaluate_detection(self, tiou):
-        recall = [0] * len(self.prediction.keys())
-        precision = [0] * len(self.prediction.keys())
-        for vid_i, vid_id in enumerate(self.prediction):
+        gt_vid_ids = self.get_gt_vid_ids()
+        # Recall is the percentage of ground truth that is covered by the predictions
+        # Precision is the percentage of predictions that are valid
+        recall = [0] * len(gt_vid_ids)
+        precision = [0] * len(gt_vid_ids)
+        for vid_i, vid_id in enumerate(gt_vid_ids):
             best_recall = 0
             best_precision = 0
             for gt in self.ground_truths:
+                if vid_id not in gt:
+                    continue
                 refs = gt[vid_id]
                 ref_set_covered = set([])
                 pred_set_covered = set([])
                 num_gt = 0
                 num_pred = 0
-                for pred_i, pred in enumerate(self.prediction[vid_id]):
-                    pred_timestamp = pred['timestamp']
-                    for ref_i, ref_timestamp in enumerate(refs['timestamps']):
-                        if self.iou(pred_timestamp, ref_timestamp) > tiou:
-                            ref_set_covered.add(ref_i)
-                            pred_set_covered.add(pred_i)
+                if vid_id in self.prediction:
+                    for pred_i, pred in enumerate(self.prediction[vid_id]):
+                        pred_timestamp = pred['timestamp']
+                        for ref_i, ref_timestamp in enumerate(refs['timestamps']):
+                            if self.iou(pred_timestamp, ref_timestamp) > tiou:
+                                ref_set_covered.add(ref_i)
+                                pred_set_covered.add(pred_i)
+
+                    new_precision = float(len(pred_set_covered)) / (pred_i + 1)
+                    best_precision = max(best_precision, new_precision)
 
                 new_recall = float(len(ref_set_covered)) / len(refs['timestamps'])
-                new_precision = float(len(pred_set_covered)) / pred_i
                 best_recall = max(best_recall, new_recall)
-                best_precision = max(best_recall, best_precision)
             recall[vid_i] = best_recall
             precision[vid_i] = best_precision
-        return sum(recall) / len(recall), sum(precision) / len(recall)
+        return sum(recall) / len(recall), sum(precision) / len(precision)
     def evaluate_tiou(self, tiou):
-        # For every prediction, find it's respective references with tIoU > the passed in argument.
+        # For each video, pair every prediction with the references it overlaps
+        # at tIoU above the threshold, then average the scores across videos
         res = {}
         gts = {}
-        unique_index = 0
-        for vid_id in self.prediction:
-            if not self.check_gt_exists(vid_id):
-                continue
-            for pred in self.prediction[vid_id]:
-                res[unique_index] = [{'caption': pred['sentence']}]
-                matches = []
-                for gt in self.ground_truths:
-                    if vid_id not in gt:
-                        continue
-                    refs = gt[vid_id]
-                    for ref_i, ref_timestamp in enumerate(refs['timestamps']):
-                        if self.iou(pred['timestamp'], ref_timestamp) > tiou:
-                            matches.append(refs['sentences'][ref_i])
-                if len(matches) == 0:
-                    gts[unique_index] = [{'caption': 'abc123!@#'}]
-                else:
-                    gts[unique_index] = [{'caption': v} for v in matches]
-                unique_index += 1
+        gt_vid_ids = self.get_gt_vid_ids()
+        for vid_id in gt_vid_ids:
 
-        # Set up scorers
-        if self.verbose:
-            print '| Tokenizing ...'
+            res[vid_id] = {}
+            gts[vid_id] = {}
 
-        # Suppressing tokenizer output
-        tokenizer = PTBTokenizer()
-        gts = tokenizer.tokenize(gts)
-        res = tokenizer.tokenize(res)
+            # If the video does not have a prediction, then we give it no matches
+            # We set it to empty, and use this as a sanity check later on
+            if vid_id not in self.prediction:
+                gts[vid_id] = {}
+                res[vid_id] = {}
 
-        # Set up scorers, if not verbose, we only use the one we're
-        # testing on: METEOR
-        if self.verbose:
-            print '| Setting up scorers ...'
-            scorers = [
-                (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
-                (Meteor(),"METEOR"),
-                (Rouge(), "ROUGE_L"),
-                (Cider(), "CIDEr")
-            ]
-        else:
-            scorers = [(Meteor(), "METEOR")]
+            # If we do have a prediction, then we find the scores based on all the
+            # valid tIoU overlaps
+            else:
+                unique_index = 0
+                cur_res = res[vid_id]
+                cur_gts = gts[vid_id]
+
+                # For each prediction, we look at the tIoU with the ground truth
+                for pred in self.prediction[vid_id]:
+                    has_added = False
+                    for gt in self.ground_truths:
+                        if vid_id not in gt:
+                            continue
+                        gt_captions = gt[vid_id]
+                        for caption_idx, caption_timestamp in enumerate(gt_captions['timestamps']):
+                            if self.iou(pred['timestamp'], caption_timestamp) >= tiou:
+                                cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
+                                cur_gts[unique_index] = [{'caption': remove_nonascii(gt_captions['sentences'][caption_idx])}]
+                                unique_index += 1
+                                has_added = True
+
+                    # If the predicted caption does not overlap with any ground truth,
+                    # we compare it against garbage instead
+                    if not has_added:
+                        cur_res[unique_index] = [{'caption': remove_nonascii(pred['sentence'])}]
+                        cur_gts[unique_index] = [{'caption': 'abc123!@#'}]
 
-        # Compute scores
+        # Each scorer computes a score per video; we then average across videos
         output = {}
-        for scorer, method in scorers:
+        for scorer, method in self.scorers:
             if self.verbose:
                 print 'computing %s score...'%(scorer.method())
-            score, scores = scorer.compute_score(gts, res)
+
+            # For each video, take all the valid pairs (based on tIoU) and compute the score
+            all_scores = {}
+            for vid_id in gt_vid_ids:
+                if len(res[vid_id]) == 0 or len(gts[vid_id]) == 0:
+                    if type(method) == list:
+                        score = [0] * len(method)
+                    else:
+                        score = 0
+                else:
+                    cur_res = self.tokenizer.tokenize(res[vid_id])
+                    cur_gts = self.tokenizer.tokenize(gts[vid_id])
+                    score, scores = scorer.compute_score(cur_gts, cur_res)
+                all_scores[vid_id] = score
+
             if type(method) == list:
-                for sc, scs, m in zip(score, scores, method):
-                    output[m] = sc
+                scores = np.mean(all_scores.values(), axis=0)
+                for m in xrange(len(method)):
+                    output[method[m]] = scores[m]
                     if self.verbose:
-                        print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, m, sc)
+                        print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method[m], output[method[m]])
             else:
-                output[method] = score
+                output[method] = np.mean(all_scores.values())
                 if self.verbose:
-                    print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, score)
+                    print "Calculated tIoU: %1.1f, %s: %0.3f" % (tiou, method, output[method])
         return output
 
 def main(args):
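
Note: both evaluate_detection and evaluate_tiou rely on self.iou, which this patch does not touch. As a reference, here is a minimal sketch of a typical temporal IoU, assuming each timestamp is a [start, end] pair in seconds; the function name and interval format follow the call sites above, not a verified copy of the upstream helper:

    def iou(interval_1, interval_2):
        # Temporal intersection-over-union of two [start, end] intervals.
        start_1, end_1 = interval_1
        start_2, end_2 = interval_2
        intersection = max(0.0, min(end_1, end_2) - max(start_1, start_2))
        union = (end_1 - start_1) + (end_2 - start_2) - intersection
        return intersection / union if union > 0 else 0.0

    iou([10.0, 25.0], [15.0, 30.0])  # intersection 10, union 20 -> 0.5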
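The behavioral change named in the subject line is the final aggregation: scores are now collected per ground-truth video ID and averaged with np.mean, instead of being computed once over the pooled set of submitted captions. A small self-contained sketch of that aggregation step, using made-up video IDs and Bleu_1..Bleu_4 score vectors:

    import numpy as np

    # Hypothetical per-video scores: one Bleu_1..Bleu_4 vector per ground-truth video.
    all_scores = {'video_1': [0.80, 0.52, 0.31, 0.15],
                  'video_2': [0.60, 0.40, 0.25, 0.11]}

    # Average each metric across videos, as the patch does for list-valued methods.
    # Under Python 2, dict.values() is already a list; Python 3 would need
    # list(all_scores.values()).
    scores = np.mean(all_scores.values(), axis=0)  # array([0.70, 0.46, 0.28, 0.13])

One consequence of this scheme is that a video with no prediction contributes a hard zero to all_scores, so missing submissions pull the average down rather than being skipped.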