From 933ad54e103e55061facf62cb2d0fe8800a00b69 Mon Sep 17 00:00:00 2001 From: Weikai Date: Sun, 4 Aug 2024 15:38:13 -0700 Subject: [PATCH 1/8] add tma_image_random --- vlmeval/dataset/image_mcq.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 4875a5cfd..34e1c0077 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -52,6 +52,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv', 'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv', + 'TaskMeAnything_v1_ImageQA': 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/blob/main/TaskMeAnything-v1-imageqa-random.tsv' } DATASET_MD5 = { @@ -97,6 +98,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', + 'TaskMeAnything_v1_ImageQA': '14cce4225839c4b2dbc68cee86f173d7' } def build_prompt(self, line): From 7ee78c89be8bff4c55e5389ceb068f42f0fb7e3f Mon Sep 17 00:00:00 2001 From: Weikai Date: Sun, 4 Aug 2024 15:54:59 -0700 Subject: [PATCH 2/8] fix_name --- vlmeval/dataset/image_mcq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 34e1c0077..745b1b8e0 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -52,7 +52,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv', 'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv', - 'TaskMeAnything_v1_ImageQA': 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/blob/main/TaskMeAnything-v1-imageqa-random.tsv' + 'TaskMeAnything_v1_imageqa_random': 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/blob/main/TaskMeAnything-v1-imageqa-random.tsv' } DATASET_MD5 = { @@ -98,7 +98,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', - 'TaskMeAnything_v1_ImageQA': '14cce4225839c4b2dbc68cee86f173d7' + 'TaskMeAnything_v1_imageqa_random': '14cce4225839c4b2dbc68cee86f173d7' } def build_prompt(self, line): From 1bde603491b8132fb83acc8e9290436cc1922122 Mon Sep 17 00:00:00 2001 From: JieyuZ2 Date: Sun, 4 Aug 2024 22:30:18 -0700 Subject: [PATCH 3/8] fix bug --- vlmeval/dataset/image_mcq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 745b1b8e0..46ea1837b 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -52,7 +52,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'MLLMGuard_DS': 'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv', 'BLINK': 'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv', - 'TaskMeAnything_v1_imageqa_random': 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/blob/main/TaskMeAnything-v1-imageqa-random.tsv' + 'TaskMeAnything_v1_imageqa_random': 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/resolve/main/TaskMeAnything-v1-imageqa-random.tsv' } DATASET_MD5 = { From beb19d7672ab74bd9f40c7a6f2ce54d3dd60b6b2 Mon Sep 17 00:00:00 2001 From: Weikai Date: Mon, 5 Aug 2024 12:26:42 -0700 Subject: [PATCH 4/8] new_md5 --- vlmeval/dataset/image_mcq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 46ea1837b..5cf7f8d8d 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -98,7 +98,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', - 'TaskMeAnything_v1_imageqa_random': '14cce4225839c4b2dbc68cee86f173d7' + 'TaskMeAnything_v1_imageqa_random': '14ec959e7955a33efeac2099b26cafd9' } def build_prompt(self, line): From a4f0b303afc5879fabcfb93846882434a818133d Mon Sep 17 00:00:00 2001 From: Weikai Date: Mon, 5 Aug 2024 12:33:19 -0700 Subject: [PATCH 5/8] fix md5 --- vlmeval/dataset/image_mcq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 5cf7f8d8d..1a26e5319 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -98,7 +98,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', - 'TaskMeAnything_v1_imageqa_random': '14ec959e7955a33efeac2099b26cafd9' + 'TaskMeAnything_v1_imageqa_random': '93b7290b447ef947f3b3abae5ad4bc1b' } def build_prompt(self, line): From db9e3c2f79b9ab7cb9cf0dbf13ef12807d51082b Mon Sep 17 00:00:00 2001 From: Weikai Date: Mon, 5 Aug 2024 12:45:16 -0700 Subject: [PATCH 6/8] support for video mcq and with TaskMeAnything-v1-videoqa-random --- vlmeval/dataset/__init__.py | 3 +- vlmeval/dataset/utils/video_mcq_utils.py | 14 +++ vlmeval/dataset/video_base.py | 94 +++++++++++--- vlmeval/dataset/video_mcq.py | 153 +++++++++++++++++++++++ 4 files changed, 246 insertions(+), 18 deletions(-) create mode 100644 vlmeval/dataset/utils/video_mcq_utils.py create mode 100644 vlmeval/dataset/video_mcq.py diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py index 980f8b488..8b98774f5 100644 --- a/vlmeval/dataset/__init__.py +++ b/vlmeval/dataset/__init__.py @@ -11,6 +11,7 @@ from .mmbench_video import MMBenchVideo from .text_mcq import CustomTextMCQDataset, TextMCQDataset from .videomme import VideoMME +from .video_mcq import VideoMCQDataset from .utils import * from ..smp import * @@ -23,7 +24,7 @@ ] VIDEO_DATASET = [ - MMBenchVideo, VideoMME + MMBenchVideo, VideoMME, VideoMCQDataset ] TEXT_DATASET = [ diff --git a/vlmeval/dataset/utils/video_mcq_utils.py b/vlmeval/dataset/utils/video_mcq_utils.py new file mode 100644 index 000000000..ed1e68d82 --- /dev/null +++ b/vlmeval/dataset/utils/video_mcq_utils.py @@ -0,0 +1,14 @@ +import base64 + +# video_mcq use base64 for mp4 video encoding and decoding. +# using this code to convert mp4 to base64 +def mp4_to_base64(mp4_path): + try: + with open(mp4_path, "rb") as video_file: + video_data = video_file.read() + base64_encoded_video = base64.b64encode(video_data).decode('utf-8') + return base64_encoded_video + except FileNotFoundError: + return "The file was not found." + except Exception as e: + return f"An error occurred: {e}" \ No newline at end of file diff --git a/vlmeval/dataset/video_base.py b/vlmeval/dataset/video_base.py index d2e0b4a51..20a4ca61c 100644 --- a/vlmeval/dataset/video_base.py +++ b/vlmeval/dataset/video_base.py @@ -1,6 +1,9 @@ from abc import abstractmethod from ..smp import * +def video_root_map(dataset): + return dataset + class VideoBaseDataset: @@ -8,29 +11,62 @@ class VideoBaseDataset: def __init__(self, dataset='MMBench-Video', - pack=False): + pack=False, skip_novideo=True): try: import decord except: warnings.warn('Please install decord via `pip install decord`.') - self.dataset_name = dataset - ret = self.prepare_dataset(dataset) - assert ret is not None - lmu_root = LMUDataRoot() - self.frame_root = osp.join(lmu_root, 'images', dataset) - os.makedirs(self.frame_root, exist_ok=True) - self.frame_tmpl = 'frame-{}-of-{}.jpg' + # the init for previous two video dataset + if dataset in ['MMBench-Video', 'Video-MME']: + self.dataset_name = dataset + ret = self.prepare_dataset(dataset) + assert ret is not None + lmu_root = LMUDataRoot() + self.frame_root = osp.join(lmu_root, 'images', dataset) + os.makedirs(self.frame_root, exist_ok=True) + self.frame_tmpl = 'frame-{}-of-{}.jpg' + + self.data_root = ret['root'] + self.data_file = ret['data_file'] + self.data = load(self.data_file) + + assert 'question' in self.data and 'video' in self.data + videos = list(set(self.data['video'])) + videos.sort() + self.videos = videos + self.pack = pack + + # dataset init without prepare_dataset, just like image_base + else: + lmu_root = LMUDataRoot() + # You can override this variable to save image files to a different directory + self.dataset_name = dataset + self.frame_root = osp.join(lmu_root, 'images', dataset) + self.frame_tmpl = 'frame-{}-of-{}.jpg' + data, data_root = self.load_data(dataset) + self.data_root = data_root + self.meta_only = True + self.skip_novideo = skip_novideo + if skip_novideo and 'video' in data: + data = data[~pd.isna(data['video'])] + + data['index'] = [str(x) for x in data['index']] + data['index'] = [str(x) for x in data['index']] - self.data_root = ret['root'] - self.data_file = ret['data_file'] - self.data = load(self.data_file) + if 'video' in data: + self.meta_only = False + + if 'video_path' in data: + paths = [toliststr(x) for x in data['video_path']] + data['video_path'] = [x[0] if len(x) == 1 else x for x in paths] + + if np.all([istype(x, int) for x in data['index']]): + data['index'] = [int(x) for x in data['index']] + + self.data = data + self.post_build(dataset) - assert 'question' in self.data and 'video' in self.data - videos = list(set(self.data['video'])) - videos.sort() - self.videos = videos - self.pack = pack def __len__(self): return len(self.videos) if self.pack else len(self.data) @@ -43,6 +79,29 @@ def __getitem__(self, idx): else: assert idx < len(self.data) return dict(self.data.iloc[idx]) + + def load_data(self, dataset): + url = self.DATASET_URL[dataset] + file_md5 = self.DATASET_MD5[dataset] + return self.prepare_tsv(url, file_md5) + + def prepare_tsv(self, url, file_md5=None): + data_root = LMUDataRoot() + os.makedirs(data_root, exist_ok=True) + update_flag = False + file_name = url.split('/')[-1] + data_path = osp.join(data_root, file_name) + if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5): + pass + else: + warnings.warn('The dataset tsv is not downloaded') + download_file(url, data_path) + update_flag = True + + return load(data_path), data_root + + def post_build(self, dataset): + pass def frame_paths(self, video, num_frames=8): frame_root = osp.join(self.frame_root, video) @@ -68,7 +127,8 @@ def save_video_frames(self, video, num_frames=8): # Return a list of dataset names that are supported by this class, can override @classmethod def supported_datasets(cls): - return ['MMBench-Video', 'Video-MME'] + return ['MMBench-Video', 'Video-MME'] + list(cls.DATASET_URL) + # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe @abstractmethod diff --git a/vlmeval/dataset/video_mcq.py b/vlmeval/dataset/video_mcq.py new file mode 100644 index 000000000..f283ff782 --- /dev/null +++ b/vlmeval/dataset/video_mcq.py @@ -0,0 +1,153 @@ +import os +import string + +# uuid is for generate random video file name to save the video +import uuid +from .utils import build_judge, DEBUG_MESSAGE +import pandas as pd + +from ..smp import * +from .video_base import VideoBaseDataset + + +def combine_images(self): + pass + + +class VideoMCQDataset(VideoBaseDataset): + + TYPE = 'MCQ' + + DATASET_URL = { + # TaskMeAnything_v1_videoqa + 'TaskMeAnything_v1_videoqa_random': 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random/resolve/main/TaskMeAnything-v1-videoqa-random.tsv' + # Other Benchmarks + } + + DATASET_MD5 = { + # TaskMeAnything_v1_videoqa + 'TaskMeAnything_v1_videoqa_random': "627cb1409a98d3cc4f28928c2e0efdde" + # Other Benchmarks + } + + def base64_to_mp4(self, base64_string): + video_name = str(uuid.uuid4()) + video_path = os.path.join(self.data_root, video_name + '.mp4') + with open(video_path, 'wb') as f: + f.write(base64.b64decode(base64_string)) + return video_name, video_path + + + def build_prompt(self, line, num_frames: int, video_llm: bool, is_combine_images: bool=False): + # if line is an index, get the line from the data + if isinstance(line, int): + line = self.data.iloc[line] + + # the video stored in data should be a binary stream format + video_name, video_path = self.base64_to_mp4(line['video']) + message = [] + # setup default video or frames for ImageQA model or VideoQA model + if video_llm: + message.append(dict(type='video', value=video_path)) + elif is_combine_images: + # combine images means that combine all the frames into one image, instead of provide a sequences of image. + # This is useful for some models that only accept one image as input. + # And I was surprised to find that most of the time, ImageQA models perform better with a combined image instead of a sequence of frames. + frame_paths = self.save_video_frames(video_name, num_frames) + combined_image = combine_images(frame_paths) + message.append(dict(type='image', value=combined_image)) + else: + frame_paths = self.save_video_frames(video_name, num_frames) + for im in frame_paths: + message.append(dict(type='image', value=im)) + + # setup default prompt for MCQ + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'Question: {question}\n' + if len(options): + prompt += options_prompt + prompt += 'Please select the correct answer from the options above. \n' + message.append(dict(type='text', value=prompt)) + return message + + def evaluate(self, eval_file, **judge_kwargs): + from .utils.multiple_choice import ( + mcq_circular_eval, + mcq_vanilla_eval, + report_acc, + report_acc_MMT, + ) + + dataset = self.dataset_name + + nproc = judge_kwargs.pop('nproc', 4) + + circular = False + + + suffix = eval_file.split('.')[-1] + model = judge_kwargs.get('model', 'exact_matching') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'} + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') + + data = load(eval_file) + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + # If not choice label, then use lower case + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + meta = self.data + meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])} + data_map = {x: y for x, y in zip(data['index'], data['question'])} + for k in data_map: + assert k in meta_q_map, ( + f'eval_file should be the same as or a subset of dataset {self.dataset_name}' + ) + + if circular: + data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name) + else: + data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name) + + # load split + dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) + + # May have different report acc functions for different datasets + if 'MMT' in dataset: + acc = report_acc_MMT(data) + else: + acc = report_acc(data) + + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + dump(acc, score_file) + + return acc \ No newline at end of file From a9d279f1176f1e8f46b9d72b6f80bd828e2823e4 Mon Sep 17 00:00:00 2001 From: Weikai Date: Mon, 5 Aug 2024 21:48:56 -0700 Subject: [PATCH 7/8] fix md5 bugs --- vlmeval/dataset/image_mcq.py | 2 +- vlmeval/dataset/video_mcq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 8d9aaab81..256ef213a 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -100,7 +100,7 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', - 'TaskMeAnything_v1_imageqa_random': '93b7290b447ef947f3b3abae5ad4bc1b' + 'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889' } def build_prompt(self, line): diff --git a/vlmeval/dataset/video_mcq.py b/vlmeval/dataset/video_mcq.py index f283ff782..20727756f 100644 --- a/vlmeval/dataset/video_mcq.py +++ b/vlmeval/dataset/video_mcq.py @@ -26,7 +26,7 @@ class VideoMCQDataset(VideoBaseDataset): DATASET_MD5 = { # TaskMeAnything_v1_videoqa - 'TaskMeAnything_v1_videoqa_random': "627cb1409a98d3cc4f28928c2e0efdde" + 'TaskMeAnything_v1_videoqa_random': "d18394a66dd476f0ff7b92bb9c300aeb" # Other Benchmarks } From 45e35ccb72427d9c17074c20499bfc0d83b015fd Mon Sep 17 00:00:00 2001 From: Weikai Date: Tue, 6 Aug 2024 01:04:16 -0700 Subject: [PATCH 8/8] format the code with flake8 formatter instead of rufff --- run.py | 4 ++- vlmeval/dataset/image_mcq.py | 38 +++++++++++------------- vlmeval/dataset/utils/video_mcq_utils.py | 11 +++---- vlmeval/dataset/video_base.py | 15 +++------- vlmeval/dataset/video_mcq.py | 37 ++++++++++------------- 5 files changed, 46 insertions(+), 59 deletions(-) diff --git a/run.py b/run.py index 943d4ec0d..4e8775d71 100644 --- a/run.py +++ b/run.py @@ -44,7 +44,6 @@ def main(): args = parse_args() assert len(args.data), '--data should be a list of data files' - if args.retry is not None: for k, v in supported_VLM.items(): if hasattr(v, 'keywords') and 'retry' in v.keywords: @@ -89,6 +88,9 @@ def main(): continue result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx' + if dataset_name in ['TaskMeAnything_v1_videoqa_random']: + packstr = 'pack' if args.pack else 'nopack' + result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx' if dataset_name in ['MMBench-Video']: packstr = 'pack' if args.pack else 'nopack' result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx' diff --git a/vlmeval/dataset/image_mcq.py b/vlmeval/dataset/image_mcq.py index 29cde2fa9..0d2d86cbd 100644 --- a/vlmeval/dataset/image_mcq.py +++ b/vlmeval/dataset/image_mcq.py @@ -6,7 +6,6 @@ class ImageMCQDataset(ImageBaseDataset): - TYPE = 'MCQ' DATASET_URL = { @@ -16,14 +15,14 @@ class ImageMCQDataset(ImageBaseDataset): 'MMBench_DEV_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN.tsv', 'MMBench_TEST_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN.tsv', 'MMBench': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench.tsv', # Internal Only - 'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv', # Internal Only + 'MMBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN.tsv', # Internal Only # MMBench v1.1 'MMBench_DEV_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_EN_V11.tsv', 'MMBench_TEST_EN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_EN_V11.tsv', 'MMBench_DEV_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_DEV_CN_V11.tsv', 'MMBench_TEST_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_TEST_CN_V11.tsv', 'MMBench_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_V11.tsv', # Internal Only - 'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv', # Internal Only + 'MMBench_CN_V11': 'https://opencompass.openxlab.space/utils/VLMEval/MMBench_CN_V11.tsv', # Internal Only # SEEDBench Series 'SEEDBench_IMG': 'https://opencompass.openxlab.space/utils/VLMEval/SEEDBench_IMG.tsv', 'SEEDBench2': 'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv', @@ -66,14 +65,14 @@ class ImageMCQDataset(ImageBaseDataset): 'MMBench_DEV_CN': '08b8fc3324a5ed74155350f57be69fbd', 'MMBench_TEST_CN': '7e1239baf0ee4c8b513e19705a0f317e', 'MMBench': '4115aea3383f3dd0083be6a633e0f820', # Internal Only - 'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only + 'MMBench_CN': '2e053ffc90ea598b1feae13c36dc13ee', # Internal Only # MMBench v1.1 'MMBench_DEV_EN_V11': '30c05be8f2f347a50be25aa067248184', 'MMBench_TEST_EN_V11': '26f0f15381a21720255091d3e0316ce6', 'MMBench_DEV_CN_V11': '593f9b5f6bea453d870a798b34ae4f37', 'MMBench_TEST_CN_V11': '74bbe4556dac745613c7cbe5ad787050', 'MMBench_V11': 'b9276414f57af1308dcc4d0cd9b42e7c', # Internal Only - 'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only + 'MMBench_CN_V11': '95f6980dd1b4de38e3cbffe0305a3f25', # Internal Only # SEEDBench 'SEEDBench_IMG': '68017231464752261a2526d6ca3a10c0', 'SEEDBench2': '4ec15cf864c4f16274112284f531813e', @@ -103,11 +102,10 @@ class ImageMCQDataset(ImageBaseDataset): 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'MLLMGuard_DS': '975fc0dd7119386e198c37d71e274b3f', 'BLINK': '3b6649b6a662184ea046908e5506260e', - 'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889' + 'TaskMeAnything_v1_imageqa_random': '023fef69e2ca21827afb77c5ec3bc889', } def build_prompt(self, line): - if isinstance(line, int): line = self.data.iloc[line] @@ -117,11 +115,7 @@ def build_prompt(self, line): tgt_path = self.dump_image(line) question = line['question'] - options = { - cand: line[cand] - for cand in string.ascii_uppercase - if cand in line and not pd.isna(line[cand]) - } + options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])} options_prompt = 'Options:\n' for key, item in options.items(): options_prompt += f'{key}. {item}\n' @@ -145,10 +139,13 @@ def build_prompt(self, line): def evaluate(self, eval_file, **judge_kwargs): from .utils.multiple_choice import report_acc, report_acc_MMT, mcq_circular_eval, mcq_vanilla_eval + # assert dataset is not None dataset_map = { - 'MMBench_TEST_EN': 'MMBench', 'MMBench_TEST_EN_V11': 'MMBench_V11', - 'MMBench_TEST_CN': 'MMBench_CN', 'MMBench_TEST_CN_V11': 'MMBench_CN_V11' + 'MMBench_TEST_EN': 'MMBench', + 'MMBench_TEST_EN_V11': 'MMBench_V11', + 'MMBench_TEST_CN': 'MMBench_CN', + 'MMBench_TEST_CN_V11': 'MMBench_CN_V11', } dataset = self.dataset_name if dataset in dataset_map: @@ -193,9 +190,7 @@ def evaluate(self, eval_file, **judge_kwargs): meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])} data_map = {x: y for x, y in zip(data['index'], data['question'])} for k in data_map: - assert k in meta_q_map, ( - f'eval_file should be the same as or a subset of dataset {self.dataset_name}' - ) + assert k in meta_q_map, f'eval_file should be the same as or a subset of dataset {self.dataset_name}' if circular: data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name) @@ -216,14 +211,15 @@ def evaluate(self, eval_file, **judge_kwargs): dump(acc, score_file) if dataset == 'AesBench_VAL': - warnings.warn('Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \ + warnings.warn( + 'Note that AesBench VAL is just a toy version of AesBench TEST. For full results, \ please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times \ - larger than the VAL dataset and the leaderboard results are based on AesBench TEST.') + larger than the VAL dataset and the leaderboard results are based on AesBench TEST.' + ) return acc class MMMUDataset(ImageMCQDataset): - DATASET_URL = { 'MMMU_DEV_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv', 'MMMU_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv', @@ -264,7 +260,6 @@ def build_prompt(self, line): class CustomMCQDataset(ImageMCQDataset): - def load_data(self, dataset): data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv') @@ -272,6 +267,7 @@ def load_data(self, dataset): local_path = data_path.replace('.tsv', '_local.tsv') if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None): from ..tools import LOCALIZE + LOCALIZE(data_path, local_path) data_path = local_path return load(data_path) diff --git a/vlmeval/dataset/utils/video_mcq_utils.py b/vlmeval/dataset/utils/video_mcq_utils.py index ed1e68d82..c3c65dd50 100644 --- a/vlmeval/dataset/utils/video_mcq_utils.py +++ b/vlmeval/dataset/utils/video_mcq_utils.py @@ -1,14 +1,15 @@ import base64 -# video_mcq use base64 for mp4 video encoding and decoding. + +# video_mcq use base64 for mp4 video encoding and decoding. # using this code to convert mp4 to base64 def mp4_to_base64(mp4_path): try: - with open(mp4_path, "rb") as video_file: + with open(mp4_path, 'rb') as video_file: video_data = video_file.read() base64_encoded_video = base64.b64encode(video_data).decode('utf-8') - return base64_encoded_video + return base64_encoded_video except FileNotFoundError: - return "The file was not found." + return 'The file was not found.' except Exception as e: - return f"An error occurred: {e}" \ No newline at end of file + return f'An error occurred: {e}' diff --git a/vlmeval/dataset/video_base.py b/vlmeval/dataset/video_base.py index 20a4ca61c..dfcdc1c81 100644 --- a/vlmeval/dataset/video_base.py +++ b/vlmeval/dataset/video_base.py @@ -1,17 +1,15 @@ from abc import abstractmethod from ..smp import * + def video_root_map(dataset): return dataset class VideoBaseDataset: - MODALITY = 'VIDEO' - def __init__(self, - dataset='MMBench-Video', - pack=False, skip_novideo=True): + def __init__(self, dataset='MMBench-Video', pack=False, skip_novideo=True): try: import decord except: @@ -67,7 +65,6 @@ def __init__(self, self.data = data self.post_build(dataset) - def __len__(self): return len(self.videos) if self.pack else len(self.data) @@ -79,16 +76,15 @@ def __getitem__(self, idx): else: assert idx < len(self.data) return dict(self.data.iloc[idx]) - + def load_data(self, dataset): url = self.DATASET_URL[dataset] file_md5 = self.DATASET_MD5[dataset] return self.prepare_tsv(url, file_md5) - + def prepare_tsv(self, url, file_md5=None): data_root = LMUDataRoot() os.makedirs(data_root, exist_ok=True) - update_flag = False file_name = url.split('/')[-1] data_path = osp.join(data_root, file_name) if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5): @@ -96,8 +92,6 @@ def prepare_tsv(self, url, file_md5=None): else: warnings.warn('The dataset tsv is not downloaded') download_file(url, data_path) - update_flag = True - return load(data_path), data_root def post_build(self, dataset): @@ -128,7 +122,6 @@ def save_video_frames(self, video, num_frames=8): @classmethod def supported_datasets(cls): return ['MMBench-Video', 'Video-MME'] + list(cls.DATASET_URL) - # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe @abstractmethod diff --git a/vlmeval/dataset/video_mcq.py b/vlmeval/dataset/video_mcq.py index 20727756f..d46e27359 100644 --- a/vlmeval/dataset/video_mcq.py +++ b/vlmeval/dataset/video_mcq.py @@ -15,18 +15,20 @@ def combine_images(self): class VideoMCQDataset(VideoBaseDataset): - TYPE = 'MCQ' DATASET_URL = { # TaskMeAnything_v1_videoqa - 'TaskMeAnything_v1_videoqa_random': 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random/resolve/main/TaskMeAnything-v1-videoqa-random.tsv' + 'TaskMeAnything_v1_videoqa_random': ( + 'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-videoqa-random/' + 'resolve/main/TaskMeAnything-v1-videoqa-random.tsv' + ) # Other Benchmarks } DATASET_MD5 = { # TaskMeAnything_v1_videoqa - 'TaskMeAnything_v1_videoqa_random': "d18394a66dd476f0ff7b92bb9c300aeb" + 'TaskMeAnything_v1_videoqa_random': 'd18394a66dd476f0ff7b92bb9c300aeb' # Other Benchmarks } @@ -37,12 +39,11 @@ def base64_to_mp4(self, base64_string): f.write(base64.b64decode(base64_string)) return video_name, video_path - - def build_prompt(self, line, num_frames: int, video_llm: bool, is_combine_images: bool=False): + def build_prompt(self, line, num_frames: int, video_llm: bool, is_combine_images: bool = False): # if line is an index, get the line from the data if isinstance(line, int): line = self.data.iloc[line] - + # the video stored in data should be a binary stream format video_name, video_path = self.base64_to_mp4(line['video']) message = [] @@ -51,23 +52,20 @@ def build_prompt(self, line, num_frames: int, video_llm: bool, is_combine_images message.append(dict(type='video', value=video_path)) elif is_combine_images: # combine images means that combine all the frames into one image, instead of provide a sequences of image. - # This is useful for some models that only accept one image as input. - # And I was surprised to find that most of the time, ImageQA models perform better with a combined image instead of a sequence of frames. + # This is useful for some models that only accept one image as input. + # And I was surprised to find that most of the time, ImageQA models perform better + # with a combined image instead of a sequence of frames. frame_paths = self.save_video_frames(video_name, num_frames) combined_image = combine_images(frame_paths) message.append(dict(type='image', value=combined_image)) - else: + else: frame_paths = self.save_video_frames(video_name, num_frames) for im in frame_paths: message.append(dict(type='image', value=im)) - + # setup default prompt for MCQ question = line['question'] - options = { - cand: line[cand] - for cand in string.ascii_uppercase - if cand in line and not pd.isna(line[cand]) - } + options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])} options_prompt = 'Options:\n' for key, item in options.items(): options_prompt += f'{key}. {item}\n' @@ -81,7 +79,7 @@ def build_prompt(self, line, num_frames: int, video_llm: bool, is_combine_images prompt += 'Please select the correct answer from the options above. \n' message.append(dict(type='text', value=prompt)) return message - + def evaluate(self, eval_file, **judge_kwargs): from .utils.multiple_choice import ( mcq_circular_eval, @@ -95,7 +93,6 @@ def evaluate(self, eval_file, **judge_kwargs): nproc = judge_kwargs.pop('nproc', 4) circular = False - suffix = eval_file.split('.')[-1] model = judge_kwargs.get('model', 'exact_matching') @@ -128,9 +125,7 @@ def evaluate(self, eval_file, **judge_kwargs): meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])} data_map = {x: y for x, y in zip(data['index'], data['question'])} for k in data_map: - assert k in meta_q_map, ( - f'eval_file should be the same as or a subset of dataset {self.dataset_name}' - ) + assert k in meta_q_map, f'eval_file should be the same as or a subset of dataset {self.dataset_name}' if circular: data = mcq_circular_eval(model, data, meta, nproc, result_file, self.dataset_name) @@ -150,4 +145,4 @@ def evaluate(self, eval_file, **judge_kwargs): score_file = eval_file.replace(f'.{suffix}', '_acc.csv') dump(acc, score_file) - return acc \ No newline at end of file + return acc