import json
import logging
from dataclasses import dataclass
from pathlib import Path

from PIL import Image
from tqdm import tqdm

logging.basicConfig(filename='uitviic_sanity_check.log',
                    filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


@dataclass
class Meta:
    img_dir = Path('/mnt/disks/nlpvnhub/dinhanhx/train2017')
    # where all images of COCO train2017 are located
    json_dir = Path('/mnt/disks/nlpvnhub/dinhanhx/UIT-ViIC')
    # where the json files downloaded from
    # https://sites.google.com/uit.edu.vn/uit-nlp/datasets-projects?authuser=0#h.p_Uj6Wqs5dCpc4
    # are located

    train_file = json_dir.joinpath('uitviic_captions_train2017.json')
    val_file = json_dir.joinpath('uitviic_captions_val2017.json')
    test_file = json_dir.joinpath('uitviic_captions_test2017.json')


class DataUnpacker:
    def __init__(self, meta: Meta, sanity_check=False) -> None:
        self.meta = meta
        if sanity_check:
            self.run_sanity_check()

    def get_item(self, index: int = 0, target='train_file'):
        """Get the annotation at `index` from one of the json files

        Parameters
        ----------
        index : int, optional
            index of the annotation in the json file, by default 0
        target : str, optional
            'train_file', 'val_file', or 'test_file', by default 'train_file'

        Returns
        -------
        A tuple of the annotation dict and the Path to its image file
        """
        target = getattr(self.meta, target)
        with open(target) as target_file:
            dataset = json.load(target_file)
        data = dataset['annotations'][index]
        img_file = self.meta.img_dir.joinpath(str(data['image_id']).zfill(12) + '.jpg')
        return data, img_file

    def run_sanity_check(self):
        """Check that the json files exist and that every annotation's image file exists"""
        meta_file_list = [self.meta.train_file, self.meta.val_file, self.meta.test_file]
        for target in meta_file_list:
            with open(target) as target_file:
                dataset = json.load(target_file)
            for d in tqdm(dataset['annotations']):
                img_file = self.meta.img_dir.joinpath(str(d['image_id']).zfill(12) + '.jpg')
                if not img_file.is_file():
                    logger.warning(f'{d} @ {target} has no image')

    def get_image_list(self):
        """Collect the unique image paths referenced by all three json files"""
        meta_file_list = [self.meta.train_file, self.meta.val_file, self.meta.test_file]
        image_set = set()
        for target in meta_file_list:
            with open(target) as target_file:
                dataset = json.load(target_file)
            for d in tqdm(dataset['annotations']):
                image_set.add(self.meta.img_dir.joinpath(str(d['image_id']).zfill(12) + '.jpg'))
        return list(image_set)
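
# Usage sketch (illustration only; not part of the original script). Assuming the
# UIT-ViIC json files follow the COCO captions format, where each entry under
# 'annotations' carries at least an 'image_id' and a 'caption', fetching one
# caption-image pair could look like:
#
#   unpacker = DataUnpacker(Meta())
#   data, img_file = unpacker.get_item(index=0, target='val_file')
#   print(data['caption'], img_file)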


if __name__ == '__main__':
    meta = Meta()
    data_unpacker = DataUnpacker(meta)
    image_set = data_unpacker.get_image_list()
    n_images = len(image_set)
    h, w = 0, 0
    min_h, min_w = Image.open(image_set[0]).height, Image.open(image_set[0]).width
    max_h, max_w = 0, 0
    for img_path in tqdm(image_set):
        img = Image.open(img_path)
        h += img.height
        w += img.width
        # track the dimensions of the smallest-area and largest-area images
        if img.height * img.width <= min_h * min_w:
            min_h, min_w = img.height, img.width
        if img.height * img.width >= max_h * max_w:
            max_h, max_w = img.height, img.width
    print(f'Number of unique images: {n_images}')
    print(f'Average H W: {h / n_images} {w / n_images}')
    print(f'Min H W: {min_h} {min_w}')
    print(f'Max H W: {max_h} {max_w}')
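
# A minimal sketch of enabling the sanity check (assuming the paths in Meta point
# at a local COCO train2017 folder and the downloaded UIT-ViIC json files): every
# annotation whose image file is missing is logged as a warning to
# uitviic_sanity_check.log, as configured by logging.basicConfig above.
#
#   DataUnpacker(Meta(), sanity_check=True)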