import argparse
import os
import random
import shutil
import tempfile
import xml.etree.ElementTree as ET


def unpack_label_me(input_dir, output_dir):
    """
    Unpacks downloaded LabelMe collections and moves all files into a single folder, following this routine:
    1) Scan the input folder and its subfolders for .tar.gz files
    2) Unpack the .tar.gz files into a temporary directory
    3) For each original collection (= each unpacked archive):
        3a) Crawl all .xml annotation files
        3b) Replace the file name by a global counter (to avoid clashes between identical file names from different collections)
        3c) Save the renamed .xml and image files in the output directory
    4) Delete the temporary directory

    Args:
        input_dir (str): Directory that will be crawled for .tar.gz files downloaded from LabelMe
        output_dir (str): Directory where all unpacked .xml and .jpg files will be moved to

    Returns:
        None
    """
    os.makedirs(output_dir, exist_ok=True)

    # Create a temporary working directory for the unpacked archives
    tmpdir = tempfile.mkdtemp()
    print(f'Created tmpdir {tmpdir}')

    # Walk through input_dir and unpack every .tar.gz archive into tmpdir
    suffix = ".tar.gz"  # file extension to extract
    for folder, dirs, files in os.walk(input_dir, topdown=False, followlinks=False):
        for name in files:
            if name.endswith(suffix):
                shutil.unpack_archive(os.path.join(folder, name),
                                      os.path.join(tmpdir, os.path.splitext(name)[0]))
                print(f'Successfully unpacked {os.path.join(folder, name)}')

    file_counter = 0  # global counter used to build the new file names
    # Walk the top level of tmpdir (= the unpacked collections)
    for collection in os.listdir(tmpdir):
        if not os.path.isdir(os.path.join(tmpdir, collection)):
            continue
        # Crawl tmpdir/collection, rename and move files to the output folder
        suffix = ".xml"
        collection_dir = os.path.join(tmpdir, collection)
        rename_dict = {}  # maps original image file names to their new names

        # Grab the .xml files and rewrite their filename attribute
        for folder, dirs, files in os.walk(collection_dir, topdown=False, followlinks=False):
            for name in files:
                if name.endswith(suffix):
                    xmlfile = os.path.join(folder, name)
                    # Parse the annotation file
                    try:
                        tree = ET.parse(xmlfile)
                        root = tree.getroot()
                        original_file_name = root.find("filename").text
                        file_counter += 1
                    except (ET.ParseError, AttributeError):
                        print(f'Invalid annotation file {xmlfile}')
                        continue
                    # Replace the filename attribute, keeping the original extension
                    new_file_name = str(file_counter) + os.path.splitext(original_file_name)[1]
                    rename_dict[original_file_name] = new_file_name
                    root.find("filename").text = new_file_name
                    # Save the modified .xml file in the output dir
                    output_path = os.path.join(output_dir, str(file_counter) + suffix)
                    tree.write(output_path, encoding='utf-8')

        # Grab the image files and copy them to the output dir under their new names
        for folder, dirs, files in os.walk(collection_dir, topdown=False, followlinks=False):
            for name in files:
                if name in rename_dict:
                    old_path = os.path.join(folder, name)
                    new_path = os.path.join(output_dir, rename_dict[name])
                    shutil.copy2(old_path, new_path)
        print(f'Finished renaming and copying for {collection_dir}')

    # Delete the temporary directory
    shutil.rmtree(tmpdir)
    print(f'Cleaned tmpdir {tmpdir}')
    print(f'Extraction completed. You will find all image and annotation files in {output_dir}')
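

# Illustration only, not part of the pipeline: a minimal sketch of the per-file
# rewrite that unpack_label_me performs. The XML snippet and the counter value
# below are made-up example values, not data from an actual LabelMe download.
def _demo_rewrite_filename(counter=7):
    example_xml = "<annotation><filename>beach_001.jpg</filename></annotation>"
    root = ET.fromstring(example_xml)
    # Keep the original extension and swap the stem for the global counter
    extension = os.path.splitext(root.find("filename").text)[1]
    root.find("filename").text = str(counter) + extension
    # Prints: <annotation><filename>7.jpg</filename></annotation>
    print(ET.tostring(root, encoding="unicode"))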


def train_test_valid_split(output_dir, weights):
    """
    Continues where unpack_label_me left off and randomly assigns every extracted file to the train, validation
    or test set according to the given weights.
    Within each set, images are moved to an 'img' sub-folder and annotations to an 'anot' sub-folder.

    Args:
        output_dir (str): Directory where all unpacked files from LabelMe are saved
        weights (list): Weights for train, validation and test set (probability of assignment in %)

    Returns:
        None
    """
    print(f"Splitting data into train, validation and test set with the following shares "
          f"[{weights[0]}, {weights[1]}, {weights[2]}]")

    # Create the directories for all three sets
    list_split_folders = ["train", "valid", "test"]
    for fold in list_split_folders:
        os.makedirs(os.path.join(output_dir, fold, 'img'), exist_ok=True)
        os.makedirs(os.path.join(output_dir, fold, 'anot'), exist_ok=True)

    # Collect all image files in the output directory
    list_img_files = [f for f in os.listdir(output_dir) if f.endswith('.jpg')]

    # Draw a set for each image file at random, weighted by the given shares
    for img_file in list_img_files:
        idx = random.choices(range(0, 3), weights=weights, k=1)[0]
        # Move the image and its annotation to the assigned directory.
        # Since every image has a corresponding .xml with the same stem,
        # the image file name also determines the annotation file name.
        xml_file = img_file.replace('.jpg', '.xml')
        shutil.move(os.path.join(output_dir, img_file),
                    os.path.join(output_dir, list_split_folders[idx], 'img', img_file))
        shutil.move(os.path.join(output_dir, xml_file),
                    os.path.join(output_dir, list_split_folders[idx], 'anot', xml_file))

    print(f'Split successfully performed.\n'
          f'You will find the sets in the sub-folders of {output_dir}')
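

# Illustration only, not part of the pipeline: a minimal sketch of the weighted
# random draw used by train_test_valid_split. The weights and the sample size
# are arbitrary example values.
def _demo_split_draw(n_samples=10000):
    weights = [50, 20, 30]
    draws = [random.choices(range(0, 3), weights=weights, k=1)[0] for _ in range(n_samples)]
    # With enough draws, the observed shares approximate the given weights.
    for idx, split in enumerate(["train", "valid", "test"]):
        print(f"{split}: {draws.count(idx) / n_samples:.1%}")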


def load_data(inputDir, outputDir, train=50, valid=20, test=30):
    """
    Serves as a wrapper for the following two tasks:
    1) Unpack the collections downloaded from LabelMe
    2) Split the unpacked files into train, validation and test sets

    Args:
        inputDir: Directory that will be crawled for .tar.gz files downloaded from LabelMe
        outputDir: Directory where all unpacked .xml and .jpg files will be moved to
        train: Percentage of train images in the total data set (default: 50)
        valid: Percentage of validation images in the total data set (default: 20)
        test: Percentage of test images in the total data set (default: 30)

    Returns:
        None
    """
    # Validate input args
    assert os.path.isdir(inputDir), f'{inputDir} is not a directory'
    # Unpack all files from LabelMe
    unpack_label_me(inputDir, outputDir)
    # Assign the files to train, validation and test sets
    weights = [train, valid, test]
    train_test_valid_split(outputDir, weights=weights)
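

# A hedged example of calling load_data programmatically (the paths below are
# placeholders, not paths from this project):
#
#   load_data(inputDir='/path/to/LabelMe/downloads', outputDir='/path/to/dataset',
#             train=70, valid=15, test=15)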


def main():
    """
    Command-line entry point. Serves as a wrapper for the following two tasks:
    1) Unpack the collections downloaded from LabelMe
    2) Split the unpacked files into train, validation and test sets

    Args:
        --inputDir  -i      Directory that will be crawled for .tar.gz files downloaded from LabelMe
        --outputDir -o      Directory where all unpacked .xml and .jpg files will be moved to
        --train     -train  Percentage of train images in the total data set (default: 50)
        --valid     -valid  Percentage of validation images in the total data set (default: 20)
        --test      -test   Percentage of test images in the total data set (default: 30)

    Returns:
        None

    Usage:
        python DataLoading.py -i [PATH_TO_DOWNLOAD_FOLDER] -o [TARGET_FOLDER] -train [TRAIN_SHARE]
            -valid [VALIDATION_SHARE] -test [TEST_SHARE]
    """
    # Initiate the argument parser
    parser = argparse.ArgumentParser(
        description="Extract labelme downloads and merge them into a single folder"
    )
    parser.add_argument(
        "-i",
        "--inputDir",
        help="Path that shall be crawled for .tar.gz downloads",
        type=str,
        required=True,
    )
    parser.add_argument(
        "-o",
        "--outputDir",
        help="Directory where all images and annotations shall be moved to",
        type=str,
        required=True,
    )
    parser.add_argument(
        "-train",
        "--train",
        help="share of train images in percent (e.g. 50)",
        type=int,
        default=50,
    )
    parser.add_argument(
        "-valid",
        "--valid",
        help="share of validation images in percent (e.g. 20)",
        type=int,
        default=20,
    )
    parser.add_argument(
        "-test",
        "--test",
        help="share of test images in percent (e.g. 30)",
        type=int,
        default=30,
    )
    args = parser.parse_args()

    # Validate input args
    assert os.path.isdir(args.inputDir), f'{args.inputDir} is not a directory'
    # Unpack all files from LabelMe
    unpack_label_me(args.inputDir, args.outputDir)
    # Assign the files to train, validation and test sets
    weights = [args.train, args.valid, args.test]
    train_test_valid_split(args.outputDir, weights=weights)
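

# Example invocation (the paths are placeholders, adjust them to your setup):
#   python DataLoading.py -i ~/Downloads/LabelMe -o ./Pictures -train 50 -valid 20 -test 30
# Note that the three shares are passed to random.choices as relative weights,
# so they do not strictly have to sum to 100.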


if __name__ == "__main__":
    main()