forked from roym899/abandoned_bag_detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_datasets.py
134 lines (111 loc) · 4.87 KB
/
filter_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import json
import os
from shutil import copy
from tqdm import tqdm
# Destination root for the combined, filtered person+bag dataset.
OUTPUT_FOLDER = './datasets/person_bag'
# Source datasets to merge and the annotation splits to process.
DATASETS = ['ADE20K', 'COCO']
SPLITS = ['train', 'val']
# Root folders of the source datasets (both provide COCO-format annotation JSONs).
ADE20K_FOLDER = './datasets/ade20k_coco'
COCO_FOLDER = './datasets/coco'
# Category ids used in the merged output dataset (source ids are remapped to these).
PERSON_ID = 1
BAG_ID = 2
def get_json_path(dataset, split):
    """Return the path of the COCO-format annotation JSON for (dataset, split).

    Args:
        dataset: 'ADE20K' or 'COCO'.
        split: 'train' or 'val'.

    Returns:
        Path string to the annotation file.

    Raises:
        ValueError: for an unknown (dataset, split) combination.  The
            original if-chain silently fell through to None, which only
            surfaced later as a confusing TypeError inside open().
    """
    paths = {
        ('ADE20K', 'train'): os.path.join(ADE20K_FOLDER, 'annotations', 'train.json'),
        ('ADE20K', 'val'): os.path.join(ADE20K_FOLDER, 'annotations', 'val.json'),
        ('COCO', 'train'): os.path.join(COCO_FOLDER, 'annotations', 'instances_train2017.json'),
        ('COCO', 'val'): os.path.join(COCO_FOLDER, 'annotations', 'instances_val2017.json'),
    }
    try:
        return paths[(dataset, split)]
    except KeyError:
        raise ValueError(f'Unknown dataset/split combination: {dataset}/{split}') from None
def get_person_ids_for_dataset(dataset):
    """Return the category ids that represent 'person' in *dataset*.

    Known datasets are 'COCO' and 'ADE20K'; any other name yields None,
    matching the original if/elif fall-through behaviour.
    """
    person_ids_by_dataset = {
        'COCO': [1],
        'ADE20K': [24],
    }
    return person_ids_by_dataset.get(dataset)
def get_bag_ids_for_dataset(dataset):
    """Return the category ids that represent bags in *dataset*.

    Known datasets are 'COCO' and 'ADE20K'; any other name yields None,
    matching the original if/elif fall-through behaviour.
    """
    bag_ids_by_dataset = {
        'COCO': [27, 31, 33],
        'ADE20K': [29, 124, 1011, 1515, 1513, 1537, 2217, 2347, 2604, 178, 833],
    }
    return bag_ids_by_dataset.get(dataset)
def get_image_folder(dataset, split):
    """Return the folder containing the images of (dataset, split).

    Known combinations cover 'ADE20K'/'COCO' x 'train'/'val'; any other
    combination yields None, matching the original fall-through behaviour.
    """
    folders = {
        ('ADE20K', 'train'): os.path.join(ADE20K_FOLDER, 'train'),
        ('ADE20K', 'val'): os.path.join(ADE20K_FOLDER, 'val'),
        ('COCO', 'train'): os.path.join(COCO_FOLDER, 'images', 'train2017'),
        ('COCO', 'val'): os.path.join(COCO_FOLDER, 'images', 'val2017'),
    }
    return folders.get((dataset, split))
def main():
    """Build a combined person+bag dataset from ADE20K and COCO.

    For each split: filters each source dataset's annotations down to the
    person/bag categories, remaps their category ids to PERSON_ID/BAG_ID,
    assigns globally unique annotation ids, copies the referenced images
    into OUTPUT_FOLDER, and writes one COCO-format annotation JSON per
    split.
    """
    id_counter = 0  # globally unique annotation id across datasets AND splits
    for split in SPLITS:
        # Skeleton of the output COCO-format annotation file.
        # (Fixed key typo: was 'licenes'.)
        output_data = {
            'info': {
                'contributor': 'WASP AS1',
                'date_created': '2020/02/27',
                'description': 'ADE20K + COCO persons and bags only',
                'url': '',
                'version': '0.1',
                'year': 2020
            },
            # TODO: licenses structs
            'licenses': {},
            'images': [],
            'annotations': [],
            'categories': [
                {'supercategory': 'person', 'id': PERSON_ID, 'name': 'person'},
                {'supercategory': 'bag', 'id': BAG_ID, 'name': 'bag'},
            ],
        }
        for dataset in DATASETS:
            with open(get_json_path(dataset, split)) as f:
                data = json.load(f)
            # Map image id -> index into data['images'] for O(1) lookup.
            id_to_index = {image['id']: index
                           for index, image in enumerate(data['images'])}
            # Hoisted out of the annotation loop (the original re-fetched
            # these lists once per annotation).
            person_ids = get_person_ids_for_dataset(dataset)
            bag_ids = get_bag_ids_for_dataset(dataset)
            filtered_image_ids = set()
            filtered_dataset_images = []
            for annotation in data['annotations']:
                if annotation['category_id'] in person_ids:
                    new_category_id = PERSON_ID
                elif annotation['category_id'] in bag_ids:
                    new_category_id = BAG_ID
                else:
                    continue
                # NOTE: mutates the loaded annotation dict in place; `data`
                # is discarded after this dataset, so this is safe.
                annotation['category_id'] = new_category_id
                annotation['id'] = id_counter
                id_counter += 1
                output_data['annotations'].append(annotation)
                filtered_image_ids.add(annotation['image_id'])
            # Keep only images that have at least one retained annotation.
            for filtered_image_id in filtered_image_ids:
                image = data['images'][id_to_index[filtered_image_id]]
                output_data['images'].append(image)
                filtered_dataset_images.append(image)
            # Copy the retained images into the combined dataset folder.
            output_folder = os.path.join(OUTPUT_FOLDER, 'images', split)
            os.makedirs(output_folder, exist_ok=True)
            for image in tqdm(filtered_dataset_images):
                image_path = os.path.join(get_image_folder(dataset, split), image['file_name'])
                output_path = os.path.join(output_folder, image['file_name'])
                copy(image_path, output_path)
        # Write the merged COCO-format annotation JSON for this split.
        json_output_folder = os.path.join(OUTPUT_FOLDER, 'annotations')
        os.makedirs(json_output_folder, exist_ok=True)
        json_output_path = os.path.join(json_output_folder, f'{split}.json')
        with open(json_output_path, 'w') as f:
            json.dump(output_data, f)
# Script entry point: run the full filter-and-merge pipeline.
if __name__ == '__main__':
    main()