-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
executable file
·110 lines (75 loc) · 3.26 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python2
import numpy as np
import argparse
from os.path import isfile, join, splitext, basename
import h5py
from sklearn.cross_validation import StratifiedShuffleSplit
from progressbar import Percentage, Bar, ETA, AdaptiveETA, ProgressBar
from convnetskeras.convnets import preprocess_image_batch
def folder2labels(path):
    """Placeholder for deriving labels from a folder.

    Not implemented yet: ``path`` is ignored and ``None`` is returned.
    """
    # TODO: implement.
    return None
def split_traintest(data, splitsize=.1):
    """Split ``data`` into a stratified train part and test part.

    Args:
        data: Sequence of ``(file, label)`` pairs; the second element of
            each pair is used as the stratification label.
        splitsize: Passed to ``StratifiedShuffleSplit`` as ``test_size``.

    Returns:
        A ``(data_train, data_test)`` tuple of lists holding the items of
        ``data`` selected for each part.
    """
    labels = [label for _, label in data]
    # Only a single split is ever returned, so ask for exactly one instead
    # of generating the default number of re-shuffles and discarding them.
    splitter = StratifiedShuffleSplit(labels, n_iter=1, test_size=splitsize)
    train_index, test_index = next(iter(splitter))
    data_train = [data[i] for i in train_index]
    data_test = [data[i] for i in test_index]
    return data_train, data_test
def preprocessing(data, train_set_h5, test_set_h5=None, splitsize=.1):
    """Preprocess ``data`` into one HDF5 file, or into a train/test pair.

    Args:
        data: Sequence of ``(file, label)`` pairs.
        train_set_h5: Path of the HDF5 file receiving the train set.
        test_set_h5: Optional path of the HDF5 file receiving the test set.
            When ``None`` no split is made and all of ``data`` is written
            to ``train_set_h5``.
        splitsize: Forwarded to ``split_traintest``; only used when
            ``test_set_h5`` is given.
    """
    # Identity comparison is the correct test for the ``None`` sentinel.
    make_test_set = test_set_h5 is not None

    if make_test_set:
        data_train, data_test = split_traintest(data, splitsize=splitsize)
    else:
        data_train = data

    print("Preprocessing train set ...")
    save_img_h5(data_train, train_set_h5, img_size=(256, 256))

    if make_test_set:
        print("Preprocessing test set ...")
        save_img_h5(data_test, test_set_h5, img_size=(256, 256))
def save_img_h5(data, h5_file, cache_size=30, img_size=None):
    """Preprocess images batch by batch and store them in an HDF5 file.

    The file receives three datasets: ``files`` (the image paths),
    ``labels`` (gzip-compressed) and ``imgs`` (float32, gzip-compressed,
    one 3-channel entry per sample).

    Args:
        data: Sequence of ``(file, label)`` pairs.
        h5_file: Path of the HDF5 file to create (opened in ``"w"`` mode).
        cache_size: Number of images preprocessed and written per batch.
        img_size: Forwarded to ``preprocess_image_batch``. When given it is
            also used as the trailing two dimensions of the ``imgs``
            dataset (assumes the preprocessed batch has shape
            ``(n, 3) + img_size`` -- confirm against
            ``preprocess_image_batch``); when ``None`` the dataset keeps
            the historical 256x256 shape.

    Raises:
        ValueError: If ``cache_size`` is smaller than 1.
    """
    if cache_size < 1:
        raise ValueError("cache_size must be a positive integer")

    files = [name for name, _ in data]
    labels = np.array([label for _, label in data])
    n_samples = len(data)
    spatial_shape = (256, 256) if img_size is None else tuple(img_size)

    widgets = [Percentage(),
               ' ', Bar(),
               ' ', ETA(),
               ' ', AdaptiveETA()]

    # The context manager closes the file even if preprocessing fails.
    with h5py.File(h5_file, "w") as f:
        f.create_dataset("files", data=files)
        f.create_dataset("labels", data=labels, compression="gzip")
        img_dset = f.create_dataset("imgs", (n_samples, 3) + spatial_shape,
                                    dtype='float32',
                                    compression="gzip")

        pbar = ProgressBar(widgets=widgets, maxval=n_samples)
        pbar.start()
        # Stepping by cache_size visits every sample exactly once, including
        # a final partial batch, and never produces an empty batch.
        for start in range(0, n_samples, cache_size):
            stop = min(start + cache_size, n_samples)
            img_dset[start:stop] = preprocess_image_batch(files[start:stop],
                                                          img_size=img_size)
            # Report completed work; stop never exceeds maxval.
            pbar.update(stop)
        pbar.finish()
def main(data_file, train_set, test_set=None, splitsize=.1):
    """Read ``path,label`` lines from ``data_file`` and preprocess them.

    Args:
        data_file: Path of a text file with one ``path,label`` entry per
            line, where ``label`` is an integer.
        train_set: Path of the HDF5 file receiving the train set.
        test_set: Optional path of the HDF5 file receiving the test set.
        splitsize: Forwarded to ``preprocessing``.

    Raises:
        ValueError: If a non-blank line contains no comma or its label is
            not an integer.
    """
    data = []
    with open(data_file, "r") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # Split on the last comma so paths containing commas survive.
            path, label = line.rsplit(",", 1)
            # int() ignores surrounding whitespace, so this handles "\n",
            # "\r\n" and a final line without any newline alike.
            data.append((path, int(label)))
    preprocessing(data, train_set, test_set_h5=test_set, splitsize=splitsize)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("DATA", help="CSV file containing images and labels")
    cli.add_argument("H5PATH", help="Path to the preprocessed data")
    cli.add_argument(
        "-ts", "--testset",
        default=None,
        help=("If path is given, it splits the data into "
              "train and test set")
    )
    cli.add_argument(
        "-ss", "--splitsize",
        type=float,
        default=.1,
        help="Size of the test set. Default : 0.1"
    )
    options = cli.parse_args()
    main(options.DATA,
         options.H5PATH,
         test_set=options.testset,
         splitsize=options.splitsize)