-
Notifications
You must be signed in to change notification settings - Fork 174
/
load_data.py
72 lines (61 loc) · 2.26 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import sys
import numpy as np
from scipy.misc import imread
import pickle
import os
import matplotlib.pyplot as plt
import argparse
"""Script to preprocess the omniglot dataset and pickle it into an array that's easy
to index my character type"""
parser = argparse.ArgumentParser()
parser.add_argument("--path",help="Path where omniglot folder resides")
parser.add_argument("--save", help = "Path to pickle data to.", default=os.getcwd())
args = parser.parse_args()
data_path = os.path.join(args.path, "python")
train_folder = os.path.join(data_path,'images_background')
valpath = os.path.join(data_path,'images_evaluation')
save_path = args.save
lang_dict = {}
def loadimgs(path,n=0):
#if data not already unzipped, unzip it.
if not os.path.exists(path):
print("unzipping")
os.chdir(data_path)
os.system("unzip {}".format(path+".zip" ))
X=[]
y = []
cat_dict = {}
lang_dict = {}
curr_y = n
#we load every alphabet seperately so we can isolate them later
for alphabet in os.listdir(path):
print("loading alphabet: " + alphabet)
lang_dict[alphabet] = [curr_y,None]
alphabet_path = os.path.join(path,alphabet)
#every letter/category has it's own column in the array, so load seperately
for letter in os.listdir(alphabet_path):
cat_dict[curr_y] = (alphabet, letter)
category_images=[]
letter_path = os.path.join(alphabet_path, letter)
for filename in os.listdir(letter_path):
image_path = os.path.join(letter_path, filename)
image = imread(image_path)
category_images.append(image)
y.append(curr_y)
try:
X.append(np.stack(category_images))
#edge case - last one
except ValueError as e:
print(e)
print("error - category_images:", category_images)
curr_y += 1
lang_dict[alphabet][1] = curr_y - 1
y = np.vstack(y)
X = np.stack(X)
return X,y,lang_dict
X,y,c=loadimgs(train_folder)
with open(os.path.join(save_path,"train.pickle"), "wb") as f:
pickle.dump((X,c),f)
X,y,c=loadimgs(valpath)
with open(os.path.join(save_path,"val.pickle"), "wb") as f:
pickle.dump((X,c),f)