#!/usr/bin/python3
# Tools for making subsets / samples of Red Panda Lineage data
import os
import random
import sys
from shared import *
def collect_photo_uris(min_photos=0, species=["1", "2"], taglist=None):
"""
Collect all photo uris in the dataset that match:
- A minimum number of photos for the animal
- An array of possible species (defaults to ["1", "2"] which gets all)
- A list of photo tags (defaults to getting photos regardless of tag)
"""
matched_photos = []
for file_path in [PANDA_PATH]:
section = None
for section_name in ["pandas"]:
if section_name in file_path.split("/"):
section = section_name.split("s")[0] # HACK
# Enter the pandas subdirectories
for root, dirs, files in os.walk(file_path):
for filename in files:
path = root + os.sep + filename
# print(path)
photo_list = PhotoFile(section, path)
photo_count = photo_list.photo_count()
if photo_count == 0:
# Ignore if panda has no photos
continue
if photo_count < min_photos:
# Ignore if this panda doesn't have enough photos
continue
# Ignore if it's not the species we want
if (photo_list.get_field("species") not in species):
continue
photo_index = 1
while (photo_index <= photo_count):
current_photo = "photo." + str(photo_index)
current_tag = "photo." + str(photo_index) + ".tags"
# If we have a taglist, only collect photos in the list
                    if taglist is not None:
                        if not photo_list.array_has_value(current_tag, taglist):
photo_index = photo_index + 1
continue
# Collect photos
value = photo_list.get_field(current_photo)
raw = current_photo + ": " + value
photo = PhotoEntry(path, raw)
matched_photos.append(photo)
photo_index = photo_index + 1
return matched_photos
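
# Usage sketch (hypothetical values; assumes PANDA_PATH and the PhotoFile/PhotoEntry
# helpers from shared.py behave as the docstring above describes):
#   portraits = collect_photo_uris(min_photos=5, species=["2"], taglist=["portrait"])
#   => a list of PhotoEntry objects for styani pandas with at least five photos,
#      keeping only the photos tagged "portrait"
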
def define_min_photo_sample(min_count=40, photo_count=40, species=["1", "2"]):
"""
Fetch a sample of all animals that have at least N photos in the dataset.
Defaults to 40 photos.
"""
output_photos = []
matched_photos = collect_photo_uris(min_count, species)
# Shuffle the list of photos that match our interest
random.shuffle(matched_photos)
    # Take the entire photo set we've gathered and whittle it down to at most
    # photo_count photos per animal.
animal_id_dict = {}
#!!!! TODO why doesn't photo-count matter?
for photo in matched_photos:
# Count how many photos of each animal we've gone through
if photo.entity_id not in animal_id_dict:
animal_id_dict[photo.entity_id] = 1
else:
animal_id_dict[photo.entity_id] = animal_id_dict[photo.entity_id] + 1
if animal_id_dict[photo.entity_id] > photo_count:
# We have enough photos of this animal
continue
else:
output_photos.append(photo)
return output_photos
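
# Usage sketch (hypothetical values): up to 40 photos each for every animal that
# has at least 40 photos on file, across both subspecies:
#   photos = define_min_photo_sample(min_count=40, photo_count=40, species=["1", "2"])
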
def define_random_tag_sample(num_animals, num_photos, species, taglist):
"""
Fetch a random sample of the Red Panda Lineage project's linked photos.
"""
output_photos = []
matched_photos = collect_photo_uris(0, species, taglist)
# Shuffle the list of photos that match our interest
random.shuffle(matched_photos)
    # Take the entire photo set we've gathered and whittle it down to at most
    # num_animals animals and num_photos photos overall.
animal_id_list = []
for photo in matched_photos:
# Don't go over the photo count overall
if len(output_photos) == num_photos:
break
if photo.entity_id not in animal_id_list:
# Don't go over your animal count
if len(animal_id_list) < num_animals:
output_photos.append(photo)
animal_id_list.append(photo.entity_id)
else:
continue
else:
# Animal id seen previously, and we still need photos
output_photos.append(photo)
return output_photos
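
# Usage sketch (hypothetical values): at most 10 photos overall, drawn from at
# most 5 distinct fulgens pandas, limited to photos tagged "close-up":
#   photos = define_random_tag_sample(5, 10, ["1"], ["close-up"])
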
def fetch_sample_photos(folder, desired_photos, species, size):
"""
Given a defined set of photos we selected from the dataset, grab them
from the Internet, and write them in an organized way.
Structure of the output:
    ./export/sample_<unixtime>: output folder
    ./export/sample_<unixtime>/a.f.fulgens OR ./export/sample_<unixtime>/a.f.styani:
      - images arranged by subspecies
    ./export/sample_<unixtime>/<subspecies>/<rpf-id>_photo.<photo.index>.jpg
"""
# Build the species output folders based on desired species values
for specie in species:
if (specie == "1"):
os.makedirs(folder + "/a.f.fulgens")
if (specie == "2"):
os.makedirs(folder + "/a.f.styani")
for photo in desired_photos:
output_species = "a.f.fulgens"
if photo.species == "2":
output_species = "a.f.styani"
output_entity = photo.entity_id
output_photo_index = photo.photo_index
output_image = folder + "/" + output_species + "/" + output_entity + "_photo." + output_photo_index + ".jpg"
# Fetch an image
success = fetch_photo(photo.photo_uri, output_image, size)
if success:
random_sleep()
else:
            while not success:
print("Validate the URI/your Internet. Press ENTER to try again.")
choice = input("Or type \"continue\" to skip: ")
if choice == "continue":
break
success = fetch_photo(photo.photo_uri, output_image, size)
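
# Usage sketch (hypothetical folder name; assumes fetch_photo() and random_sleep()
# from shared.py): fetch medium-size images for a previously defined sample:
#   fetch_sample_photos("export/sample_1600000000", photos, ["1", "2"], "m")
#   => files like export/sample_1600000000/a.f.styani/<rpf-id>_photo.3.jpg
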
def write_sample_summary(folder, desired_photos):
"""
Write an informational summary of the sample, as well as all URLs gathered
and the ownership data/commit info for each one.
Structure of the output:
    ./export/sample_<unixtime>: output folder
    ./export/sample_<unixtime>/info.txt: record and summary of the queried photo data
      - RPF Git commit, the sample.py command that was run (including animal and photo counts)
"""
animal_count = str(len(set(map(lambda x: x.entity_id, desired_photos))))
fulgens = list(filter(lambda x: x.species == "1", desired_photos))
fulgens_count = str(len(set(map(lambda x: x.entity_id, fulgens))))
photo_count = str(len(desired_photos))
styani = list(filter(lambda x: x.species == "2", desired_photos))
styani_count = str(len(set(map(lambda x: x.entity_id, styani))))
# Write output metadata
# TODO: other metadata can be changing, so do we care?
# If we do, PhotoEntry needs to track more values from the source files
output_metadata = folder + "/info.txt"
with open(output_metadata, 'w') as wfh:
# TODO: high-level data
wfh.write("panda.count: " + animal_count)
wfh.write("\npanda.fulgens.count: " + fulgens_count + "\n")
for photo in fulgens:
real_uri = unfurl_ig_link(photo.photo_uri)
wfh.write(photo.entity_id + ".photo." + photo.photo_index + ": " + real_uri + "\n")
wfh.write(photo.entity_id + ".photo." + photo.photo_index + ".author: " + photo.author_name + "\n")
wfh.write(photo.entity_id + ".photo." + photo.photo_index + ".commitdate: " + photo.commitdate + "\n")
wfh.write("panda.styani.count: " + styani_count + "\n")
for photo in styani:
real_uri = unfurl_ig_link(photo.photo_uri)
wfh.write(photo.entity_id + ".photo." + photo.photo_index + ": " + real_uri + "\n")
wfh.write(photo.entity_id + ".photo." + photo.photo_index + ".author: " + photo.author_name + "\n")
wfh.write(photo.entity_id + ".photo." + photo.photo_index + ".commitdate: " + photo.commitdate + "\n")
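
# The resulting info.txt is a flat key/value listing along these lines
# (hypothetical values):
#   panda.count: 12
#   panda.fulgens.count: 7
#   <rpf-id>.photo.4: <unfurled photo URI>
#   <rpf-id>.photo.4.author: <author name>
#   <rpf-id>.photo.4.commitdate: <commit date>
#   panda.styani.count: 5
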
if __name__ == '__main__':
# Default settings
animals = 100
min_photo_count = 0
photo_count = 5
size = "m"
species = ["1", "2"] # All Species
taglist = "close-up, profile, portrait"
# Parse arguments
if "--animals" in sys.argv:
animals = int(sys.argv[sys.argv.index("--animals") + 1])
if animals < 1:
print("Animals count must be positive.")
sys.exit()
if "--animal-has-photos" in sys.argv:
min_photo_count = int(sys.argv[sys.argv.index("--animal-has-photos") + 1])
if min_photo_count < 1:
print("Candindate animal photo count must be positive.")
sys.exit()
if "--photo-count" in sys.argv:
photo_count = int(sys.argv[sys.argv.index("--photo-count") + 1])
if photo_count < 1:
print("Photo count must be positive.")
sys.exit()
if "--size" in sys.argv:
size = sys.argv[sys.argv.index("--size") + 1]
if ((size != "t") and (size != "m") and (size != "l")):
raise SizeError("%s photo size is not one of: t m l" % size)
if "--species" in sys.argv:
species = int(sys.argv[sys.argv.index("--species") + 1])
if ((species < 1) or (species > 2)):
raise SpeciesError("%s species value not 1 or 2 (1: fulgens, 2: styani)" % species)
species = [str(species)] # Treat like array of species values
if "--taglist" in sys.argv:
taglist = sys.argv[sys.argv.index("--taglist") + 1]
taglist = taglist.split(", ")
    # The token isn't used here (the fetch function uses it), but checking it up
    # front saves us from building a sample we can't actually download.
token = os.getenv('OE_TOKEN', None)
    if token is None:
raise KeyError("Please set an OE_TOKEN environment variable for using the IG API")
# Build a sample. If we do a min-photo-count sample set, then we ignore the
# tag list to guarantee we have enough photos to work with.
if (min_photo_count > 0):
photos = define_min_photo_sample(min_photo_count, photo_count, species)
else:
photos = define_random_tag_sample(animals, photo_count, species, taglist)
    photo_count = len(photos)
if photo_count == 0:
print("Sample for your arguments contains no photos.")
sys.exit()
else:
print("Sample for your arguments contains %s photos. Fetching..." % photo_count)
# Unique directory name (with current unixtime)
folder = "export/sample_" + str(current_time_to_unixtime())
os.makedirs(folder)
# Write output information
write_sample_summary(folder, photos)
# Start fetching photos
fetch_sample_photos(folder, photos, species, size)
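
# Example invocations (hypothetical token and values, for illustration only):
#   OE_TOKEN=<token> ./sample.py --animals 25 --photo-count 3 --species 2 \
#       --taglist "close-up, portrait" --size m
#   OE_TOKEN=<token> ./sample.py --animal-has-photos 40 --photo-count 40
# Output lands in export/sample_<unixtime>/ alongside an info.txt summary.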