-
Notifications
You must be signed in to change notification settings - Fork 3
/
hash_images.py
66 lines (56 loc) · 3.18 KB
/
hash_images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import imagehash
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
def hash_images_1(directory: str, hashes=None) -> None:
    """
    Hash every .jpg in a directory and write "<file name>|<hash>" lines to ./hash_images.txt

    NOTE
        The images are named like: C:\\Users\\Alex\\Desktop\\hello\\my_tinder_test\\tinder_pics\\311_Hannah_5.jpg
        The .jpg files are processed in order of the number at the beginning of the filename (e.g., 311 in this example)

    Args:
        directory: Folder whose .jpg files are hashed.
        hashes: Optional list that receives the perceptual-hash strings, appended in
            processing order. A fresh list is created when it is omitted, so repeated
            calls no longer share (and keep growing) one default list.

    Raises:
        ValueError: If the text before the first underscore of a .jpg filename is not an integer.
    """
    if hashes is None:
        hashes = []

    # Keep only the jpgs *before* sorting, so unrelated files in the folder
    # never reach the numeric sort key (can expand this to other filetypes, if necessary)
    image_names = sorted(
        (name for name in os.listdir(path=directory) if name.endswith('.jpg')),
        key=lambda name: int(name.split('_')[0]),
    )

    # Pair each image name with its own hash as it is computed, so the output
    # stays aligned even when the caller passed in a list that already has entries
    records = []
    for name in tqdm(image_names):
        # The context manager closes the image file as soon as it has been hashed
        with Image.open(fp=os.path.join(directory, name)) as image:
            digest = str(imagehash.phash(image=image))
        hashes.append(digest)
        records.append((name, digest))

    # Write all of the image names and their respective hash values to a text file
    # (can write to Excel or csv with pandas, if necessary)
    lines = ['|'.join(record) for record in records]
    with open(file='./hash_images.txt', mode='w', encoding='utf-8-sig') as f:
        if lines:  # Leave the file empty when no images were found
            # Lines are separated by a new line, with no new line after the last image
            f.write('\n'.join(lines))
def hash_images_2(directory: str, records=None, *,
                  records_path: str = r'C:\Users\Alex\Desktop\My_Hashes.xlsx',
                  sheet_name: str = 'Hashes') -> None:
    """
    Hash every .jpg in a directory and write the (file name, hash) pairs to an Excel workbook

    NOTE
        The images are named like: C:\\Users\\Alex\\Downloads\\Tinder is maddening\\maddy - 2.jpg
        The .jpg files are processed in order of the number between the last ' - ' and the first '.' after it

    TODO
        Potentially, we may want to calculate the hamming distance between images so that similar images, whose
        hashes don't match exactly, are seen as a match if they're defined as "similar enough" or something like that.

    Args:
        directory: Folder whose .jpg files are hashed.
        records: Optional list that receives (file name, hash) tuples. Everything in the
            list after this run is written to the workbook. A fresh list is created when
            it is omitted, so repeated calls no longer share one default list.
        records_path: Where the workbook is written (defaults to the original hard-coded path).
        sheet_name: Name of the worksheet that receives the data.

    Raises:
        ValueError: If the trailing number of a .jpg filename cannot be parsed as an integer.
    """
    if records is None:
        records = []

    # Keep only the jpgs *before* sorting, so unrelated files in the folder
    # never reach the numeric sort key (can expand this to other filetypes, if necessary)
    image_names = sorted(
        (name for name in os.listdir(path=directory) if name.endswith('.jpg')),
        key=lambda name: int(name.split(' - ')[-1].split('.')[0]),
    )

    for name in tqdm(image_names):
        # The context manager closes the image file as soon as it has been hashed
        with Image.open(fp=os.path.join(directory, name)) as image:
            # Append the image's name and the hash
            records.append((name, str(imagehash.phash(image=image))))

    # Write the data to Excel
    pandas_to_excel(data=records, records_path=records_path, sheet_name=sheet_name)
def pandas_to_excel(data: list, records_path: str, sheet_name: str) -> None:
    """
    Write (file name, hash) rows to a worksheet of an Excel workbook

    Args:
        data: Rows to export; each row supplies a value for 'File Name' and for 'Hash'.
        records_path: Destination handed to pandas as the Excel writer target.
        sheet_name: Name of the worksheet that receives the rows.
    """
    column_names = ['File Name', 'Hash']
    frame = pd.DataFrame(data=data, columns=column_names)

    export_options = {
        'excel_writer': records_path,
        'sheet_name': sheet_name,
        'index': False,            # Don't write the DataFrame's index as a column
        'freeze_panes': (1, 0),    # Freeze the first row (the header), no columns
    }
    frame.to_excel(**export_options)
# Script entry point: ask for the image folder, then hash its contents into Excel
if __name__ == '__main__':
    prompt = 'Please input your directory:\n\t'
    directory = input(prompt)
    hash_images_2(directory=directory)