|
 duplicate_finder.py remove <path> ... [--db=<db_path>]
 duplicate_finder.py clear [--db=<db_path>]
 duplicate_finder.py show [--db=<db_path>]
-duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>] [--db=<db_path>]
+duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>] [--db=<db_path>] [--threshold=<num>]
 duplicate_finder.py -h | --help

 Options:
|
 files (default: number of CPUs).

 find:
+--threshold=<num> Maximum Hamming distance (number of differing bits) between two image hashes for them to be treated as duplicates. Higher values find more near-duplicates but may produce false positives.
 --print Only print duplicate files rather than displaying HTML file
 --delete Move all found duplicate pictures to the trash. This option takes priority over --print.
 --match-time Adds the extra constraint that duplicate images must have the
|
 from PIL import Image, ExifTags
 import pymongo
 from termcolor import cprint
-
+import pybktree

 @contextmanager
 def connect_to_db(db_conn_string='./db'):
@@ -244,6 +245,51 @@ def find(db, match_time=False): |

     return list(dups)

+def find_threshold(db, threshold=1):
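+    """Group images whose perceptual hashes are within `threshold` bits
+    of each other, using a BK-tree for fast Hamming-distance lookups."""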
+    dups = []
+
+    # Index every hash in a BK-tree so that near matches can be found
+    # without comparing each image against all of the others.
+    tree = pybktree.BKTree(pybktree.hamming_distance)
+
+    cprint('Finding fuzzy duplicates; this might take a while...')
+    cnt = 0
+    for document in db.find():
+        tree.add(int(document['hash'], 16))
+        cnt += 1
+
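+    # Hashes that have already been assigned to a duplicate group, so
+    # each cluster is reported only once.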
+    deduplicated = set()
+
+    scanned = 0
+    for document in db.find():
+        # Guard against division by zero when the collection holds
+        # fewer than two documents.
+        cprint("\r%d%%" % (scanned * 100 // max(cnt - 1, 1)), end='')
+        scanned += 1
+        if document['hash'] in deduplicated:
+            continue
+        deduplicated.add(document['hash'])
+        hash_len = len(document['hash'])
+        int_hash = int(document['hash'], 16)
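+        # tree.find() returns (distance, item) tuples for every stored
+        # hash within `threshold` bits of the query, the query itself
+        # included; exact copies collapse into one entry via the set.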
+        similar = set(tree.find(int_hash, threshold))
+        similars = []
+        for _, item_hash in similar:
+            # Convert the integer hash back to the zero-padded hex
+            # string stored in the database.
+            hex_hash = format(item_hash, '0%dx' % hash_len)
+            deduplicated.add(hex_hash)
+            for item in db.find({'hash': hex_hash}):
+                item['file_name'] = item['_id']
+                similars.append(item)
+        # More than one matching file means a duplicate group; exact
+        # copies share a hash, so they are caught here as well.
+        if len(similars) > 1:
+            dups.append(
+                {
+                    '_id': document['hash'],
+                    'total': len(similars),
+                    'items': similars
+                }
+            )
+
+    return dups

 def delete_duplicates(duplicates, db):
     results = [delete_picture(x['file_name'], db)
@@ -355,7 +401,10 @@ def get_capture_time(img): |
     elif args['show']:
         show(db)
     elif args['find']:
-        dups = find(db, args['--match-time'])
+        if args['--threshold'] is not None:
+            dups = find_threshold(db, int(args['--threshold']))
+        else:
+            dups = find(db, args['--match-time'])

         if args['--delete']:
             delete_duplicates(dups, db)
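
For reference, a minimal standalone sketch of the pybktree lookup pattern that find_threshold() relies on; the hash values are made up for illustration:

    import pybktree

    # Index integer hashes in a BK-tree keyed on Hamming distance.
    tree = pybktree.BKTree(pybktree.hamming_distance, [0b1000, 0b1001, 0b0111])

    # find() returns every stored item within the given distance of the
    # query, as (distance, item) tuples sorted by distance.
    print(tree.find(0b1000, 1))  # [(0, 8), (1, 9)]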
|