Added fuzzy image matching with pybktree.

bolshevik · bolshevik · commit 41e02539dc85 · 2019-01-19T19:08:16.000+01:00
diff --git a/README.md b/README.md
@@ -76,7 +76,7 @@ Usage:
     duplicate_finder.py remove <path> ... [--db=<db_path>]
     duplicate_finder.py clear [--db=<db_path>]
     duplicate_finder.py show [--db=<db_path>]
-    duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>] [--db=<db_path>]
+    duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>] [--db=<db_path>] [--threshold=<num>]
     duplicate_finder.py -h | --help
 
 Options:
@@ -88,6 +88,7 @@ Options:
                                files (default: number of CPUs).
 
     find:
+        --threshold=<num>     Image matching threshold: the maximum Hamming distance (number of differing bits) between two image hashes. False positives are possible.
         --print               Only print duplicate files rather than displaying HTML file
         --delete              Move all found duplicate pictures to the trash. This option takes priority over --print.
         --match-time          Adds the extra constraint that duplicate images must have the
@@ -125,7 +126,7 @@ Prints the contents database.
 
 ### Find
 ```bash
-duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>]
+duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>] [--threshold=<num>]
 ```
 
 Finds duplicate pictures that have been hashed. This will find images that have the same hash stored in the database. There are a few options associated with `find`. By default, when this command is run, a webpage is displayed showing duplicate pictures and a server is started that allows for the pictures to be deleted (images are not actually deleted, but moved to a trash folder -- I really don't want you to make a mistake). The first option, **`--print`**, prints all duplicate pictures and does not display a webpage or start the server. **`--delete`** automatically moves all duplicate images found to the trash. Be careful with this one. **`--match-time`** adds the extra constraint that images must have the same EXIF time stamp to be considered duplicate pictures. Last, `--trash=<trash_path>` lets you select a path to where you want files to be put when they are deleted. The default trash location is `./Trash`.
diff --git a/duplicate_finder.py b/duplicate_finder.py
@@ -7,7 +7,7 @@
     duplicate_finder.py remove <path> ... [--db=<db_path>]
     duplicate_finder.py clear [--db=<db_path>]
     duplicate_finder.py show [--db=<db_path>]
-    duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>] [--db=<db_path>]
+    duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=<trash_path>] [--db=<db_path>] [--threshold=<num>] 
     duplicate_finder.py -h | --help
 
 Options:
@@ -19,6 +19,7 @@
                                files (default: number of CPUs).
 
     find:
+        --threshold=<num>     Image matching threshold. Number of different bits in Hamming distance. False positives are possible.
         --print               Only print duplicate files rather than displaying HTML file
         --delete              Move all found duplicate pictures to the trash. This option takes priority over --print.
         --match-time          Adds the extra constraint that duplicate images must have the
@@ -45,7 +46,7 @@
 from PIL import Image, ExifTags
 import pymongo
 from termcolor import cprint
-
+import pybktree
 
 @contextmanager
 def connect_to_db(db_conn_string='./db'):
@@ -244,6 +245,51 @@ def find(db, match_time=False):
 
     return list(dups)
 
+def find_threshold(db, threshold=1):
+    """Group images whose hashes are within `threshold` bits of each other.
+
+    :param db: collection queried with ``find()``; each document is read for
+        its hex-string ``'hash'`` and its ``'_id'``.
+    :param threshold: maximum Hamming distance for two hashes to match.
+    :return: list of ``{'_id', 'total', 'items'}`` dicts, one per group.
+    """
+    cprint('Finding fuzzy duplicates, it might take a while...')
+    # Index every stored hash (as an integer) in a BK-tree.
+    tree = pybktree.BKTree(pybktree.hamming_distance)
+    cnt = 0
+    for document in db.find():
+        tree.add(int(document['hash'], 16))
+        cnt += 1
+
+    dups = []
+    deduplicated = set()  # hashes already assigned to a group
+    # Progress runs from 0% to 100%; max() avoids dividing by zero when the
+    # database holds a single image.
+    last = max(cnt - 1, 1)
+    for scanned, document in enumerate(db.find()):
+        cprint("\r%d%%" % (scanned * 100 / last), end='')
+        doc_hash = document['hash']
+        if doc_hash in deduplicated:
+            continue
+        deduplicated.add(doc_hash)
+        # Identical hashes collapse in the set, so a group needs two distinct
+        # hashes; NOTE(review): exact-only duplicates are therefore skipped.
+        similar = set(tree.find(int(doc_hash, 16), threshold))
+        if len(similar) < 2:
+            continue
+        similars = []
+        for _, item_hash in similar:
+            # Back to hex, zero-padded to the width of the stored strings.
+            item_hash = format(item_hash, '0%dx' % len(doc_hash))
+            deduplicated.add(item_hash)
+            for item in db.find({'hash': item_hash}):
+                item['file_name'] = item['_id']
+                similars.append(item)
+        if similars:
+            dups.append({'_id': doc_hash, 'total': len(similars),
+                         'items': similars})
+
+    return dups
 
 def delete_duplicates(duplicates, db):
     results = [delete_picture(x['file_name'], db)
@@ -355,7 +401,10 @@ def get_capture_time(img):
         elif args['show']:
             show(db)
         elif args['find']:
-            dups = find(db, args['--match-time'])
+            if args['--threshold'] is not None:
+                dups = find_threshold(db, int(args['--threshold']))
+            else:
+                dups = find(db, args['--match-time'])
 
             if args['--delete']:
                 delete_duplicates(dups, db)
diff --git a/requirements.txt b/requirements.txt
@@ -10,3 +10,4 @@ termcolor==1.1.0
 Werkzeug==0.14.1
 Flask-Cors==3.0.3
 dnspython>=1.15.0
+pybktree==1.1
diff --git a/template/index.html b/template/index.html
@@ -1,5 +1,6 @@
 <html>
 <head>
+    <meta charset="UTF-8">
     <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css">
     <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap-theme.min.css">
     <script src="https://code.jquery.com/jquery-2.1.4.min.js"></script>
@@ -9,7 +10,9 @@
 {% macro image(img, size) -%}
 <div class="col-xs-{{ size }}">
     <div class="thumbnail">
-        <img class="img-responsive" src="{{ img['file_name'] }}" alt="{{ img['file_name'] }}">
+        <a href="{{ img['file_name']  }}" target='_blank'>
+            <img class="img-responsive" src="{{ img['file_name'] }}" alt="{{ img['file_name'] }}">
+        </a>
         <div class="caption">
             <h5 class="name">{{ img['file_name'] }}</h5>
             <div class="file-size">{{ img['file_size'] | filesizeformat }}</div>