migrated code to python 3.6.6 and refactored some code to improve it.

Sodawater05 · Sep 18, 2019 · 78dfef0 · 78dfef0
1 parent d2b8761
commit 78dfef0
Show file tree

Hide file tree

Showing 18 changed files with 681 additions and 660 deletions.
diff --git a/dejavu.cnf.SAMPLE b/dejavu.cnf.SAMPLE
@@ -2,7 +2,7 @@
     "database": {
         "host": "127.0.0.1",
         "user": "root",
-        "passwd": "12345678", 
-        "db": "dejavu"
+        "password": "rootpass",
+        "database": "dejavu"
     }
-}
+}
diff --git a/dejavu.py b/dejavu.py
@@ -24,7 +24,7 @@ def init(configpath):
         with open(configpath) as f:
             config = json.load(f)
     except IOError as err:
-        print("Cannot open configuration: %s. Exiting" % (str(err)))
+        print(("Cannot open configuration: %s. Exiting" % (str(err))))
         sys.exit(1)
 
     # create a Dejavu instance
@@ -67,8 +67,8 @@ def init(configpath):
         if len(args.fingerprint) == 2:
             directory = args.fingerprint[0]
             extension = args.fingerprint[1]
-            print("Fingerprinting all .%s files in the %s directory"
-                  % (extension, directory))
+            print(("Fingerprinting all .%s files in the %s directory"
+                  % (extension, directory)))
             djv.fingerprint_directory(directory, ["." + extension], 4)
 
         elif len(args.fingerprint) == 1:

diff --git a/dejavu/__init__.py b/dejavu/__init__.py
@@ -1,28 +1,23 @@
-from dejavu.database import get_database, Database
-import dejavu.decoder as decoder
-import fingerprint
 import multiprocessing
 import os
-import traceback
 import sys
+import traceback
 
+import dejavu.decoder as decoder
+from dejavu.config.config import (CONFIDENCE, DEFAULT_FS,
+                                  DEFAULT_OVERLAP_RATIO, DEFAULT_WINDOW_SIZE,
+                                  FIELD_FILE_SHA1, OFFSET, OFFSET_SECS,
+                                  SONG_ID, SONG_NAME, TOPN)
+from dejavu.database import get_database
+from dejavu.fingerprint import fingerprint
 
-class Dejavu(object):
-
-    SONG_ID = "song_id"
-    SONG_NAME = 'song_name'
-    CONFIDENCE = 'confidence'
-    MATCH_TIME = 'match_time'
-    OFFSET = 'offset'
-    OFFSET_SECS = 'offset_seconds'
 
+class Dejavu:
     def __init__(self, config):
-        super(Dejavu, self).__init__()
-
         self.config = config
 
         # initialize db
-        db_cls = get_database(config.get("database_type", None))
+        db_cls = get_database(config.get("database_type", "mysql").lower())
 
         self.db = db_cls(**config.get("database", {}))
         self.db.setup()
@@ -39,7 +34,7 @@ def get_fingerprinted_songs(self):
         self.songs = self.db.get_songs()
         self.songhashes_set = set()  # to know which ones we've computed before
         for song in self.songs:
-            song_hash = song[Database.FIELD_FILE_SHA1]
+            song_hash = song[FIELD_FILE_SHA1]
             self.songhashes_set.add(song_hash)
 
     def fingerprint_directory(self, path, extensions, nprocesses=None):
@@ -55,26 +50,23 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):
 
         filenames_to_fingerprint = []
         for filename, _ in decoder.find_files(path, extensions):
-
             # don't refingerprint already fingerprinted files
             if decoder.unique_hash(filename) in self.songhashes_set:
-                print "%s already fingerprinted, continuing..." % filename
+                print(f"{filename} already fingerprinted, continuing...")
                 continue
 
             filenames_to_fingerprint.append(filename)
 
         # Prepare _fingerprint_worker input
-        worker_input = zip(filenames_to_fingerprint,
-                           [self.limit] * len(filenames_to_fingerprint))
+        worker_input = list(zip(filenames_to_fingerprint, [self.limit] * len(filenames_to_fingerprint)))
 
         # Send off our tasks
-        iterator = pool.imap_unordered(_fingerprint_worker,
-                                       worker_input)
+        iterator = pool.imap_unordered(_fingerprint_worker, worker_input)
 
         # Loop till we have all of them
         while True:
             try:
-                song_name, hashes, file_hash = iterator.next()
+                song_name, hashes, file_hash = next(iterator)
             except multiprocessing.TimeoutError:
                 continue
             except StopIteration:
@@ -99,7 +91,7 @@ def fingerprint_file(self, filepath, song_name=None):
         song_name = song_name or songname
         # don't refingerprint already fingerprinted files
         if song_hash in self.songhashes_set:
-            print "%s already fingerprinted, continuing..." % song_name
+            print(f"{song_name} already fingerprinted, continuing...")
         else:
             song_name, hashes, file_hash = _fingerprint_worker(
                 filepath,
@@ -112,22 +104,21 @@ def fingerprint_file(self, filepath, song_name=None):
             self.db.set_song_fingerprinted(sid)
             self.get_fingerprinted_songs()
 
-    def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
-        hashes = fingerprint.fingerprint(samples, Fs=Fs)
+    def find_matches(self, samples, Fs=DEFAULT_FS):
+        hashes = fingerprint(samples, Fs=Fs)
         return self.db.return_matches(hashes)
 
-    def align_matches(self, matches):
+    def align_matches(self, matches, topn=TOPN):
         """
             Finds hash matches that align in time with other matches and finds
             consensus about which hashes are "true" signal from the audio.
 
-            Returns a dictionary with match information.
+            Returns a list of dictionaries (based on topn) with match information.
         """
         # align by diffs
         diff_counter = {}
-        largest = 0
         largest_count = 0
-        song_id = -1
+
         for tup in matches:
             sid, diff = tup
             if diff not in diff_counter:
@@ -137,30 +128,65 @@ def align_matches(self, matches):
             diff_counter[diff][sid] += 1
 
             if diff_counter[diff][sid] > largest_count:
-                largest = diff
                 largest_count = diff_counter[diff][sid]
-                song_id = sid
 
-        # extract idenfication
-        song = self.db.get_song_by_id(song_id)
-        if song:
-            # TODO: Clarify what `get_song_by_id` should return.
-            songname = song.get(Dejavu.SONG_NAME, None)
-        else:
-            return None
-
-        # return match info
-        nseconds = round(float(largest) / fingerprint.DEFAULT_FS *
-                         fingerprint.DEFAULT_WINDOW_SIZE *
-                         fingerprint.DEFAULT_OVERLAP_RATIO, 5)
-        song = {
-            Dejavu.SONG_ID : song_id,
-            Dejavu.SONG_NAME : songname.encode("utf8"),
-            Dejavu.CONFIDENCE : largest_count,
-            Dejavu.OFFSET : int(largest),
-            Dejavu.OFFSET_SECS : nseconds,
-            Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None).encode("utf8"),}
-        return song
+        # create dic where key are songs ids
+        songs_num_matches = {}
+        for dc in diff_counter:
+            for sid in diff_counter[dc]:
+                match_val = diff_counter[dc][sid]
+                if (sid not in songs_num_matches) or (match_val > songs_num_matches[sid]['value']):
+                    songs_num_matches[sid] = {
+                        'sid': sid,
+                        'value': match_val,
+                        'largest': dc
+                    }
+
+        # use dicc of songs to create an ordered (descending) list using the match value property assigned to each song
+        songs_num_matches_list = []
+        for s in songs_num_matches:
+            songs_num_matches_list.append({
+                'sid': s,
+                'object': songs_num_matches[s]
+            })
+
+        songs_num_matches_list_ordered = sorted(songs_num_matches_list, key=lambda x: x['object']['value'],
+                                                reverse=True)
+
+        # iterate the ordered list and fill results
+        songs_result = []
+        for s in songs_num_matches_list_ordered:
+
+            # get expected variable by the original code
+            song_id = s['object']['sid']
+            largest = s['object']['largest']
+            largest_count = s['object']['value']
+
+            # extract identification
+            song = self.db.get_song_by_id(song_id)
+            if song:
+                # TODO: Clarify what `get_song_by_id` should return.
+                songname = song.get(SONG_NAME, None)
+
+                # return match info
+                nseconds = round(float(largest) / DEFAULT_FS *
+                                 DEFAULT_WINDOW_SIZE *
+                                 DEFAULT_OVERLAP_RATIO, 5)
+                song = {
+                    SONG_ID: song_id,
+                    SONG_NAME: songname.encode("utf8"),
+                    CONFIDENCE: largest_count,
+                    OFFSET: int(largest),
+                    OFFSET_SECS: nseconds,
+                    FIELD_FILE_SHA1: song.get(FIELD_FILE_SHA1, None).encode("utf8")
+                }
+
+                songs_result.append(song)
+
+                # only consider up to topn elements in the result
+                if len(songs_result) > topn:
+                    break
+        return songs_result
 
     def recognize(self, recognizer, *options, **kwoptions):
         r = recognizer(self)
@@ -177,26 +203,15 @@ def _fingerprint_worker(filename, limit=None, song_name=None):
 
     songname, extension = os.path.splitext(os.path.basename(filename))
     song_name = song_name or songname
-    channels, Fs, file_hash = decoder.read(filename, limit)
+    channels, fs, file_hash = decoder.read(filename, limit)
     result = set()
     channel_amount = len(channels)
 
     for channeln, channel in enumerate(channels):
         # TODO: Remove prints or change them into optional logging.
-        print("Fingerprinting channel %d/%d for %s" % (channeln + 1,
-                                                       channel_amount,
-                                                       filename))
-        hashes = fingerprint.fingerprint(channel, Fs=Fs)
-        print("Finished channel %d/%d for %s" % (channeln + 1, channel_amount,
-                                                 filename))
+        print(f"Fingerprinting channel {channeln + 1}/{channel_amount} for {filename}")
+        hashes = fingerprint(channel, Fs=fs)
+        print(f"Finished channel {channeln + 1}/{channel_amount} for {filename}")
         result |= set(hashes)
 
     return song_name, result, file_hash
-
-
-def chunkify(lst, n):
-    """
-    Splits a list into roughly n equal parts.
-    http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
-    """
-    return [lst[i::n] for i in xrange(n)]
diff --git a/dejavu/config/config.py b/dejavu/config/config.py
@@ -0,0 +1,74 @@
+# Dejavu
+SONG_ID = "song_id"
+SONG_NAME = 'song_name'
+CONFIDENCE = 'confidence'
+MATCH_TIME = 'match_time'
+OFFSET = 'offset'
+OFFSET_SECS = 'offset_seconds'
+
+# DATABASE CLASS INSTANCES:
+DATABASES = {
+    'mysql': ("dejavu.database_handler.mysql_database", "MySQLDatabase")
+}
+
+# TABLE SONGS
+SONGS_TABLENAME = "songs"
+
+# SONGS FIELDS
+FIELD_SONG_ID = 'song_id'
+FIELD_SONGNAME = 'song_name'
+FIELD_FINGERPRINTED = "fingerprinted"
+FIELD_FILE_SHA1 = 'file_sha1'
+
+# TABLE FINGERPRINTS
+FINGERPRINTS_TABLENAME = "fingerprints"
+
+# FINGERPRINTS FIELDS
+FIELD_HASH = 'hash'
+FIELD_OFFSET = 'offset'
+
+# FINGERPRINTS CONFIG:
+# Sampling rate, related to the Nyquist conditions, which affects
+# the range frequencies we can detect.
+DEFAULT_FS = 44100
+
+# Size of the FFT window, affects frequency granularity
+DEFAULT_WINDOW_SIZE = 4096
+
+# Ratio by which each sequential window overlaps the last and the
+# next window. Higher overlap will allow a higher granularity of offset
+# matching, but potentially more fingerprints.
+DEFAULT_OVERLAP_RATIO = 0.5
+
+# Degree to which a fingerprint can be paired with its neighbors --
+# higher will cause more fingerprints, but potentially better accuracy.
+DEFAULT_FAN_VALUE = 15
+
+# Minimum amplitude in spectrogram in order to be considered a peak.
+# This can be raised to reduce number of fingerprints, but can negatively
+# affect accuracy.
+DEFAULT_AMP_MIN = 10
+
+# Number of cells around an amplitude peak in the spectrogram in order
+# for Dejavu to consider it a spectral peak. Higher values mean less
+# fingerprints and faster matching, but can potentially affect accuracy.
+PEAK_NEIGHBORHOOD_SIZE = 20
+
+# Thresholds on how close or far fingerprints can be in time in order
+# to be paired as a fingerprint. If your max is too low, higher values of
+# DEFAULT_FAN_VALUE may not perform as expected.
+MIN_HASH_TIME_DELTA = 0
+MAX_HASH_TIME_DELTA = 200
+
+# If True, will sort peaks temporally for fingerprinting;
+# not sorting will cut down number of fingerprints, but potentially
+# affect performance.
+PEAK_SORT = True
+
+# Number of bits to grab from the front of the SHA1 hash in the
+# fingerprint calculation. The more you grab, the more memory storage,
+# with potentially lesser collisions of matches.
+FINGERPRINT_REDUCTION = 20
+
+# Number of results being returned for file recognition
+TOPN = 2
diff --git a/dejavu/database.py b/dejavu/database.py
@@ -1,22 +1,15 @@
-from __future__ import absolute_import
 import abc
+import importlib
+from dejavu.config.config import DATABASES
 
 
-class Database(object):
-    __metaclass__ = abc.ABCMeta
-
-    FIELD_FILE_SHA1 = 'file_sha1'
-    FIELD_SONG_ID = 'song_id'
-    FIELD_SONGNAME = 'song_name'
-    FIELD_OFFSET = 'offset'
-    FIELD_HASH = 'hash'
-
+class Database(object, metaclass=abc.ABCMeta):
     # Name of your Database subclass, this is used in configuration
     # to refer to your class
     type = None
 
     def __init__(self):
-        super(Database, self).__init__()
+        super().__init__()
 
     def before_fork(self):
         """
@@ -159,18 +152,11 @@ def return_matches(self, hashes):
         pass
 
 
-def get_database(database_type=None):
-    # Default to using the mysql database
-    database_type = database_type or "mysql"
-    # Lower all the input.
-    database_type = database_type.lower()
-
-    for db_cls in Database.__subclasses__():
-        if db_cls.type == database_type:
-            return db_cls
-
-    raise TypeError("Unsupported database type supplied.")
-
-
-# Import our default database handler
-import dejavu.database_sql
+def get_database(database_type="mysql"):
+    path, db_class_name = DATABASES[database_type]
+    try:
+        db_module = importlib.import_module(path)
+        db_class = getattr(db_module, db_class_name)
+        return db_class
+    except ImportError:
+        raise TypeError("Unsupported database type supplied.")
diff --git a/dejavu/database_handler/__init__.py b/dejavu/database_handler/__init__.py