Skip to content

Commit

Permalink
migrated code to python 3.6.6 and refactored some code to improve it.
Browse files Browse the repository at this point in the history
  • Loading branch information
mauricio-repetto committed Sep 18, 2019
1 parent d2b8761 commit 78dfef0
Show file tree
Hide file tree
Showing 18 changed files with 681 additions and 660 deletions.
6 changes: 3 additions & 3 deletions dejavu.cnf.SAMPLE
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"database": {
"host": "127.0.0.1",
"user": "root",
"passwd": "12345678",
"db": "dejavu"
"password": "rootpass",
"database": "dejavu"
}
}
}
6 changes: 3 additions & 3 deletions dejavu.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def init(configpath):
with open(configpath) as f:
config = json.load(f)
except IOError as err:
print("Cannot open configuration: %s. Exiting" % (str(err)))
print(("Cannot open configuration: %s. Exiting" % (str(err))))
sys.exit(1)

# create a Dejavu instance
Expand Down Expand Up @@ -67,8 +67,8 @@ def init(configpath):
if len(args.fingerprint) == 2:
directory = args.fingerprint[0]
extension = args.fingerprint[1]
print("Fingerprinting all .%s files in the %s directory"
% (extension, directory))
print(("Fingerprinting all .%s files in the %s directory"
% (extension, directory)))
djv.fingerprint_directory(directory, ["." + extension], 4)

elif len(args.fingerprint) == 1:
Expand Down
149 changes: 82 additions & 67 deletions dejavu/__init__.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,23 @@
from dejavu.database import get_database, Database
import dejavu.decoder as decoder
import fingerprint
import multiprocessing
import os
import traceback
import sys
import traceback

import dejavu.decoder as decoder
from dejavu.config.config import (CONFIDENCE, DEFAULT_FS,
DEFAULT_OVERLAP_RATIO, DEFAULT_WINDOW_SIZE,
FIELD_FILE_SHA1, OFFSET, OFFSET_SECS,
SONG_ID, SONG_NAME, TOPN)
from dejavu.database import get_database
from dejavu.fingerprint import fingerprint

class Dejavu(object):

SONG_ID = "song_id"
SONG_NAME = 'song_name'
CONFIDENCE = 'confidence'
MATCH_TIME = 'match_time'
OFFSET = 'offset'
OFFSET_SECS = 'offset_seconds'

class Dejavu:
def __init__(self, config):
super(Dejavu, self).__init__()

self.config = config

# initialize db
db_cls = get_database(config.get("database_type", None))
db_cls = get_database(config.get("database_type", "mysql").lower())

self.db = db_cls(**config.get("database", {}))
self.db.setup()
Expand All @@ -39,7 +34,7 @@ def get_fingerprinted_songs(self):
self.songs = self.db.get_songs()
self.songhashes_set = set() # to know which ones we've computed before
for song in self.songs:
song_hash = song[Database.FIELD_FILE_SHA1]
song_hash = song[FIELD_FILE_SHA1]
self.songhashes_set.add(song_hash)

def fingerprint_directory(self, path, extensions, nprocesses=None):
Expand All @@ -55,26 +50,23 @@ def fingerprint_directory(self, path, extensions, nprocesses=None):

filenames_to_fingerprint = []
for filename, _ in decoder.find_files(path, extensions):

# don't refingerprint already fingerprinted files
if decoder.unique_hash(filename) in self.songhashes_set:
print "%s already fingerprinted, continuing..." % filename
print(f"{filename} already fingerprinted, continuing...")
continue

filenames_to_fingerprint.append(filename)

# Prepare _fingerprint_worker input
worker_input = zip(filenames_to_fingerprint,
[self.limit] * len(filenames_to_fingerprint))
worker_input = list(zip(filenames_to_fingerprint, [self.limit] * len(filenames_to_fingerprint)))

# Send off our tasks
iterator = pool.imap_unordered(_fingerprint_worker,
worker_input)
iterator = pool.imap_unordered(_fingerprint_worker, worker_input)

# Loop till we have all of them
while True:
try:
song_name, hashes, file_hash = iterator.next()
song_name, hashes, file_hash = next(iterator)
except multiprocessing.TimeoutError:
continue
except StopIteration:
Expand All @@ -99,7 +91,7 @@ def fingerprint_file(self, filepath, song_name=None):
song_name = song_name or songname
# don't refingerprint already fingerprinted files
if song_hash in self.songhashes_set:
print "%s already fingerprinted, continuing..." % song_name
print(f"{song_name} already fingerprinted, continuing...")
else:
song_name, hashes, file_hash = _fingerprint_worker(
filepath,
Expand All @@ -112,22 +104,21 @@ def fingerprint_file(self, filepath, song_name=None):
self.db.set_song_fingerprinted(sid)
self.get_fingerprinted_songs()

def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS):
hashes = fingerprint.fingerprint(samples, Fs=Fs)
def find_matches(self, samples, Fs=DEFAULT_FS):
hashes = fingerprint(samples, Fs=Fs)
return self.db.return_matches(hashes)

def align_matches(self, matches):
def align_matches(self, matches, topn=TOPN):
"""
Finds hash matches that align in time with other matches and finds
consensus about which hashes are "true" signal from the audio.
Returns a dictionary with match information.
Returns a list of dictionaries (based on topn) with match information.
"""
# align by diffs
diff_counter = {}
largest = 0
largest_count = 0
song_id = -1

for tup in matches:
sid, diff = tup
if diff not in diff_counter:
Expand All @@ -137,30 +128,65 @@ def align_matches(self, matches):
diff_counter[diff][sid] += 1

if diff_counter[diff][sid] > largest_count:
largest = diff
largest_count = diff_counter[diff][sid]
song_id = sid

# extract identification
song = self.db.get_song_by_id(song_id)
if song:
# TODO: Clarify what `get_song_by_id` should return.
songname = song.get(Dejavu.SONG_NAME, None)
else:
return None

# return match info
nseconds = round(float(largest) / fingerprint.DEFAULT_FS *
fingerprint.DEFAULT_WINDOW_SIZE *
fingerprint.DEFAULT_OVERLAP_RATIO, 5)
song = {
Dejavu.SONG_ID : song_id,
Dejavu.SONG_NAME : songname.encode("utf8"),
Dejavu.CONFIDENCE : largest_count,
Dejavu.OFFSET : int(largest),
Dejavu.OFFSET_SECS : nseconds,
Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None).encode("utf8"),}
return song
# create a dict keyed by song id
songs_num_matches = {}
for dc in diff_counter:
for sid in diff_counter[dc]:
match_val = diff_counter[dc][sid]
if (sid not in songs_num_matches) or (match_val > songs_num_matches[sid]['value']):
songs_num_matches[sid] = {
'sid': sid,
'value': match_val,
'largest': dc
}

# use the dict of songs to create a list ordered (descending) by the match value assigned to each song
songs_num_matches_list = []
for s in songs_num_matches:
songs_num_matches_list.append({
'sid': s,
'object': songs_num_matches[s]
})

songs_num_matches_list_ordered = sorted(songs_num_matches_list, key=lambda x: x['object']['value'],
reverse=True)

# iterate the ordered list and fill results
songs_result = []
for s in songs_num_matches_list_ordered:

# get expected variable by the original code
song_id = s['object']['sid']
largest = s['object']['largest']
largest_count = s['object']['value']

# extract identification
song = self.db.get_song_by_id(song_id)
if song:
# TODO: Clarify what `get_song_by_id` should return.
songname = song.get(SONG_NAME, None)

# return match info
nseconds = round(float(largest) / DEFAULT_FS *
DEFAULT_WINDOW_SIZE *
DEFAULT_OVERLAP_RATIO, 5)
song = {
SONG_ID: song_id,
SONG_NAME: songname.encode("utf8"),
CONFIDENCE: largest_count,
OFFSET: int(largest),
OFFSET_SECS: nseconds,
FIELD_FILE_SHA1: song.get(FIELD_FILE_SHA1, None).encode("utf8")
}

songs_result.append(song)

# only consider up to topn elements in the result
if len(songs_result) > topn:
break
return songs_result

def recognize(self, recognizer, *options, **kwoptions):
r = recognizer(self)
Expand All @@ -177,26 +203,15 @@ def _fingerprint_worker(filename, limit=None, song_name=None):

songname, extension = os.path.splitext(os.path.basename(filename))
song_name = song_name or songname
channels, Fs, file_hash = decoder.read(filename, limit)
channels, fs, file_hash = decoder.read(filename, limit)
result = set()
channel_amount = len(channels)

for channeln, channel in enumerate(channels):
# TODO: Remove prints or change them into optional logging.
print("Fingerprinting channel %d/%d for %s" % (channeln + 1,
channel_amount,
filename))
hashes = fingerprint.fingerprint(channel, Fs=Fs)
print("Finished channel %d/%d for %s" % (channeln + 1, channel_amount,
filename))
print(f"Fingerprinting channel {channeln + 1}/{channel_amount} for {filename}")
hashes = fingerprint(channel, Fs=fs)
print(f"Finished channel {channeln + 1}/{channel_amount} for {filename}")
result |= set(hashes)

return song_name, result, file_hash


def chunkify(lst, n):
    """
    Splits a list into n interleaved parts of roughly equal size.

    Part i holds every n-th element starting at index i (``lst[i::n]``), so
    elements are dealt out round-robin rather than cut into contiguous runs.
    http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts

    :param lst: sequence to split; it must support extended slicing.
    :param n: number of parts to produce.
    :return: a list of n slices of lst (an empty list when n is not positive).
    """
    # range replaces the Python 2-only xrange, which is a NameError on Python 3.
    return [lst[i::n] for i in range(n)]
74 changes: 74 additions & 0 deletions dejavu/config/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Dejavu: keys of the result dictionaries (see Dejavu.align_matches).
SONG_ID = "song_id"
SONG_NAME = "song_name"
CONFIDENCE = "confidence"
MATCH_TIME = "match_time"
OFFSET = "offset"
OFFSET_SECS = "offset_seconds"

# Database handlers: type name -> (module path, class name).
# get_database imports the module on demand and returns the named class.
DATABASES = {
    "mysql": ("dejavu.database_handler.mysql_database", "MySQLDatabase"),
}

# Songs table name.
SONGS_TABLENAME = "songs"

# Songs table fields.
FIELD_SONG_ID = "song_id"
FIELD_SONGNAME = "song_name"
FIELD_FINGERPRINTED = "fingerprinted"
FIELD_FILE_SHA1 = "file_sha1"

# Fingerprints table name.
FINGERPRINTS_TABLENAME = "fingerprints"

# Fingerprints table fields.
FIELD_HASH = "hash"
FIELD_OFFSET = "offset"

# Fingerprinting configuration.

# Sampling rate. It is tied to the Nyquist conditions and therefore
# bounds the range of frequencies that can be detected.
DEFAULT_FS = 44100

# FFT window size; controls frequency granularity.
DEFAULT_WINDOW_SIZE = 4096

# Fraction by which consecutive windows overlap. A higher overlap gives
# finer-grained offset matching, at the cost of potentially more
# fingerprints.
DEFAULT_OVERLAP_RATIO = 0.5

# How many neighbors a fingerprint may be paired with. Higher values
# produce more fingerprints, but potentially better accuracy.
DEFAULT_FAN_VALUE = 15

# Minimum spectrogram amplitude for a point to count as a peak. Raising
# it reduces the number of fingerprints, but can hurt accuracy.
DEFAULT_AMP_MIN = 10

# Number of cells around an amplitude peak in the spectrogram that Dejavu
# inspects before treating it as a spectral peak. Higher values mean
# fewer fingerprints and faster matching, but can affect accuracy.
PEAK_NEIGHBORHOOD_SIZE = 20

# Bounds on how close or far apart in time two peaks may be and still be
# paired into a fingerprint. If the maximum is too low, higher values of
# DEFAULT_FAN_VALUE may not perform as expected.
MIN_HASH_TIME_DELTA = 0
MAX_HASH_TIME_DELTA = 200

# When True, peaks are sorted temporally before fingerprinting. Not
# sorting cuts down the number of fingerprints, but can potentially
# affect performance.
PEAK_SORT = True

# Number of bits taken from the front of the SHA1 hash in the fingerprint
# calculation. Taking more costs more storage, with potentially fewer
# match collisions.
FINGERPRINT_REDUCTION = 20

# Number of results returned for file recognition.
TOPN = 2
38 changes: 12 additions & 26 deletions dejavu/database.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,15 @@
from __future__ import absolute_import
import abc
import importlib
from dejavu.config.config import DATABASES


class Database(object):
__metaclass__ = abc.ABCMeta

FIELD_FILE_SHA1 = 'file_sha1'
FIELD_SONG_ID = 'song_id'
FIELD_SONGNAME = 'song_name'
FIELD_OFFSET = 'offset'
FIELD_HASH = 'hash'

class Database(object, metaclass=abc.ABCMeta):
# Name of your Database subclass, this is used in configuration
# to refer to your class
type = None

def __init__(self):
super(Database, self).__init__()
super().__init__()

def before_fork(self):
"""
Expand Down Expand Up @@ -159,18 +152,11 @@ def return_matches(self, hashes):
pass


def get_database(database_type=None):
    """
    Return the Database subclass whose ``type`` attribute matches the name.

    :param database_type: name of the database type; a falsy value selects
        "mysql". The name is lower-cased before comparison.
    :return: the first direct subclass of Database with a matching ``type``.
    :raises TypeError: if no direct subclass of Database matches.
    """
    wanted = (database_type or "mysql").lower()

    # Only direct subclasses are considered, in the order __subclasses__ yields them.
    found = next(
        (candidate for candidate in Database.__subclasses__()
         if candidate.type == wanted),
        None,
    )
    if found is None:
        raise TypeError("Unsupported database type supplied.")
    return found


# Import our default database handler
import dejavu.database_sql
def get_database(database_type="mysql"):
    """
    Resolve a database type name to its handler class.

    The name is looked up in DATABASES, which maps it to a
    (module path, class name) pair; the module is imported on demand and the
    named attribute is returned.

    :param database_type: name of the database type. A falsy value selects
        "mysql"; the name is lower-cased before the lookup.
    :return: the handler class (not an instance).
    :raises TypeError: if the name is not registered in DATABASES or the
        handler module cannot be imported.
    """
    # Normalise here too, so direct callers may pass None or mixed case.
    database_type = (database_type or "mysql").lower()

    try:
        path, db_class_name = DATABASES[database_type]
    except KeyError as err:
        # An unregistered name used to escape as a bare KeyError because the
        # lookup sat outside the try block.
        raise TypeError("Unsupported database type supplied.") from err

    try:
        db_module = importlib.import_module(path)
    except ImportError as err:
        raise TypeError("Unsupported database type supplied.") from err

    return getattr(db_module, db_class_name)
Empty file.
Loading

0 comments on commit 78dfef0

Please sign in to comment.