Skip to content

Commit ce00692

Browse files
committed
Merge pull request librosa#279 from bmcfee/complex-cqt
Further CQT enhancements
2 parents 015a8fe + 9770d06 commit ce00692

File tree

9 files changed

+168
-100
lines changed

9 files changed

+168
-100
lines changed

librosa/core/constantq.py

Lines changed: 97 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import numpy as np
77
import scipy.fftpack as fft
8+
from warnings import warn
89

910
from . import audio
1011
from .time_frequency import cqt_frequencies, note_to_hz
@@ -20,8 +21,9 @@
2021

2122
@cache
2223
def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
23-
bins_per_octave=12, tuning=None, resolution=2,
24-
aggregate=None, norm=1, sparsity=0.01):
24+
bins_per_octave=12, tuning=None, filter_scale=2,
25+
aggregate=None, norm=1, sparsity=0.01, real=True,
26+
resolution=util.Deprecated()):
2527
'''Compute the constant-Q transform of an audio signal.
2628
2729
This implementation is based on the recursive sub-sampling method
@@ -56,8 +58,9 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
5658
5759
If `None`, tuning will be automatically estimated.
5860
59-
resolution : float > 0
60-
Filter resolution factor. Larger values use longer windows.
61+
filter_scale : float > 0
62+
Filter scale factor. Small values (<1) use shorter windows
63+
for improved time resolution.
6164
6265
aggregate : None or function
6366
Aggregation function for time-oversampling energy aggregation.
@@ -73,15 +76,25 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
7376
7477
Set `sparsity=0` to disable sparsification.
7578
79+
real : bool
80+
If true, return only the magnitude of the CQT.
81+
82+
resolution : float
83+
.. warning:: This parameter name was deprecated in librosa 0.4.2
84+
Use the `filter_scale` parameter instead.
85+
The `resolution` parameter will be removed in librosa 0.5.0.
86+
87+
7688
Returns
7789
-------
78-
CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
79-
Constant-Q energy for each frequency at each time.
90+
CQT : np.ndarray [shape=(n_bins, t), dtype=np.complex or np.float]
91+
Constant-Q value for each frequency at each time.
8092
8193
Raises
8294
------
8395
ParameterError
84-
If `hop_length` is not an integer multiple of `2**(n_bins / bins_per_octave)`
96+
If `hop_length` is not an integer multiple of
97+
`2**(ceil(n_bins / bins_per_octave) - 1)`
8598
8699
See Also
87100
--------
@@ -115,7 +128,7 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
115128
[ 2.363e-07, 5.329e-07, ..., 1.294e-07, 1.611e-07]])
116129
117130
118-
Using a higher resolution
131+
Using a higher frequency resolution
119132
120133
>>> C = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
121134
... n_bins=60 * 2, bins_per_octave=12 * 2)
@@ -127,10 +140,20 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
127140
[ 4.896e-08, 5.407e-07, ..., 9.176e-08, 1.051e-07]])
128141
'''
129142

143+
filter_scale = util.rename_kw('resolution', resolution,
144+
'filter_scale', filter_scale,
145+
'0.4.2', '0.5.0')
146+
147+
if real:
148+
warn('Real-valued CQT (real=True) is deprecated in 0.4.2. '
149+
'Complex-valued CQT will become the default in 0.5.0. '
150+
'Consider using np.abs(librosa.cqt(..., real=False)) '
151+
'instead of real=True to maintain forward compatibility.',
152+
DeprecationWarning)
153+
130154
# How many octaves are we dealing with?
131155
n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
132156

133-
134157
if fmin is None:
135158
# C1 by default
136159
fmin = note_to_hz('C1')
@@ -146,7 +169,7 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
146169
fmax_t = np.max(freqs)
147170

148171
# Determine required resampling quality
149-
Q = float(resolution) / (2.0**(1. / bins_per_octave) - 1)
172+
Q = float(filter_scale) / (2.0**(1. / bins_per_octave) - 1)
150173

151174
filter_cutoff = fmax_t * (1 + filters.window_bandwidth('hann') / Q)
152175

@@ -176,7 +199,7 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
176199
n_filters,
177200
bins_per_octave,
178201
tuning,
179-
resolution,
202+
filter_scale,
180203
norm,
181204
sparsity)
182205
min_filter_length = np.min(filter_lengths)
@@ -203,15 +226,16 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
203226
# Make sure our hop is long enough to support the bottom octave
204227
num_twos = __num_two_factors(hop_length)
205228
if num_twos < n_octaves - 1:
206-
raise ParameterError('hop_length must be a positive integer multiple of 2^{0:d} '
207-
'for {1:d}-octave CQT'.format(n_octaves - 1, n_octaves))
229+
raise ParameterError('hop_length must be a positive integer '
230+
'multiple of 2^{0:d} for {1:d}-octave CQT'
231+
.format(n_octaves - 1, n_octaves))
208232

209233
# Now do the recursive bit
210234
fft_basis, n_fft, filter_lengths = __fft_filters(sr, fmin_t,
211235
n_filters,
212236
bins_per_octave,
213237
tuning,
214-
resolution,
238+
filter_scale,
215239
norm,
216240
sparsity)
217241

@@ -239,13 +263,14 @@ def cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
239263
# Convolve
240264
cqt_resp.append(my_cqt)
241265

242-
return __trim_stack(cqt_resp, n_bins)
266+
return __trim_stack(cqt_resp, n_bins, real)
243267

244268

245269
@cache
246270
def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
247-
bins_per_octave=12, tuning=None, resolution=2,
248-
norm=1, sparsity=0.01):
271+
bins_per_octave=12, tuning=None, filter_scale=2,
272+
norm=1, sparsity=0.01,
273+
resolution=util.Deprecated()):
249274
'''Compute the hybrid constant-Q transform of an audio signal.
250275
251276
Here, the hybrid CQT uses the pseudo CQT for higher frequencies where
@@ -277,15 +302,21 @@ def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
277302
278303
If `None`, tuning will be automatically estimated.
279304
280-
resolution : float > 0
281-
Filter resolution factor. Larger values use longer windows.
305+
filter_scale : float > 0
306+
Filter scale factor. Larger values use longer windows.
282307
283308
sparsity : float in [0, 1)
284309
Sparsify the CQT basis by discarding up to `sparsity`
285310
fraction of the energy in each basis.
286311
287312
Set `sparsity=0` to disable sparsification.
288313
314+
resolution : float
315+
.. warning:: This parameter name was deprecated in librosa 0.4.2
316+
Use the `filter_scale` parameter instead.
317+
The `resolution` parameter will be removed in librosa 0.5.0.
318+
319+
289320
Returns
290321
-------
291322
CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
@@ -294,16 +325,18 @@ def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
294325
Raises
295326
------
296327
ParameterError
297-
If `hop_length` is not an integer multiple of `2**(n_bins / bins_per_octave)`
328+
If `hop_length` is not an integer multiple of
329+
`2**(n_bins / bins_per_octave)`
298330
299331
See Also
300332
--------
301333
cqt
302334
pseudo_cqt
303335
'''
304336

305-
# How many octaves are we dealing with?
306-
n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
337+
filter_scale = util.rename_kw('resolution', resolution,
338+
'filter_scale', filter_scale,
339+
'0.4.2', '0.5.0')
307340

308341
if fmin is None:
309342
# C1 by default
@@ -322,7 +355,7 @@ def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
322355
n_bins=n_bins,
323356
bins_per_octave=bins_per_octave,
324357
tuning=tuning,
325-
resolution=resolution)
358+
filter_scale=filter_scale)
326359

327360
# Determine which filters to use with Pseudo CQT
328361
pseudo_filters = lengths < 2*hop_length
@@ -338,7 +371,7 @@ def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
338371
n_bins=n_bins_pseudo,
339372
bins_per_octave=bins_per_octave,
340373
tuning=tuning,
341-
resolution=resolution,
374+
filter_scale=filter_scale,
342375
norm=norm,
343376
sparsity=sparsity)
344377
cqt_resp.append(my_pseudo_cqt)
@@ -349,25 +382,27 @@ def hybrid_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
349382

350383
fmin_full = np.min(freqs[~pseudo_filters])
351384

352-
my_cqt = cqt(y, sr,
353-
hop_length=hop_length,
354-
fmin=fmin_full,
355-
n_bins=n_bins_full,
356-
bins_per_octave=bins_per_octave,
357-
tuning=tuning,
358-
resolution=resolution,
359-
norm=norm,
360-
sparsity=sparsity)
385+
my_cqt = np.abs(cqt(y, sr,
386+
hop_length=hop_length,
387+
fmin=fmin_full,
388+
n_bins=n_bins_full,
389+
bins_per_octave=bins_per_octave,
390+
tuning=tuning,
391+
filter_scale=filter_scale,
392+
norm=norm,
393+
sparsity=sparsity,
394+
real=False))
361395

362396
cqt_resp.append(my_cqt)
363397

364-
return __trim_stack(cqt_resp, n_bins)
398+
return __trim_stack(cqt_resp, n_bins, True)
365399

366400

367401
@cache
368402
def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
369-
bins_per_octave=12, tuning=None, resolution=2,
370-
norm=1, sparsity=0.01):
403+
bins_per_octave=12, tuning=None, filter_scale=2,
404+
norm=1, sparsity=0.01,
405+
resolution=util.Deprecated()):
371406
'''Compute the pseudo constant-Q transform of an audio signal.
372407
373408
This uses a single fft size that is the smallest power of 2 that is greater
@@ -401,15 +436,21 @@ def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
401436
402437
If `None`, tuning will be automatically estimated.
403438
404-
resolution : float > 0
405-
Filter resolution factor. Larger values use longer windows.
439+
filter_scale : float > 0
440+
Filter scale factor. Larger values use longer windows.
406441
407442
sparsity : float in [0, 1)
408443
Sparsify the CQT basis by discarding up to `sparsity`
409444
fraction of the energy in each basis.
410445
411446
Set `sparsity=0` to disable sparsification.
412447
448+
resolution : float
449+
.. warning:: This parameter name was deprecated in librosa 0.4.2
450+
Use the `filter_scale` parameter instead.
451+
The `resolution` parameter will be removed in librosa 0.5.0.
452+
453+
413454
Returns
414455
-------
415456
CQT : np.ndarray [shape=(n_bins, t), dtype=np.float]
@@ -418,10 +459,15 @@ def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
418459
Raises
419460
------
420461
ParameterError
421-
If `hop_length` is not an integer multiple of `2**(n_bins / bins_per_octave)`
462+
If `hop_length` is not an integer multiple of
463+
`2**(n_bins / bins_per_octave)`
422464
423465
'''
424466

467+
filter_scale = util.rename_kw('resolution', resolution,
468+
'filter_scale', filter_scale,
469+
'0.4.2', '0.5.0')
470+
425471
if fmin is None:
426472
# C1 by default
427473
fmin = note_to_hz('C1')
@@ -434,12 +480,11 @@ def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
434480
n_bins,
435481
bins_per_octave,
436482
tuning,
437-
resolution,
483+
filter_scale,
438484
norm,
439485
sparsity,
440486
hop_length=hop_length)
441487

442-
# Remove phase for Pseudo CQT
443488
fft_basis = np.abs(fft_basis)
444489

445490
# Compute the magnitude STFT with Hann window
@@ -450,15 +495,15 @@ def pseudo_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84,
450495

451496

452497
def __fft_filters(sr, fmin, n_bins, bins_per_octave, tuning,
453-
resolution, norm, sparsity, hop_length=None):
498+
filter_scale, norm, sparsity, hop_length=None):
454499
'''Generate the frequency domain constant-Q filter basis.'''
455500

456501
basis, lengths = filters.constant_q(sr,
457502
fmin=fmin,
458503
n_bins=n_bins,
459504
bins_per_octave=bins_per_octave,
460505
tuning=tuning,
461-
resolution=resolution,
506+
filter_scale=filter_scale,
462507
norm=norm,
463508
pad_fft=True)
464509

@@ -480,7 +525,7 @@ def __fft_filters(sr, fmin, n_bins, bins_per_octave, tuning,
480525
return fft_basis, n_fft, lengths
481526

482527

483-
def __trim_stack(cqt_resp, n_bins):
528+
def __trim_stack(cqt_resp, n_bins, real):
484529
'''Helper function to trim and stack a collection of CQT responses'''
485530

486531
# cleanup any framing errors at the boundaries
@@ -490,7 +535,11 @@ def __trim_stack(cqt_resp, n_bins):
490535

491536
# Finally, clip out any bottom frequencies that we don't really want
492537
# Transpose magic here to ensure column-contiguity
493-
return np.ascontiguousarray(cqt_resp[-n_bins:].T).T
538+
539+
C = np.ascontiguousarray(cqt_resp[-n_bins:].T).T
540+
if real:
541+
C = np.abs(C)
542+
return C
494543

495544

496545
def __variable_hop_response(y, n_fft, hop_length, min_filter_length,
@@ -515,7 +564,7 @@ def __variable_hop_response(y, n_fft, hop_length, min_filter_length,
515564
window=np.ones)
516565

517566
# And filter response energy
518-
my_cqt = np.abs(fft_basis.dot(D))
567+
my_cqt = fft_basis.dot(D)
519568

520569
if zoom_factor > 1:
521570
# We need to aggregate. Generate the boundary frames
@@ -532,9 +581,8 @@ def __early_downsample(y, sr, hop_length, res_type, n_octaves,
532581
if not (res_type == 'sinc_fastest' and audio._HAS_SAMPLERATE):
533582
return y, sr, hop_length
534583

535-
536-
downsample_count1 = int(np.ceil(np.log2(audio.BW_FASTEST * nyquist
537-
/ filter_cutoff)) - 1)
584+
downsample_count1 = int(np.ceil(np.log2(audio.BW_FASTEST * nyquist /
585+
filter_cutoff)) - 1)
538586
num_twos = __num_two_factors(hop_length)
539587
downsample_count2 = max(0, num_twos - n_octaves + 1)
540588
downsample_count = min(downsample_count1, downsample_count2)
@@ -565,4 +613,3 @@ def __num_two_factors(x):
565613
x //= 2
566614

567615
return num_twos
568-

0 commit comments

Comments
 (0)