forked from PlayVoice/whisper-vits-svc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathloudness.py
78 lines (55 loc) · 2.26 KB
/
loudness.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import warnings
import librosa
import numpy as np
import resampy
import torch
import crepe
###############################################################################
# Constants
###############################################################################
# Minimum decibel level
MIN_DB = -100.
# Reference decibel level
REF_DB = 20.
###############################################################################
# A-weighted loudness
###############################################################################
def a_weighted(audio, sample_rate, hop_length=None, pad=True):
"""Retrieve the per-frame loudness"""
# Save device
device = audio.device
# Default hop length of 10 ms
hop_length = sample_rate // 100 if hop_length is None else hop_length
# Convert to numpy
audio = audio.detach().cpu().numpy().squeeze(0)
# Resample
if sample_rate != crepe.SAMPLE_RATE:
audio = resampy.resample(audio, sample_rate, crepe.SAMPLE_RATE)
hop_length = int(hop_length * crepe.SAMPLE_RATE / sample_rate)
# Cache weights
if not hasattr(a_weighted, 'weights'):
a_weighted.weights = perceptual_weights()
# Take stft
stft = librosa.stft(audio,
n_fft=crepe.WINDOW_SIZE,
hop_length=hop_length,
win_length=crepe.WINDOW_SIZE,
center=pad,
pad_mode='constant')
# Compute magnitude on db scale
db = librosa.amplitude_to_db(np.abs(stft))
# Apply A-weighting
weighted = db + a_weighted.weights
# Threshold
weighted[weighted < MIN_DB] = MIN_DB
# Average over weighted frequencies
return torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None]
def perceptual_weights():
"""A-weighted frequency-dependent perceptual loudness weights"""
frequencies = librosa.fft_frequencies(sr=crepe.SAMPLE_RATE,
n_fft=crepe.WINDOW_SIZE)
# A warning is raised for nearly inaudible frequencies, but it ends up
# defaulting to -100 db. That default is fine for our purposes.
with warnings.catch_warnings():
warnings.simplefilter('ignore', RuntimeWarning)
return librosa.A_weighting(frequencies)[:, None] - REF_DB