@@ -858,6 +858,162 @@ def compute_deltas(specgram, win_length=5, mode="replicate"):
858
858
return output
859
859
860
860
861
def gain(waveform, gain_db=1.0):
    # type: (Tensor, float) -> Tensor
    r"""Amplify or attenuate an entire waveform by a gain expressed in decibels.

    Args:
        waveform (torch.Tensor): Tensor of audio of dimension (channel, time).
        gain_db (float): Gain adjustment in decibels (dB) (Default: `1.0`).

    Returns:
        torch.Tensor: The waveform multiplied by ``10 ** (gain_db / 20)``.
        When ``gain_db`` is zero the input tensor itself is returned unchanged.
    """
    if gain_db != 0:
        # Convert the decibel figure into a linear amplitude factor.
        amplitude_ratio = 10 ** (gain_db / 20)
        return waveform * amplitude_ratio

    # A gain of 0 dB is the identity, so skip the multiplication entirely.
    return waveform
878
+
879
+
880
def scale_to_interval(waveform, interval_max=1.0):
    # type: (Tensor, float) -> Tensor
    r"""Scale the waveform to the interval [-interval_max, interval_max] across all dimensions.

    The largest absolute value found anywhere in the tensor is mapped onto
    ``interval_max`` and every other sample is scaled by the same factor.
    The input tensor is never modified; a new tensor is returned.

    Args:
        waveform (torch.Tensor): Tensor of audio of dimension (channel, time).
        interval_max (float): The bounds of the interval, where the float indicates
            the upper bound and the negative of the float indicates the lower
            bound (Default: `1.0`).
            Example: interval_max=1.0 -> [-1.0, 1.0]

    Returns:
        torch.Tensor: the whole waveform scaled to interval. An empty or
        all-zero waveform has no peak to scale by and is returned as an
        unscaled copy.
    """
    # torch.max cannot reduce an empty tensor, and there is nothing to scale.
    if waveform.numel() == 0:
        return waveform.clone()

    abs_max = torch.max(torch.abs(waveform))

    # A silent signal would otherwise be divided by zero and turn into NaN.
    if abs_max == 0:
        return waveform.clone()

    ratio = abs_max / interval_max

    # Out-of-place division keeps the caller's tensor intact.
    return waveform / ratio
899
+
900
+
901
def _add_noise_shaping(dithered_waveform, waveform):
    # type: (Tensor, Tensor) -> Tensor
    r"""Add the previous sample's quantisation error to each dithered sample.

    Noise shaping is calculated by error:
        error[n] = dithered[n] - original[n]
        noise_shaped_waveform[n] = dithered[n] + error[n-1]

    The first sample of every row has no predecessor, so nothing is added to it.

    Args:
        dithered_waveform (torch.Tensor): Dithered audio; all leading
            dimensions are flattened into rows, the last dimension is time.
        waveform (torch.Tensor): The audio before dithering, with the same
            number of elements per row as ``dithered_waveform``.

    Returns:
        torch.Tensor: The noise-shaped waveform, with the shape of
        ``dithered_waveform``.
    """
    wf_shape = waveform.size()
    waveform = waveform.reshape(-1, wf_shape[-1])

    dithered_shape = dithered_waveform.size()
    dithered_waveform = dithered_waveform.reshape(-1, dithered_shape[-1])

    error = dithered_waveform - waveform

    # Delay the error by one sample along time: prepend a zero column and drop
    # the last one. new_zeros inherits dtype and device from `error`, so this
    # also works for non-default dtypes and for tensors living on an accelerator.
    num_frames = error.size(1)
    leading_zero = error.new_zeros([error.size(0), 1])
    delayed_error = torch.cat((leading_zero, error), dim=1)[:, :num_frames]

    noise_shaped = dithered_waveform + delayed_error
    return noise_shaped.reshape(dithered_shape)
922
+
923
+
924
def _sample_index(upper):
    # type: (int) -> int
    r"""Draw a random integer from ``[0, upper)``; return 0 when ``upper`` is not positive."""
    if upper > 0:
        return int(torch.randint(upper, [1, ]).item())
    return 0


def probability_distribution(waveform, density_function="TPDF"):
    # type: (Tensor, str) -> Tensor
    r"""Apply a probability distribution function on a waveform.

    The waveform is scaled up to a 16-bit range, an offset chosen by
    ``density_function`` is added, the result is rounded to whole numbers and
    scaled back down. The input tensor is not modified.

    Triangular probability density function (TPDF) dither noise has a
    triangular distribution; values in the center of the range have a higher
    probability of occurring. Here it is a Bartlett window spanning the time
    axis, repeated for every channel.

    Rectangular probability density function (RPDF) dither noise has a
    uniform distribution; any value in the specified range has the same
    probability of occurring. Here it is one randomly picked sample of the
    waveform minus 0.5, added to every scaled sample.

    Gaussian probability density function (GPDF) has a normal distribution.
    The relationship of probabilities of results follows a bell-shaped,
    or Gaussian curve, typical of dither generated by analog sources. Here it
    is the sum of seven randomly picked samples of the waveform, added to
    every scaled sample.

    Args:
        waveform (torch.Tensor): Tensor of audio of dimension (channel, time)
        density_function (string): The density function of a
            continuous random variable (Default: `TPDF`)
            Options: Triangular Probability Density Function - `TPDF`
                     Rectangular Probability Density Function - `RPDF`
                     Gaussian Probability Density Function - `GPDF`
            Any other value is treated as `TPDF`.

    Returns:
        torch.Tensor: waveform dithered with the selected density function,
        in the same shape as the input.
    """
    shape = waveform.size()
    waveform = waveform.reshape(-1, shape[-1])

    # Both sizes are "count minus one"; they double as the exclusive upper
    # bound for the random indices drawn below.
    channel_size = waveform.size()[0] - 1
    time_size = waveform.size()[-1] - 1

    random_channel = _sample_index(channel_size)
    random_time = _sample_index(time_size)

    number_of_bits = 16
    up_scaling = 2 ** (number_of_bits - 1) - 2
    signal_scaled = waveform * up_scaling
    down_scaling = 2 ** (number_of_bits - 1)

    if density_function == "RPDF":
        RPDF = waveform[random_channel][random_time] - 0.5

        signal_scaled_dis = signal_scaled + RPDF
    elif density_function == "GPDF":
        # TODO Replace by distribution code once
        # https://github.com/pytorch/pytorch/issues/29843 is resolved
        # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample()

        num_rand_variables = 6

        # Accumulate out of place: waveform[i][j] is a view, so an in-place
        # add would write the running sum back into the caller's tensor.
        gaussian = waveform[random_channel][random_time]
        for _ in range(num_rand_variables):
            # _sample_index falls back to 0 for mono or single-sample input,
            # where torch.randint would reject an upper bound of 0.
            rand_chan = _sample_index(channel_size)
            rand_time = _sample_index(time_size)
            gaussian = gaussian + waveform[rand_chan][rand_time]

        signal_scaled_dis = signal_scaled + gaussian
    else:
        # Build the window on the waveform's device so the addition below
        # also works for tensors that do not live on the default device.
        TPDF = torch.bartlett_window(time_size + 1, device=waveform.device)
        TPDF = TPDF.repeat((channel_size + 1), 1)
        signal_scaled_dis = signal_scaled + TPDF

    quantised_signal_scaled = torch.round(signal_scaled_dis)
    quantised_signal = quantised_signal_scaled / down_scaling
    return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:])
989
+
990
+
991
def dither(waveform, density_function="TPDF", noise_shaping=False):
    # type: (Tensor, str, bool) -> Tensor
    r"""Dither increases the perceived dynamic range of audio stored at a
    particular bit-depth by eliminating nonlinear truncation distortion
    (i.e. adding minimally perceived noise to mask distortion caused by quantization).

    Args:
        waveform (torch.Tensor): Tensor of audio of dimension (channel, time)
        density_function (string): The density function of a
            continuous random variable (Default: `TPDF`)
            Options: Triangular Probability Density Function - `TPDF`
                     Rectangular Probability Density Function - `RPDF`
                     Gaussian Probability Density Function - `GPDF`
        noise_shaping (boolean): a filtering process that shapes the spectral
            energy of quantisation error (Default: `False`)

    Returns:
        torch.Tensor: waveform dithered
    """
    quantised = probability_distribution(waveform, density_function=density_function)

    # Without noise shaping the dithered signal is already the final result.
    if not noise_shaping:
        return quantised

    # Feed the quantisation error (dithered minus original) back into the signal.
    return _add_noise_shaping(quantised, waveform)
1015
+
1016
+
861
1017
def _compute_nccf (waveform , sample_rate , frame_time , freq_low ):
862
1018
# type: (Tensor, int, float, int) -> Tensor
863
1019
r"""
0 commit comments