-
Notifications
You must be signed in to change notification settings - Fork 21
/
stream.py
366 lines (302 loc) · 13.5 KB
/
stream.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import os
import abc
import yaml
import time
import queue
import threading
import torch
import torchaudio
import numpy as np
from typing import Union
class AudioCodec(abc.ABC):
    """
    Base class holding the encoder/decoder models of a streaming audio codec.

    Subclasses implement `_load_encoder` and `_load_decoder`, which build a model
    from a checkpoint path. This class moves the loaded models to the configured
    devices, switches them to eval mode, and calls their streaming initializers
    (`initial_encoder` / `initial_decoder`).
    """

    def __init__(
        self,
        tx_device: str = "cpu",
        rx_device: str = "cpu",
        receptive_length: int = 8192,
    ):
        """
        :param tx_device: device the transmitter-side encoder is moved to
        :param rx_device: device the receiver-side encoder and decoder are moved to
        :param receptive_length: value forwarded to `initial_encoder` of both encoders
        """
        self.tx_device = tx_device
        self.rx_device = rx_device
        self.receptive_length = receptive_length
        # models stay None until load_transmitter() / load_receiver() is called
        self.tx_encoder = None
        self.rx_encoder = None
        self.decoder = None

    @abc.abstractmethod
    def _load_encoder(self, checkpoint):
        """Build and return the encoder model stored at `checkpoint`."""
        pass

    @abc.abstractmethod
    def _load_decoder(self, checkpoint):
        """Build and return the decoder model stored at `checkpoint`."""
        pass

    @staticmethod
    def _require_checkpoint(checkpoint):
        """
        Raise FileNotFoundError if `checkpoint` does not exist.

        An explicit raise is used instead of `assert` so the check is still
        performed when Python runs with -O (which strips assert statements).
        """
        if not os.path.exists(checkpoint):
            raise FileNotFoundError(f'{checkpoint} does not exist!')

    def _load_config(self, checkpoint, config_name='config.yml'):
        """
        Load the YAML config that lives in the same directory as `checkpoint`.

        :param checkpoint: path of a checkpoint file; only its directory is used
        :param config_name: file name of the config inside that directory
        :return: the parsed YAML document
        """
        config_path = os.path.join(os.path.dirname(checkpoint), config_name)
        # explicit encoding: the platform default is not UTF-8 everywhere
        with open(config_path, encoding="utf-8") as f:
            # NOTE(review): yaml.Loader can construct arbitrary Python objects.
            # Only load configs from trusted checkpoints; switch to
            # yaml.safe_load if the configs never rely on Python-specific tags.
            config = yaml.load(f, Loader=yaml.Loader)
        return config

    def load_transmitter(self, encoder_checkpoint):
        """
        Load the transmitter-side encoder and prepare it for streaming.

        :param encoder_checkpoint: path of the encoder checkpoint
        :raises FileNotFoundError: if `encoder_checkpoint` does not exist
        """
        self._require_checkpoint(encoder_checkpoint)
        self.tx_encoder = self._load_encoder(encoder_checkpoint)
        self.tx_encoder.eval().to(self.tx_device)
        self.tx_encoder.initial_encoder(self.receptive_length, self.tx_device)
        print(f"Load tx_encoder: {encoder_checkpoint}")

    def load_receiver(self, encoder_checkpoint, decoder_checkpoint):
        """
        Load the receiver-side encoder and decoder and prepare them for streaming.

        :param encoder_checkpoint: path of the encoder checkpoint
        :param decoder_checkpoint: path of the decoder checkpoint
        :raises FileNotFoundError: if either checkpoint does not exist
        """
        # validate both paths up front so a missing decoder checkpoint is
        # reported before any model is loaded
        self._require_checkpoint(encoder_checkpoint)
        self._require_checkpoint(decoder_checkpoint)

        self.rx_encoder = self._load_encoder(encoder_checkpoint)
        self.rx_encoder.eval().to(self.rx_device)
        # the value returned by the encoder initializer seeds the decoder
        zq = self.rx_encoder.initial_encoder(self.receptive_length, self.rx_device)
        print(f"Load rx_encoder: {encoder_checkpoint}")

        self.decoder = self._load_decoder(decoder_checkpoint)
        self.decoder.eval().to(self.rx_device)
        self.decoder.initial_decoder(zq)
        print(f"Load decoder: {decoder_checkpoint}")
class AudioCodecStreamer(abc.ABC):
    """
    Streams audio from an input microphone to headphones/speakers.

    Each captured frame is passed through an encoder thread and a decoder thread.
    For each model that is provided (encoder, decoder), the frame is processed by
    the subclass hooks `_encode` / `_decode`; otherwise it is passed through unchanged.

    This class is abstract: subclasses must implement `_encode` and `_decode`.

    Main functions (see function definition for detailed documentation):
    * __init__
    * enable_filedump
    * run

    Example usage (MyStreamer is a subclass implementing _encode/_decode):
    streamer = MyStreamer(
        input_device=1,
        output_device=4,
        frame_size=512,
        tx_encoder=my_tx_encoder_network,
        tx_device="cuda:0",
        rx_encoder=my_rx_encoder_network,
        decoder=my_decoder_network,
        rx_device="cuda:1",
    )
    streamer.enable_filedump(input_stream_file="input.wav", output_stream_file="output.wav")
    streamer.run()
    """

    def __init__(
        self,
        input_device: Union[str, int],
        output_device: Union[str, int],
        input_channels: int = 1,
        output_channels: int = 1,
        frame_size: int = 512,
        sample_rate: int = 48000,
        gain: float = 1.0,
        max_latency: float = 0.1,
        # Transmitter params
        tx_encoder=None,
        tx_device: str = "cpu",
        # Receiver params
        rx_encoder=None,
        decoder=None,
        rx_device: str = "cpu",
    ):
        """
        Sounddevice parameters
        :param input_device: int or str, name or index of the input device.
            To get a list of all input devices call python3 -m sounddevice.
        :param output_device: int or str, name or index of the output device.
            To get a list of all output devices call python3 -m sounddevice.
        :param input_channels: number of input channels, usually 1 but might be multiple microphones as well
        :param output_channels: number of output channels
        :param frame_size: number of audio samples in a frame
        :param sample_rate: sample rate of the audio signal
        :param gain: linear factor to scale the input audio by
        :param max_latency: maximal accepted latency in seconds before frames get dropped
        #######
        Transmitter parameters
        :param tx_encoder: encoder network on the transmitter side
            Is an instance of torch.nn.Module and must be fully initialized and loaded.
            Must have a forward function that expects a batch x input_channels x frame_size tensor as input.
            Default: None (input tensor is forwarded to the decoder stage without change)
        :param tx_device: device on transmitter (cpu, cuda:0, cuda:1, ...)
        #######
        Receiver parameters
        :param rx_encoder: encoder network on the receiver side
            Is an instance of torch.nn.Module and must be fully initialized and loaded.
            Must have a forward function that expects a batch x input_channels x frame_size tensor as input.
            Default: None (decoding is skipped; the tensor is forwarded to the output without change)
        :param decoder: decoder network
            Is an instance of torch.nn.Module and must be fully initialized and loaded.
            Must have a forward function that expects a tensor of the shape produced by the encoder.
            Default: None (decoding is skipped; the tensor is forwarded to the output without change)
        :param rx_device: device on receiver (cpu, cuda:0, cuda:1, ...)
        """
        self.input_device = input_device
        self.output_device = output_device
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.frame_size = frame_size
        self.sample_rate = sample_rate
        self.gain = gain
        self.max_latency = max_latency

        # encoder
        self.tx_encoder = tx_encoder
        self.tx_device = tx_device
        print(f'Encoder device: {tx_device}')

        # decoder
        self.rx_encoder = rx_encoder
        self.decoder = decoder
        self.rx_device = rx_device
        print(f'Decoder device: {rx_device}')

        # queues for encoder, decoder, and output
        self.encoder_queue = queue.Queue()
        self.decoder_queue = queue.Queue()
        self.output_queue = queue.Queue()

        # file dump if requested
        self.input_dump = []
        self.output_dump = []
        self.input_dump_filename = None
        self.output_dump_filename = None

        # streaming statistics
        self.frame_drops = 0
        self.n_frames = 0
        self.encoder_times = []
        self.decoder_times = []
        self.latency_queue = queue.Queue()
        self.latencies = []

    @abc.abstractmethod
    def _encode(self, x):
        """Encode one frame tensor; called only when tx_encoder is set."""
        pass

    @abc.abstractmethod
    def _decode(self, x):
        """Decode one frame tensor; called only when rx_encoder and decoder are both set."""
        pass

    def _run_encoder(self):
        """Worker loop: move frames from encoder_queue through `_encode` into decoder_queue."""
        while threading.main_thread().is_alive():
            try:
                # the timeout lets the loop re-check whether the main thread is still alive
                x = self.encoder_queue.get(timeout=1)
            except queue.Empty:
                continue
            start = time.time()
            x = x.to(self.tx_device)
            with torch.no_grad():
                if self.tx_encoder is not None:
                    x = self._encode(x)
            self.encoder_times.append(time.time() - start)
            self.decoder_queue.put(x)

    def _run_decoder(self):
        """Worker loop: move frames from decoder_queue through `_decode` into output_queue."""
        while threading.main_thread().is_alive():
            try:
                # the timeout lets the loop re-check whether the main thread is still alive
                x = self.decoder_queue.get(timeout=1)
            except queue.Empty:
                continue
            start = time.time()
            x = x.to(self.rx_device)
            with torch.no_grad():
                if (self.rx_encoder is not None) and (self.decoder is not None):
                    x = self._decode(x)
            self.decoder_times.append(time.time() - start)
            self.output_queue.put(x)

    def _process(self, data):
        """
        Submit one captured frame for processing and return one frame to play.

        :param data: numpy array of shape frame_size x channels (transposed to
            channels x frame_size before being queued)
        :return: numpy array of shape frame_size x channels; silence (zeros) if no
            processed frame is available yet
        """
        data = data * self.gain
        input_data = torch.from_numpy(data).transpose(1, 0).contiguous()  # channels x frame_size
        if self.input_dump_filename is not None:
            self.input_dump.append(input_data)

        # add batch dimension
        input_data = input_data.unsqueeze(0)

        # hand the frame to the encoder thread and remember when it was submitted
        self.encoder_queue.put(input_data)
        self.latency_queue.put(time.time())

        try:
            output_data = self.output_queue.get_nowait()
            latency = time.time() - self.latency_queue.get_nowait()
            self.latencies.append(latency)
            # clear queues if latency gets too high; this will lead to frame drops
            if latency > self.max_latency:
                self.encoder_queue.queue.clear()
                self.decoder_queue.queue.clear()
                self.output_queue.queue.clear()
                # every pending timestamp corresponds to one dropped frame
                while not self.latency_queue.empty():
                    self.frame_drops += 1
                    self.latency_queue.get_nowait()
        except queue.Empty:
            # nothing processed yet: play silence
            output_data = torch.zeros(1, self.output_channels, self.frame_size)

        output_data = output_data.squeeze(0).detach().cpu()
        self.n_frames += 1
        if self.output_dump_filename is not None:
            self.output_dump.append(output_data)

        data = output_data.transpose(1, 0).contiguous().numpy()
        return data

    def _callback(self, indata, outdata, frames, _time, status):
        """Stream callback: print any status flag and fill `outdata` from `_process(indata)`."""
        if status:
            print(status)
        outdata[:] = self._process(indata)

    def _dump_audio(self, filename, frames):
        """Concatenate `frames` along the last axis, clamp to [-1, 1], and save to `filename`."""
        if filename is None:
            return
        if not frames:
            # torch.cat raises on an empty list; nothing was captured, so nothing to write
            print(f"No audio frames captured; skipping dump to {filename}")
            return
        audio = torch.clamp(torch.cat(frames, dim=-1), min=-1, max=1)
        torchaudio.save(filename, audio, self.sample_rate)

    @staticmethod
    def _mean_std_ms(samples):
        """Return (mean, std) of `samples` given in seconds, in milliseconds; (nan, nan) if empty."""
        values = np.asarray(list(samples), dtype=np.float64) * 1000.0
        if values.size == 0:
            return float("nan"), float("nan")
        return float(np.mean(values)), float(np.std(values))

    def _exit(self):
        """Write requested audio dumps and print timing, latency, and frame-drop statistics."""
        # dump data to file if required
        self._dump_audio(self.input_dump_filename, self.input_dump)
        self._dump_audio(self.output_dump_filename, self.output_dump)

        # compute statistics (list() inside the helper snapshots each list, since
        # the worker threads may still be appending)
        encoder_mean, encoder_std = self._mean_std_ms(self.encoder_times)
        decoder_mean, decoder_std = self._mean_std_ms(self.decoder_times)
        latency_mean, latency_std = self._mean_std_ms(self.latencies)
        # avoid ZeroDivisionError when the stream ended before any frame was processed
        frame_drops_ratio = self.frame_drops / self.n_frames if self.n_frames else 0.0

        # print statistics
        print('#' * 80)
        print(f"encoder processing time (ms): {encoder_mean:.2f} +- {encoder_std:.2f}")
        print(f"decoder processing time (ms): {decoder_mean:.2f} +- {decoder_std:.2f}")
        print(f"system latency (ms): {latency_mean:.2f} +- {latency_std:.2f}")
        print(f"frame drops: {self.frame_drops} ({frame_drops_ratio * 100:.2f}%)")
        print('#' * 80)

    def enable_filedump(self, input_stream_file: str = None, output_stream_file: str = None):
        """
        dumps input/output audio to file if input/output filenames are specified
        call this function before run()
        :param input_stream_file: name of the file to dump input audio to
        :param output_stream_file: name of the file to dump output audio to
        at least one of the files needs to be specified; ".wav" is appended to names
        that do not already end with it
        :raises ValueError: if neither file name is given
        """
        if input_stream_file is None and output_stream_file is None:
            raise ValueError("At least one of input_stream_file and output_stream_file must be specified.")
        if input_stream_file is not None:
            if not input_stream_file.endswith(".wav"):
                input_stream_file += ".wav"
            self.input_dump_filename = input_stream_file
        if output_stream_file is not None:
            if not output_stream_file.endswith(".wav"):
                output_stream_file += ".wav"
            self.output_dump_filename = output_stream_file

    def run(self, latency=None):
        """
        start streaming from the input device and forward the processed audio to the output device
        prints statistics about mean processing time, standard deviation of each processing pass,
        and the percentage of dropped frames
        :param latency: forwarded unchanged to sounddevice.Stream(latency=...);
            None leaves the choice to sounddevice's defaults
        """
        # start encoder and decoder threads (daemon: they must not keep the process alive)
        encoder_thread = threading.Thread(target=self._run_encoder, daemon=True)
        encoder_thread.start()
        decoder_thread = threading.Thread(target=self._run_decoder, daemon=True)
        decoder_thread.start()

        try:
            # imported here so the module can be imported without sounddevice installed
            import sounddevice as sd

            with sd.Stream(
                device=(self.input_device, self.output_device),
                samplerate=self.sample_rate,
                blocksize=self.frame_size,
                dtype=np.float32,
                latency=latency,
                channels=(self.input_channels, self.output_channels),
                callback=self._callback,
            ):
                print('### starting stream [press Return to quit] ###')
                input()
                self._exit()
        except KeyboardInterrupt:
            self._exit()
        except Exception as e:
            print(type(e).__name__ + ': ' + str(e))