1+ '''
2+ Requirements:
3+ + pyaudio - `pip install pyaudio`
4+ + py-webrtcvad - `pip install webrtcvad`
5+ '''
6+ import webrtcvad
7+ import collections
8+ import sys
9+ import signal
10+ import pyaudio
11+
12+ from array import array
13+ from struct import pack
14+ import wave
15+ import time
16+
# ---------------------------------------------------------------------------
# Capture configuration and module-level audio objects.
# Everything below runs at import time, including opening the input stream.
# ---------------------------------------------------------------------------
FORMAT = pyaudio.paInt16  # sample format passed to PyAudio when opening the stream
CHANNELS = 1
RATE = 16000  # sample rate passed to the stream, the VAD and the WAV header
CHUNK_DURATION_MS = 30  # supports 10, 20 and 30 (ms)
# NOTE(review): the original comment here read "1 sec judgement", but the
# value is 1500 ms -- confirm which one was intended.
PADDING_DURATION_MS = 1500
CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)  # samples read per chunk
CHUNK_BYTES = CHUNK_SIZE * 2  # 16bit = 2 bytes, PCM (not referenced in the code visible here)
# Used only as the maxlen of the recording loop's `ring_buffer` deque.
NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)
# NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)
# Number of 30 ms chunks in a 400 ms window; length of the flag window used
# for start-of-speech detection.
NUM_WINDOW_CHUNKS = int(400 / CHUNK_DURATION_MS)
# The end-of-speech detector uses a flag window twice as long.
NUM_WINDOW_CHUNKS_END = NUM_WINDOW_CHUNKS * 2

# NOTE(review): this multiplies milliseconds by samples-per-second without
# dividing by 1000, so it is probably not the sample offset it looks like.
# It is not referenced anywhere in the code visible here -- confirm before use.
START_OFFSET = int(NUM_WINDOW_CHUNKS * CHUNK_DURATION_MS * 0.5 * RATE)

# Voice-activity detector, constructed with mode argument 1
# (presumably the aggressiveness level; see the py-webrtcvad docs).
vad = webrtcvad.Vad(1)

pa = pyaudio.PyAudio()
# Open the input stream without starting it (start=False); the recording
# loop calls start_stream() / stop_stream() itself.
stream = pa.open(format=FORMAT,
                 channels=CHANNELS,
                 rate=RATE,
                 input=True,
                 start=False,
                 # input_device_index=2,
                 frames_per_buffer=CHUNK_SIZE)


# Loop-control flags shared between the SIGINT handler and the recording loop.
got_a_sentence = False  # ends the inner per-chunk loop
leave = False  # ends the outer loop
45+
46+
def handle_int(sig, chunk):
    """Signal handler that raises both loop-exit flags.

    Installed for SIGINT; setting ``got_a_sentence`` and ``leave`` makes the
    inner and outer recording loops finish on their next condition check.
    Both arguments are supplied by the ``signal`` machinery and are ignored.
    """
    global got_a_sentence, leave
    got_a_sentence = leave = True
51+
52+
def record_to_file(path, data, sample_width, rate=None):
    """Write already-captured PCM samples to a mono WAV file.

    Args:
        path: Destination passed straight to ``wave.open(path, 'wb')``.
        data: Sequence of integers; each one is packed as a little-endian
            signed 16-bit sample.
        sample_width: Bytes per sample written to the WAV header. The samples
            are always packed as 16-bit values, so this should be 2.
        rate: Sample rate in Hz for the WAV header. Defaults to the
            module-level ``RATE`` when omitted.

    Raises:
        struct.error: If a sample does not fit in a signed 16-bit integer.
    """
    if rate is None:
        rate = RATE
    # sample_width, data = record()
    # Pack before opening the file so a bad sample does not leave a
    # truncated file behind; a repeat count avoids building an n-char format.
    frames = pack('<%dh' % len(data), *data)
    # The context manager closes the file even if a setter or write raises.
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(frames)
63+
64+
def normalize(snd_data):
    """Peak-normalize 16-bit samples.

    Every sample is multiplied by ``32767 / peak``, where ``peak`` is the
    largest absolute sample value, and truncated toward zero.

    Args:
        snd_data: Re-iterable sequence of integer samples (it is traversed
            twice, so a one-shot generator will not work).

    Returns:
        A new ``array('h')`` with the scaled samples. Empty or all-zero
        input is returned as an unscaled copy, because there is no peak to
        scale against.
    """
    MAXIMUM = 32767  # 16384
    peak = max((abs(sample) for sample in snd_data), default=0)
    if peak == 0:
        # Silence or no data: avoid dividing by zero.
        return array('h', snd_data)
    times = float(MAXIMUM) / peak
    return array('h', (int(sample * times) for sample in snd_data))
73+
# Ctrl-C sets the loop-exit flags instead of raising KeyboardInterrupt, so
# whatever has been captured so far is still written out.
signal.signal(signal.SIGINT, handle_int)

try:
    while not leave:
        triggered = False

        # Circular windows of per-chunk VAD decisions (1 = speech, 0 = not):
        # a short one for start detection and a longer one for end detection.
        ring_buffer_flags = [0] * NUM_WINDOW_CHUNKS
        ring_buffer_index = 0

        ring_buffer_flags_end = [0] * NUM_WINDOW_CHUNKS_END
        ring_buffer_index_end = 0

        # WangS
        raw_data = array('h')  # every sample captured in this session
        index = 0              # number of samples captured so far
        start_point = 0        # sample offset where the kept audio begins
        start_time = time.time()
        print("* recording: ")
        stream.start_stream()

        while not got_a_sentence and not leave:
            chunk = stream.read(CHUNK_SIZE)
            # add WangS
            raw_data.extend(array('h', chunk))
            index += CHUNK_SIZE
            elapsed = time.time() - start_time

            active = vad.is_speech(chunk, RATE)

            # Live progress display: one character per chunk.
            sys.stdout.write('1' if active else '_')
            flag = 1 if active else 0

            ring_buffer_flags[ring_buffer_index] = flag
            ring_buffer_index += 1
            ring_buffer_index %= NUM_WINDOW_CHUNKS

            ring_buffer_flags_end[ring_buffer_index_end] = flag
            ring_buffer_index_end += 1
            ring_buffer_index_end %= NUM_WINDOW_CHUNKS_END

            # start point detection
            if not triggered:
                num_voiced = sum(ring_buffer_flags)
                if num_voiced > 0.8 * NUM_WINDOW_CHUNKS:
                    sys.stdout.write(' Open ')
                    triggered = True
                    # Keep 20 chunks of audio before the trigger. If fewer
                    # have been captured, keep everything (clamp to 0).
                    start_point = max(0, index - CHUNK_SIZE * 20)
            # end point detection
            else:
                num_unvoiced = NUM_WINDOW_CHUNKS_END - sum(ring_buffer_flags_end)
                # NOTE: the 10 s limit is only checked once speech has
                # started; before that the loop waits indefinitely.
                if num_unvoiced > 0.90 * NUM_WINDOW_CHUNKS_END or elapsed > 10:
                    sys.stdout.write(' Close ')
                    triggered = False
                    got_a_sentence = True

            sys.stdout.flush()

        sys.stdout.write('\n ')

        stream.stop_stream()
        print("* done recording")
        got_a_sentence = False

        # write to file
        # Drop everything before the detected start of speech.
        del raw_data[:start_point]
        if len(raw_data) > 0:
            # Only normalize when there is a non-zero sample; pure silence
            # has no peak to scale against.
            if any(raw_data):
                raw_data = normalize(raw_data)
            record_to_file("recording.wav", raw_data, 2)
        else:
            print("* nothing recorded, recording.wav not written")
        # One recording per run.
        leave = True
finally:
    # Release the audio device even if reading or writing failed.
    stream.close()
    pa.terminate()
0 commit comments