@@ -88,18 +88,34 @@ def tune():
         augmentation_config='{}',
         specgram_type=args.specgram_type,
         num_threads=args.num_proc_data,
-        keep_transcription_text=True)
+        keep_transcription_text=True,
+        num_conv_layers=args.num_conv_layers)
 
     audio_data = paddle.layer.data(
         name="audio_spectrogram",
         type=paddle.data_type.dense_array(161 * 161))
     text_data = paddle.layer.data(
         name="transcript_text",
         type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
+    seq_offset_data = paddle.layer.data(
+        name='sequence_offset',
+        type=paddle.data_type.integer_value_sequence(1))
+    seq_len_data = paddle.layer.data(
+        name='sequence_length',
+        type=paddle.data_type.integer_value_sequence(1))
+    index_range_datas = []
+    for i in xrange(args.num_rnn_layers):
+        index_range_datas.append(
+            paddle.layer.data(
+                name='conv%d_index_range' % i,
+                type=paddle.data_type.dense_vector(6)))
 
     output_probs, _ = deep_speech_v2_network(
         audio_data=audio_data,
         text_data=text_data,
+        seq_offset_data=seq_offset_data,
+        seq_len_data=seq_len_data,
+        index_range_datas=index_range_datas,
         dict_size=data_generator.vocab_size,
         num_conv_layers=args.num_conv_layers,
         num_rnn_layers=args.num_rnn_layers,
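The names of the new data layers ('sequence_offset', 'sequence_length', 'conv%d_index_range') have to match keys in the feeding dict exposed by the data generator; in the paddle.v2 API that dict tells the inferer which field of each reader instance is routed to which data layer. A minimal sketch of such a mapping, with an assumed column order (the real one comes from data_generator.feeding):

    # Hypothetical feeding map, for illustration only; the actual mapping is
    # provided by data_generator.feeding and should not be hard-coded.
    feeding = {
        'audio_spectrogram': 0,   # padded spectrogram
        'transcript_text': 1,     # label id sequence
        'sequence_offset': 2,     # start offset of the valid frames
        'sequence_length': 3,     # valid frame count (cf. infer_data[i][3] below)
        'conv0_index_range': 4,   # one entry per 'conv%d_index_range' layer
        'conv1_index_range': 5,
    }
    # Passed as inferer.infer(input=infer_data, feeding=feeding), it routes field k
    # of every instance to the data layer whose name maps to k.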
@@ -156,15 +172,17 @@ def tune():
     for infer_data in batch_reader():
         if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
             break
-        infer_results = inferer.infer(input=infer_data)
-
-        num_steps = len(infer_results) // len(infer_data)
+        infer_results = inferer.infer(input=infer_data,
+                                      feeding=data_generator.feeding)
+        start_pos = [0] * (len(infer_data) + 1)
+        for i in xrange(len(infer_data)):
+            start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
         probs_split = [
-            infer_results[i * num_steps:(i + 1) * num_steps]
-            for i in xrange(len(infer_data))
+            infer_results[start_pos[i]:start_pos[i + 1]]
+            for i in xrange(0, len(infer_data))
         ]
 
-        target_transcripts = [transcript for _, transcript in infer_data]
+        target_transcripts = [data[1] for data in infer_data]
 
         num_ins += len(target_transcripts)
         # grid search
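The old splitting assumed every utterance in a batch produced the same number of output time steps (num_steps); with padded batches the counts differ per utterance, so the new code accumulates the per-utterance lengths stored in field 3 of each instance into start_pos and slices the flat infer_results with those offsets. A self-contained sketch of that prefix-sum split with made-up lengths (each probability row reduced to a single float for brevity):

    # Toy illustration of the prefix-sum split used above; not the real inferer output.
    infer_results = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]  # 6 flattened time steps
    seq_lens = [2, 1, 3]                            # per-utterance step counts
    start_pos = [0] * (len(seq_lens) + 1)
    for i in xrange(len(seq_lens)):
        start_pos[i + 1] = start_pos[i] + seq_lens[i]
    probs_split = [
        infer_results[start_pos[i]:start_pos[i + 1]]
        for i in xrange(len(seq_lens))
    ]
    assert probs_split == [[0.1, 0.2], [0.3], [0.4, 0.5, 0.6]]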