[egs] Chime6: bug fix: sort utterances by their start time and end time (kaldi-asr#3953)

aarora8 · web-flow · commit a2573871ba18 · 2020-02-25T13:38:27.000+08:00
diff --git a/egs/chime6/s5_track2/RESULTS b/egs/chime6/s5_track2/RESULTS
@@ -14,5 +14,5 @@ Dev     57.15     83.96
 Eval    54.12     80.33
 
 # ASR nnet3 tdnn+chain
-Dev:  U06 %WER 81.18 [ 58881 / 47798, 1638 ins, 30528 del, 15632 sub ]
-Eval: U06 %WER 85.39 [ 55132 / 47076, 1107 ins, 27768 del, 18201 sub ]
+Dev:  %WER 84.33 [ 49653 / 58881, 1529 ins, 35813 del, 12311 sub ]
+Eval: %WER 78.08 [ 43046 / 55132, 957 ins, 32045 del, 10044 sub ]
diff --git a/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py b/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py
@@ -39,11 +39,18 @@ def main():
         combined_hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + '_comb'
         combined_hyp_writer = open(combined_hyp_file, 'w')
         utterances = sessionid_micid_speakerid_dict[sessionid_micid_speakerid]
-        text = ''
+        # Key each utterance by the start/end time part of its utterance ID so they can be emitted in sorted order below
+        sessionid_micid_speakerid_utterances={}
         for line in utterances:
             parts = line.strip().split()
+            utt_parts = parts[0].strip().split('-')
+            time ='-'.join(utt_parts[2:])
+            sessionid_micid_speakerid_utterances[time] = line
+        text = ''
+        for time_key in sorted(sessionid_micid_speakerid_utterances):
+            parts = sessionid_micid_speakerid_utterances[time_key].strip().split()
             text = text + ' ' + ' '.join(parts[1:])
-            hyp_writer.write(line)
+            hyp_writer.write(sessionid_micid_speakerid_utterances[time_key])
         combined_utterance = 'utt' + " " + text
         combined_hyp_writer.write(combined_utterance)
         combined_hyp_writer.write('\n')
diff --git a/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py b/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py
@@ -55,14 +55,21 @@ def main():
             spkrid_mapping[sessionid_speakerid.split('_')[1]]) + '_comb'
         combined_ref_writer = open(combined_ref_file, 'w')
         utterances = sessionid_speakerid_dict[sessionid_speakerid]
-        text = ''
-        uttid_wc = 'utt'
+        sessionid_speakerid_utterances = {}
+        # Key each utterance by the start/end time part of its utterance ID so they can be emitted in sorted order below
         for line in utterances:
             parts = line.strip().split()
+            utt_parts = parts[0].strip().split('-')
+            time ='-'.join(utt_parts[1:])
+            sessionid_speakerid_utterances[time] = line
+        text = ''
+        uttid_wc = 'utt'
+        for time_key in sorted(sessionid_speakerid_utterances):
+            parts = sessionid_speakerid_utterances[time_key].strip().split()
             uttid_id = parts[0]
             utt_text = ' '.join(parts[1:])
             text = text + ' ' + ' '.join(parts[1:])
-            ref_writer.write(line)
+            ref_writer.write(sessionid_speakerid_utterances[time_key])
             length = str(len(utt_text.split()))
             uttid_id_len = uttid_id + ":" + length
             uttid_wc = uttid_wc + ' ' + uttid_id_len