Handle the special tokens in scoring cer

Yibing Liu · Yibing Liu · commit 18db3cf77324 · 2018-07-04T18:50:08.000-07:00
diff --git a/fluid/DeepASR/score_error_rate.py b/fluid/DeepASR/score_error_rate.py
@@ -16,10 +16,18 @@ def parse_args():
         default='cer',
         choices=['cer', 'wer'],
         help="Error rate type. (default: %(default)s)")
+    parser.add_argument(
+        '--special_tokens',
+        type=str,
+        default='<SPOKEN_NOISE>',
+        help="Special tokens in scoring CER, seperated by space. "
+        "They shouldn't be splitted and should be treated as one special "
+        "character. Example: '<SPOKEN_NOISE> <bos> <eos>' "
+        "(default: %(default)s)")
     parser.add_argument(
         '--ref', type=str, required=True, help="The ground truth text.")
     parser.add_argument(
-        '--hyp', type=str, required=True, help="The decoding result.")
+        '--hyp', type=str, required=True, help="The decoding result text.")
     args = parser.parse_args()
     return args
 
@@ -31,6 +39,8 @@ def parse_args():
     sum_errors, sum_ref_len = 0.0, 0
     sent_cnt, not_in_ref_cnt = 0, 0
 
+    special_tokens = args.special_tokens.split(" ")
+
     with open(args.ref, "r") as ref_txt:
         line = ref_txt.readline()
         while line:
@@ -51,6 +61,8 @@ def parse_args():
                 continue
 
             if args.error_rate_type == 'cer':
+                for sp_tok in special_tokens:
+                    sent = sent.replace(sp_tok, '\0')
                 errors, ref_len = char_errors(
                     ref_dict[key].decode("utf8"),
                     sent.decode("utf8"),