Skip to content

添加新规则报错,_pywrapfst.FstOpError: Operation failed #159

@HankLiu10

Description

@HankLiu10

你好,我尝试为ITN加入新的基本规则,运行后报错, 麻烦看看能否解决,谢谢!:

其中,添加了WeTextProcessing/itn/chinese/data/dataflow/flow.tsv如下:

兆	M
g	G

另添加了WeTextProcessing/itn/chinese/rules/dataflow.py如下:

from itn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor

from pynini import string_file
from pynini.lib.pynutil import delete, insert


class DataFlow(Processor):

    def __init__(self, enable_0_to_9=True):
        super().__init__(name='dataflow')
        self.enable_0_to_9 = enable_0_to_9
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        flow = string_file('itn/chinese/data/dataflow/flow.tsv')
        digit = string_file('itn/chinese/data/number/digit.tsv')  # 1 ~ 9

        number = Cardinal().number if self.enable_0_to_9 else \
            Cardinal().number_exclude_0_to_9
        number |= digit
        tagger = (insert('value: "') + number + insert('"') +
                  insert(' dataflow: "') + flow + insert('"'))
        self.tagger = self.add_tokens(tagger)

    def build_verbalizer(self):
        dataflow = delete('dataflow: "') + self.SIGMA + delete('"')
        value = delete(' value: "') + self.SIGMA + delete('"')
        verbalizer = value + dataflow
        self.verbalizer = self.delete_tokens(verbalizer)

同时,修改了WeTextProcessing/itn/chinese/inverse_normalizer.py 的内容如下:

from tn.processor import Processor
from itn.chinese.rules.cardinal import Cardinal
from itn.chinese.rules.char import Char
from itn.chinese.rules.date import Date
from itn.chinese.rules.fraction import Fraction
from itn.chinese.rules.math import Math
from itn.chinese.rules.measure import Measure
from itn.chinese.rules.money import Money
from itn.chinese.rules.whitelist import Whitelist
from itn.chinese.rules.time import Time
from itn.chinese.rules.postprocessor import PostProcessor
from itn.chinese.rules.license_plate import LicensePlate
from itn.chinese.rules.dataflow import DataFlow     ####

from pynini.lib.pynutil import add_weight, delete
from importlib_resources import files


class InverseNormalizer(Processor):

    def __init__(self, cache_dir=None, overwrite_cache=False,
                 enable_standalone_number=True,
                 enable_0_to_9=False):
        super().__init__(name='inverse_normalizer', ordertype='itn')
        self.convert_number = enable_standalone_number
        self.enable_0_to_9 = enable_0_to_9
        if cache_dir is None:
            cache_dir = files("itn")
        self.build_fst('zh_itn', cache_dir, overwrite_cache)

    def build_tagger(self):
        tagger = (add_weight(Date().tagger, 1.02)
                  | add_weight(Whitelist().tagger, 1.01)
                  | add_weight(Fraction().tagger, 1.05)
                  | add_weight(Measure(enable_0_to_9=self.enable_0_to_9).tagger, 1.05)  # noqa
                  | add_weight(Money(enable_0_to_9=self.enable_0_to_9).tagger, 1.04)  # noqa
                  | add_weight(Time().tagger, 1.05)
                  | add_weight(Cardinal(self.convert_number, self.enable_0_to_9).tagger, 1.06)  # noqa
                  | add_weight(Math().tagger, 1.10)
                  | add_weight(LicensePlate().tagger, 1.0)
                  | add_weight(DataFlow().tagger, 1.11)            ####
                  | add_weight(Char().tagger, 100)).optimize()

        tagger = tagger.star
        # remove the last space
        self.tagger = tagger @ self.build_rule(delete(' '), '', '[EOS]')

    def build_verbalizer(self):
        verbalizer = (Cardinal(self.convert_number, self.enable_0_to_9).verbalizer  # noqa
                      | Char().verbalizer
                      | Date().verbalizer
                      | Fraction().verbalizer
                      | Math().verbalizer
                      | Measure(enable_0_to_9=self.enable_0_to_9).verbalizer
                      | Money(enable_0_to_9=self.enable_0_to_9).verbalizer
                      | Time().verbalizer
                      | LicensePlate().verbalizer
                      | DataFlow(enable_0_to_9=self.enable_0_to_9).verbalizer        ####
                      | Whitelist().verbalizer).optimize()
        postprocessor = PostProcessor(remove_interjections=True).processor

        self.verbalizer = (verbalizer @ postprocessor).star

最后运行python -m itn --text "八兆流量" --overwrite_cache后报错如下,
(wenetITN) liuhangchen@G08:~/WeNetITN/WeTextProcessing$ python -m itn --text "八兆流量" --overwrite_cache
dataflow { value: "8" dataflow: "M" } char { value: "流" } char { value: "量" }
ERROR: StringFstToOutputLabels: Invalid start state
Traceback (most recent call last):
File "/storage1/liuhangchen/anaconda3/envs/wenetITN/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/storage1/liuhangchen/anaconda3/envs/wenetITN/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/itn/main.py", line 4, in
main()
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/itn/main.py", line 53, in main
print(normalizer.normalize(args.text))
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/tn/processor.py", line 96, in normalize
return self.verbalize(self.tag(input))
File "/storage1/liuhangchen/WeNetITN/WeTextProcessing/tn/processor.py", line 93, in verbalize
return shortestpath(lattice, nshortest=1, unique=True).string()
File "extensions/_pynini.pyx", line 462, in _pynini.Fst.string
File "extensions/_pynini.pyx", line 507, in _pynini.Fst.string
_pywrapfst.FstOpError: Operation failed

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions