from pyhanlp import *
import zipfile
import os
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH

def test_data_path():
    """
    Get the test data directory, i.e. $root/data/test, where the root directory
    is specified by the HanLP configuration file.
    :return: the test data directory (created if it does not exist yet)
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


## Check whether the corpus already exists; if not, download it (and unzip it when needed).
def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path

    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path

## Specify the PKU corpus and the model paths derived from it.
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')


## ===============================================
## HMM part-of-speech tagging starts here.

HMMPOSTagger = JClass('com.hankcs.hanlp.model.hmm.HMMPOSTagger')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')

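# Background note: a first-order HMM conditions each tag only on the previous
# tag, while a second-order HMM conditions on the previous two tags; the larger
# model usually yields slightly higher tagging accuracy.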
def train_hmm_pos(corpus, model):
    tagger = HMMPOSTagger(model)  # create a POS tagger backed by the given HMM
    tagger.train(corpus)  # train it on the corpus
    print(', '.join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))  # tag an already segmented sentence
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)  # combine with the perceptron segmenter into a lexical analyzer that segments and tags in one pass
    print(analyzer.analyze("他的希望是希望上学"))  # segmentation + POS tagging
    print(analyzer.analyze("他的希望是希望上学").translateLabels())  # translate the POS tags into human-readable labels
    print(analyzer.analyze("李狗蛋的希望是希望上学").translateLabels())  # same, for a sentence containing a personal name
    return tagger


if __name__ == '__main__':
    print('First-order hidden Markov model:')
    tagger1 = train_hmm_pos(PKU199801_TRAIN, FirstOrderHiddenMarkovModel())  # first-order HMM
    print('')
    print('Second-order hidden Markov model:')
    tagger2 = train_hmm_pos(PKU199801_TRAIN, SecondOrderHiddenMarkovModel())  # second-order HMM
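
    # A small follow-up sketch (an assumed usage, reusing only calls already
    # shown in train_hmm_pos above): tagger2.tag() tags an already segmented
    # sentence, while AbstractLexicalAnalyzer segments and tags raw text.
    print(', '.join(tagger2.tag("他", "的", "希望", "是", "希望", "上学")))
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger2)
    print(analyzer.analyze("李狗蛋的希望是希望上学").translateLabels())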