from pyhanlp import *
import zipfile
import os
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH

def test_data_path():
    """
    Get the test data directory, $root/data/test; the root is specified by the HanLP configuration file.
    :return: the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


## Check whether the corpus exists under the test data path; download and unzip it automatically if not
def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path

    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path
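# For example, the ensure_data("pku98", ...) call below downloads and extracts the archive on the
# first run, and on later runs simply returns the extracted directory $HANLP_DATA_PATH/test/pku98.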


## Specify the PKU corpus paths
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')
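# PKU98 is the People's Daily (January 1998) corpus in PKU format: 199801.txt is the full annotated
# text with -train/-test splits, while pos.bin and ner.bin are where the trained models are written.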


## ===============================================
## Perceptron-based named entity recognition starts here

# HanLP's Java classes, bridged into Python via JClass (JPype) from pyhanlp
NERTrainer = JClass('com.hankcs.hanlp.model.perceptron.NERTrainer')
PerceptronNERecognizer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronNERecognizer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
Sentence = JClass('com.hankcs.hanlp.corpus.document.sentence.Sentence')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
Utility = JClass('com.hankcs.hanlp.model.perceptron.utility.Utility')


def train(corpus, model):
    # Train a perceptron NER model on `corpus`, save it to `model`, and wrap it in a recognizer
    trainer = NERTrainer()
    return PerceptronNERecognizer(trainer.train(corpus, model).getModel())


def test(recognizer):
    # A lexical analyzer wrapping the perceptron segmenter, the POS tagger and the NER recognizer
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)
    print(analyzer.analyze("华北电力公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"))
    # Evaluate the recognizer on the held-out test set and print precision/recall/F1 scores
    scores = Utility.evaluateNER(recognizer, PKU199801_TEST)
    Utility.printNERScore(scores)


if __name__ == '__main__':
    recognizer = train(PKU199801_TRAIN, NER_MODEL)
    test(recognizer)

    ## Online learning is also supported
    # Create a perceptron lexical analyzer from the segmenter, POS tagger and the trained recognizer
    analyzer = PerceptronLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)  # ①
    # Build a Sentence object equivalent to the string form of the annotated sample
    sentence = Sentence.create("与/c 特朗普/nr 通/v 电话/n 讨论/v [太空/s 探索/vn 技术/n 公司/n]/nt")  # ②
    # Check whether the analyzer's output matches the annotation; if not, keep learning online until they agree
    while not analyzer.analyze(sentence.text()).equals(sentence):  # ③
        analyzer.learn(sentence)
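    # Optional sanity check: the loop above exits only once the analyzer reproduces the gold
    # annotation, so this should print the same segmentation, POS tags and entity as the sample
    print(analyzer.analyze(sentence.text()))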