
Commit c86a2ef

Add files via upload
1 parent f9f96dd commit c86a2ef


3 files changed, +236 -0 lines changed


code/ch08/crf_ner.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
from pyhanlp import *
import zipfile
import os
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    Return the test data directory, $root/data/test; the root directory is set in the HanLP config file.
    :return: path to the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


## Make sure the corpus exists; download it automatically if it does not.
def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path

    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path


## Paths inside the PKU corpus
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')


## ===============================================
## CRF named entity recognition starts here

CRFNERecognizer = JClass('com.hankcs.hanlp.model.crf.CRFNERecognizer')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
Utility = JClass('com.hankcs.hanlp.model.perceptron.utility.Utility')


def train(corpus, model):
    # The zero-argument constructor loads the default model from the config file,
    # so pass None (Java null) explicitly to start from a blank model.
    recognizer = CRFNERecognizer(None)  # blank model
    recognizer.train(corpus, model)
    return recognizer


def test(recognizer):
    # A lexical analyzer that wraps the perceptron segmenter and POS tagger around the recognizer
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)
    print(analyzer.analyze("华北电力公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"))
    scores = Utility.evaluateNER(recognizer, PKU199801_TEST)
    Utility.printNERScore(scores)


if __name__ == '__main__':
    recognizer = train(PKU199801_TRAIN, NER_MODEL)
    test(recognizer)
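
Not part of this diff, but worth noting for later runs: train() writes the model to NER_MODEL (ner.bin), so a subsequent __main__ block could reload it instead of retraining. A minimal sketch, assuming CRFNERecognizer, like HanLP's other CRF taggers, also accepts a trained-model path in its constructor:

    # Sketch only (assumes CRFNERecognizer(model_path) loads a saved model,
    # mirroring the blank-model call CRFNERecognizer(None) above).
    recognizer = CRFNERecognizer(NER_MODEL)
    test(recognizer)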

code/ch08/hmm_ner.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
from pyhanlp import *
import zipfile
import os
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    Return the test data directory, $root/data/test; the root directory is set in the HanLP config file.
    :return: path to the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


## Make sure the corpus exists; download it automatically if it does not.
def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path

    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path


## Paths inside the PKU corpus
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')


## ===============================================
## HMM named entity recognition starts here

HMMNERecognizer = JClass('com.hankcs.hanlp.model.hmm.HMMNERecognizer')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
Utility = JClass('com.hankcs.hanlp.model.perceptron.utility.Utility')


def train(corpus):
    recognizer = HMMNERecognizer()
    recognizer.train(corpus)  # data/test/pku98/199801-train.txt
    return recognizer


def test(recognizer):
    # A lexical analyzer that wraps the perceptron segmenter and POS tagger around the recognizer
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)
    print(analyzer.analyze("华北电力公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"))
    scores = Utility.evaluateNER(recognizer, PKU199801_TEST)
    Utility.printNERScore(scores)


if __name__ == '__main__':
    recognizer = train(PKU199801_TRAIN)
    test(recognizer)
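
Not part of this diff: the trained recognizer can also be driven directly with a pre-segmented, POS-tagged word sequence instead of going through a lexical analyzer. A minimal sketch for the __main__ block after train(), assuming the recognize(wordArray, posArray) method of HanLP's NERecognizer interface, which returns one NER tag per word (such as B-nt/M-nt/E-nt/O):

    # Sketch only: tag a manually segmented and POS-tagged sentence.
    word_array = ["华北", "电力", "公司"]  # word sequence
    pos_array = ["ns", "n", "n"]  # POS sequence
    ner_array = recognizer.recognize(word_array, pos_array)
    for word, pos, ner in zip(word_array, pos_array, ner_array):
        print("%s\t%s\t%s" % (word, pos, ner))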

code/ch08/perceptron_ner.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
from pyhanlp import *
import zipfile
import os
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH


def test_data_path():
    """
    Return the test data directory, $root/data/test; the root directory is set in the HanLP config file.
    :return: path to the test data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    if not os.path.isdir(data_path):
        os.mkdir(data_path)
    return data_path


## Make sure the corpus exists; download it automatically if it does not.
def ensure_data(data_name, data_url):
    root_path = test_data_path()
    dest_path = os.path.join(root_path, data_name)
    if os.path.exists(dest_path):
        return dest_path

    if data_url.endswith('.zip'):
        dest_path += '.zip'
    download(data_url, dest_path)
    if data_url.endswith('.zip'):
        with zipfile.ZipFile(dest_path, "r") as archive:
            archive.extractall(root_path)
        remove_file(dest_path)
        dest_path = dest_path[:-len('.zip')]
    return dest_path


## Paths inside the PKU corpus
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')


## ===============================================
## Perceptron named entity recognition starts here

NERTrainer = JClass('com.hankcs.hanlp.model.perceptron.NERTrainer')
PerceptronNERecognizer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronNERecognizer')
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
Sentence = JClass('com.hankcs.hanlp.corpus.document.sentence.Sentence')
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')
Utility = JClass('com.hankcs.hanlp.model.perceptron.utility.Utility')


def train(corpus, model):
    # Train a structured perceptron and wrap the resulting model in a recognizer
    trainer = NERTrainer()
    return PerceptronNERecognizer(trainer.train(corpus, model).getModel())


def test(recognizer):
    # A lexical analyzer that wraps the perceptron segmenter and POS tagger around the recognizer
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)
    print(analyzer.analyze("华北电力公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"))
    scores = Utility.evaluateNER(recognizer, PKU199801_TEST)
    Utility.printNERScore(scores)


if __name__ == '__main__':
    recognizer = train(PKU199801_TRAIN, NER_MODEL)
    test(recognizer)

    ## Online learning is supported
    # Create a perceptron lexical analyzer around the trained recognizer
    analyzer = PerceptronLexicalAnalyzer(PerceptronSegmenter(), PerceptronPOSTagger(), recognizer)  # ①
    # Build an equivalent Sentence object from the string form of an annotated sample
    sentence = Sentence.create("与/c 特朗普/nr 通/v 电话/n 讨论/v [太空/s 探索/vn 技术/n 公司/n]/nt")  # ②
    # Check whether the analyzer's output matches the annotation; if not, keep
    # learning online from this sample until the two agree.
    while not analyzer.analyze(sentence.text()).equals(sentence):  # ③
        analyzer.learn(sentence)
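
Also not in this diff, but a natural sanity check to append inside the __main__ block: once the while loop exits, the analyzer reproduces the annotation exactly, so the newly taught organization name should be recognized in raw text as well.

    # Sketch only: after convergence, 太空探索技术公司 should come out tagged nt.
    print(analyzer.analyze("与特朗普通电话讨论太空探索技术公司"))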

0 commit comments
