Skip to content

Commit f247686

Browse files
authored
Add files via upload
1 parent 8ea9977 commit f247686

File tree

3 files changed

+212
-0
lines changed

3 files changed

+212
-0
lines changed

code/ch07/crf_pos.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
2+
3+
from pyhanlp import *
4+
import zipfile
5+
import os
6+
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH
7+
8+
def test_data_path():
    """
    Return the test-data directory ``$root/data/test`` and create it if absent.

    The root directory comes from the pyhanlp configuration
    (``HANLP_DATA_PATH``).

    :return: path of the test-data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs(exist_ok=True) replaces the isdir-then-mkdir pair: it is free
    # of the check/create race and also succeeds when intermediate
    # directories are missing.
    os.makedirs(data_path, exist_ok=True)
    return data_path
17+
18+
19+
20+
## Ensure the requested corpus is present locally, downloading it on demand.
def ensure_data(data_name, data_url):
    """
    Ensure that corpus *data_name* exists under the test-data directory,
    downloading it from *data_url* when missing. Zip archives are extracted
    in place and the archive file is removed afterwards.

    :param data_name: name of the corpus directory/file under the test path
    :param data_url: URL to fetch when the corpus is absent
    :return: local path of the corpus
    """
    root = test_data_path()
    target = os.path.join(root, data_name)
    if os.path.exists(target):
        return target

    is_zip = data_url.endswith('.zip')
    # Download to "<target>.zip" for archives, to <target> otherwise.
    archive_path = target + '.zip' if is_zip else target
    download(data_url, archive_path)
    if is_zip:
        # Unpack next to the archive, then discard the archive itself.
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(root)
        remove_file(archive_path)
    return target
36+
37+
38+
## Paths of the PKU-98 corpus (downloaded on first use via ensure_data).
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")  # corpus root directory
PKU199801 = os.path.join(PKU98, '199801.txt')              # full 199801 corpus file
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')  # training split
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')    # test split
POS_MODEL = os.path.join(PKU98, 'pos.bin')                 # where the trained POS model is stored
NER_MODEL = os.path.join(PKU98, 'ner.bin')                 # where a trained NER model would be stored
45+
46+
47+
## ===============================================
## CRF part-of-speech tagging starts here.
## Java classes exposed through pyhanlp's JClass bridge.
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')  # combines a segmenter with a tagger
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')  # perceptron word segmenter
CRFPOSTagger = JClass('com.hankcs.hanlp.model.crf.CRFPOSTagger')  # CRF-based POS tagger
53+
54+
55+
56+
def train_crf_pos(corpus):
    """
    Train a CRF part-of-speech tagger on *corpus*, demonstrate it on a
    sample sentence, and return the loaded tagger.

    :param corpus: path of the training corpus
    :return: a CRFPOSTagger loaded from the freshly trained model
    """
    # Option 1: train through HanLP's Java API (slow).
    blank_tagger = CRFPOSTagger(None)      # blank tagger used only for training
    blank_tagger.train(corpus, POS_MODEL)  # fit and persist the model
    tagger = CRFPOSTagger(POS_MODEL)       # reload the trained model
    # Option 2: train with CRF++ and have HanLP load the text model
    # (the training command is produced by option 1).
    # tagger = CRFPOSTagger(POS_MODEL + ".txt")

    # Pair the tagger with a perceptron segmenter so a single analyzer
    # performs segmentation and POS tagging together.
    lexical_analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)
    sample = "李狗蛋的希望是希望上学"
    print(lexical_analyzer.analyze(sample))                    # segmentation + POS tags
    print(lexical_analyzer.analyze(sample).translateLabels())  # human-readable tag names
    return tagger
67+
68+
69+
if __name__ == '__main__':
    # Train and load a CRF POS tagger on the PKU-98 training split.
    tagger = train_crf_pos(PKU199801_TRAIN)

code/ch07/hmm_pos.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
2+
3+
from pyhanlp import *
4+
import zipfile
5+
import os
6+
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH
7+
8+
def test_data_path():
    """
    Return the test-data directory ``$root/data/test`` and create it if absent.

    The root directory comes from the pyhanlp configuration
    (``HANLP_DATA_PATH``).

    :return: path of the test-data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs(exist_ok=True) replaces the isdir-then-mkdir pair: it is free
    # of the check/create race and also succeeds when intermediate
    # directories are missing.
    os.makedirs(data_path, exist_ok=True)
    return data_path
17+
18+
19+
20+
## Ensure the requested corpus is present locally, downloading it on demand.
def ensure_data(data_name, data_url):
    """
    Ensure that corpus *data_name* exists under the test-data directory,
    downloading it from *data_url* when missing. Zip archives are extracted
    in place and the archive file is removed afterwards.

    :param data_name: name of the corpus directory/file under the test path
    :param data_url: URL to fetch when the corpus is absent
    :return: local path of the corpus
    """
    root = test_data_path()
    target = os.path.join(root, data_name)
    if os.path.exists(target):
        return target

    is_zip = data_url.endswith('.zip')
    # Download to "<target>.zip" for archives, to <target> otherwise.
    archive_path = target + '.zip' if is_zip else target
    download(data_url, archive_path)
    if is_zip:
        # Unpack next to the archive, then discard the archive itself.
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(root)
        remove_file(archive_path)
    return target
36+
37+
38+
## Paths of the PKU-98 corpus (downloaded on first use via ensure_data).
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")  # corpus root directory
PKU199801 = os.path.join(PKU98, '199801.txt')              # full 199801 corpus file
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')  # training split
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')    # test split
POS_MODEL = os.path.join(PKU98, 'pos.bin')                 # where a trained POS model would be stored
NER_MODEL = os.path.join(PKU98, 'ner.bin')                 # where a trained NER model would be stored
45+
46+
47+
## ===============================================
## HMM part-of-speech tagging starts here.
## Java classes exposed through pyhanlp's JClass bridge.
HMMPOSTagger = JClass('com.hankcs.hanlp.model.hmm.HMMPOSTagger')  # HMM-based POS tagger
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')  # combines a segmenter with a tagger
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')  # perceptron word segmenter
FirstOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.FirstOrderHiddenMarkovModel')  # first-order HMM backend
SecondOrderHiddenMarkovModel = JClass('com.hankcs.hanlp.model.hmm.SecondOrderHiddenMarkovModel')  # second-order HMM backend
57+
58+
def train_hmm_pos(corpus, model):
    """
    Train an HMM part-of-speech tagger backed by *model* on *corpus*,
    demonstrate it on sample sentences, and return the tagger.

    :param corpus: path of the training corpus
    :param model: hidden Markov model instance (first- or second-order)
    :return: the trained HMMPOSTagger
    """
    tagger = HMMPOSTagger(model)  # POS tagger driven by the given HMM
    tagger.train(corpus)          # fit the tagger on the corpus
    # Tag an already-segmented word sequence directly.
    print(', '.join(tagger.tag("他", "的", "希望", "是", "希望", "上学")))
    # Pair the tagger with a perceptron segmenter so a single analyzer
    # performs segmentation and POS tagging together.
    lexical_analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)
    print(lexical_analyzer.analyze("他的希望是希望上学"))                    # segmentation + POS tags
    print(lexical_analyzer.analyze("他的希望是希望上学").translateLabels())  # human-readable tag names
    print(lexical_analyzer.analyze("李狗蛋的希望是希望上学").translateLabels())  # human-readable tag names
    return tagger
67+
68+
69+
if __name__ == '__main__':
    # Train and compare first- and second-order HMM POS taggers on the
    # PKU-98 training split.
    print('一阶隐马尔可夫模型:')
    tagger1 = train_hmm_pos(PKU199801_TRAIN, FirstOrderHiddenMarkovModel())  # first-order HMM
    print('')
    print('二阶隐马尔可夫模型:')
    tagger = train_hmm_pos(PKU199801_TRAIN, SecondOrderHiddenMarkovModel())  # second-order HMM

code/ch07/perceptron_pos.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
2+
3+
from pyhanlp import *
4+
import zipfile
5+
import os
6+
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH
7+
8+
def test_data_path():
    """
    Return the test-data directory ``$root/data/test`` and create it if absent.

    The root directory comes from the pyhanlp configuration
    (``HANLP_DATA_PATH``).

    :return: path of the test-data directory
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs(exist_ok=True) replaces the isdir-then-mkdir pair: it is free
    # of the check/create race and also succeeds when intermediate
    # directories are missing.
    os.makedirs(data_path, exist_ok=True)
    return data_path
17+
18+
19+
20+
## Ensure the requested corpus is present locally, downloading it on demand.
def ensure_data(data_name, data_url):
    """
    Ensure that corpus *data_name* exists under the test-data directory,
    downloading it from *data_url* when missing. Zip archives are extracted
    in place and the archive file is removed afterwards.

    :param data_name: name of the corpus directory/file under the test path
    :param data_url: URL to fetch when the corpus is absent
    :return: local path of the corpus
    """
    root = test_data_path()
    target = os.path.join(root, data_name)
    if os.path.exists(target):
        return target

    is_zip = data_url.endswith('.zip')
    # Download to "<target>.zip" for archives, to <target> otherwise.
    archive_path = target + '.zip' if is_zip else target
    download(data_url, archive_path)
    if is_zip:
        # Unpack next to the archive, then discard the archive itself.
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(root)
        remove_file(archive_path)
    return target
36+
37+
38+
## Paths of the PKU-98 corpus (downloaded on first use via ensure_data).
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")  # corpus root directory
PKU199801 = os.path.join(PKU98, '199801.txt')              # full 199801 corpus file
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')  # training split
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')    # test split
POS_MODEL = os.path.join(PKU98, 'pos.bin')                 # where the trained POS model is stored
NER_MODEL = os.path.join(PKU98, 'ner.bin')                 # where a trained NER model would be stored
45+
46+
47+
## ===============================================
## Perceptron part-of-speech tagging starts here.
## Java classes exposed through pyhanlp's JClass bridge.
AbstractLexicalAnalyzer = JClass('com.hankcs.hanlp.tokenizer.lexical.AbstractLexicalAnalyzer')  # combines a segmenter with a tagger
PerceptronSegmenter = JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')  # perceptron word segmenter
POSTrainer = JClass('com.hankcs.hanlp.model.perceptron.POSTrainer')  # trainer for perceptron POS models
PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')  # perceptron-based POS tagger
56+
57+
def train_perceptron_pos(corpus):
    """
    Train a perceptron part-of-speech tagger on *corpus*, demonstrate it on
    a sample sentence, and return the loaded tagger.

    :param corpus: path of the training corpus
    :return: a PerceptronPOSTagger loaded from the freshly trained model
    """
    POSTrainer().train(corpus, POS_MODEL)    # train and persist the model
    tagger = PerceptronPOSTagger(POS_MODEL)  # reload the trained model
    # Pair the tagger with a perceptron segmenter so a single analyzer
    # performs segmentation and POS tagging together.
    lexical_analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), tagger)
    sample = "李狗蛋的希望是希望上学"
    print(lexical_analyzer.analyze(sample))                    # segmentation + POS tags
    print(lexical_analyzer.analyze(sample).translateLabels())  # human-readable tag names
    return tagger
65+
66+
67+
if __name__ == '__main__':
    # Train a perceptron POS tagger on the PKU-98 training split.
    train_perceptron_pos(PKU199801_TRAIN)

0 commit comments

Comments
 (0)