Skip to content

Commit 1ab98c7

Browse files
authored
Create train_parser.py
1 parent 40b1dae commit 1ab98c7

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed

code/ch12/train_parser.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from pyhanlp import *
2+
import zipfile
3+
import os
4+
from pyhanlp.static import download, remove_file, HANLP_DATA_PATH
5+
6+
def test_data_path():
    """
    Return the test-data directory, creating it if it does not exist yet.

    The directory is the ``test`` sub-folder of ``HANLP_DATA_PATH``
    (imported from ``pyhanlp.static``).

    :return: path of the test-data directory.
    """
    data_path = os.path.join(HANLP_DATA_PATH, 'test')
    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory already exists, so there is no window between
    # an existence check and the creation in which another process could race.
    os.makedirs(data_path, exist_ok=True)
    return data_path
15+
16+
17+
18+
## Check whether the corpus exists locally; download it automatically if it is missing
19+
def ensure_data(data_name, data_url):
    """
    Make sure a data resource is present in the test-data directory.

    If ``<test data dir>/<data_name>`` already exists it is returned as is.
    Otherwise ``data_url`` is downloaded; when the URL ends with ``.zip`` the
    archive is saved next to the target, extracted into the test-data
    directory and then removed.

    :param data_name: file or directory name expected inside the test-data
        directory.
    :param data_url: URL to download the resource from.
    :return: ``<test data dir>/<data_name>``. NOTE(review): for archives this
        assumes extraction produces an entry named ``data_name``; the code
        does not verify it.
    """
    data_dir = test_data_path()
    target = os.path.join(data_dir, data_name)
    if os.path.exists(target):
        return target

    # Plain file: download straight to its final location.
    if not data_url.endswith('.zip'):
        download(data_url, target)
        return target

    # Archive: download beside the target, unpack, then drop the archive.
    archive_path = target + '.zip'
    download(data_url, archive_path)
    with zipfile.ZipFile(archive_path, "r") as bundle:
        bundle.extractall(data_dir)
    remove_file(archive_path)
    return target
34+
35+
36+
37+
## ===============================================
## Dependency parsing starts below

# Handle to the parser class, resolved through JClass by its fully qualified
# Java name.
KBeamArcEagerDependencyParser = JClass(
    'com.hankcs.hanlp.dependency.perceptron.parser.KBeamArcEagerDependencyParser'
)

# Corpus directory (fetched by ensure_data when absent) and the paths of the
# files used below, all located directly under it.
CTB_ROOT = ensure_data(
    "ctb8.0-dep",
    "http://file.hankcs.com/corpus/ctb8.0-dep.zip",
)
CTB_TRAIN, CTB_DEV, CTB_TEST, CTB_MODEL = (
    CTB_ROOT + suffix
    for suffix in ("/train.conll", "/dev.conll", "/test.conll", "/ctb.bin")
)

# Cluster file handed to the trainer as its third argument.
BROWN_CLUSTER = ensure_data(
    "wiki-cn-cluster.txt",
    "http://file.hankcs.com/corpus/wiki-cn-cluster.zip",
)

# Train with the train/dev sets and cluster file; CTB_MODEL is the model path
# passed to the trainer.
parser = KBeamArcEagerDependencyParser.train(
    CTB_TRAIN,
    CTB_DEV,
    BROWN_CLUSTER,
    CTB_MODEL,
)

# Parse one short sentence as a quick sanity check of the trained parser.
print(parser.parse("人吃鱼"))

# Evaluate on the test set; the first two entries of the result are reported
# as UAS and LAS.
score = parser.evaluate(CTB_TEST)
print("UAS=%.1f LAS=%.1f\n" % (score[0], score[1]))

0 commit comments

Comments
 (0)