hanzi_prep.py (forked from taozhijiang/chinese_correct_wsd)
#!/usr/bin/python3
# -*- encoding: utf-8 -*-
#
# A script to preprocess Chinese HTML pages into a raw corpus for
# character-based tagging, e.g. CRF and maxent models.
# The output is a list of sentences; each sentence is a sequence of tokens.
# A token is either a Chinese character, an English word, or a punctuation mark.
#
# This script will
# !1. remove all HTML tags in the input file
# !2. remove all JS and CSS
#  3. replace all spaces with newlines
#  4. replace a line consisting only of alphanumeric characters with a newline
#  5. collapse runs of punctuation (including full-width punctuation) into the leading mark
#  6. put a newline after each punctuation mark
#  7. collapse runs of newlines into a single newline
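#
# Usage (a sketch; the file names below are placeholders and the input files
# are assumed to be UTF-8 encoded):
#   python3 hanzi_prep.py page1.txt page2.txt > sentences.txt
# Each output line is one sentence, with tokens separated by single spaces.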
import sys
import codecs
from hanzi_util import is_terminator, is_punct, is_zh
def split_into_sentences_j(line):
    # Variant that keeps only Chinese characters: terminators and other
    # punctuation end or break tokens but are not emitted themselves, and
    # non-Chinese characters are discarded.
    tokens = []
    en_token = []
    def close_token(token):
        if token:
            tokens.append(''.join(token))
            del token[:]
    for c in line:
        if is_terminator(c):
            # a sentence terminator ends the current sentence
            if not tokens: continue
            close_token(en_token)
            #tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c):
            close_token(en_token)
            #tokens.append(c)
        elif is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            close_token(en_token)
        else:
            #en_token.append(c)
            pass
    if tokens:
        yield tokens

def split_into_sentences_e(line):
    # Same behavior as split_into_sentences_j above: only Chinese characters
    # are emitted; punctuation and non-Chinese characters are dropped.
    tokens = []
    en_token = []
    def close_token(token):
        if token:
            tokens.append(''.join(token))
            del token[:]
    for c in line:
        if is_terminator(c):
            # a sentence terminator ends the current sentence
            if not tokens: continue
            close_token(en_token)
            #tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c):
            close_token(en_token)
            #tokens.append(c)
        elif is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            close_token(en_token)
        else:
            #en_token.append(c)
            pass
    if tokens:
        yield tokens

def split_into_sentences(line):
    # Full tokenizer: each Chinese character and each punctuation mark becomes
    # its own token, consecutive non-Chinese characters (e.g. an English word
    # or a number) are grouped into a single token, and a sentence terminator
    # closes the current sentence.
    tokens = []
    en_token = []
    def close_token(token):
        if token:
            tokens.append(''.join(token))
            del token[:]
    for c in line:
        if is_terminator(c):
            # a sentence terminator ends the current sentence
            if not tokens: continue
            close_token(en_token)
            tokens.append(c)
            yield tokens
            tokens = []
        elif is_punct(c):
            close_token(en_token)
            tokens.append(c)
        elif is_zh(c):
            close_token(en_token)
            tokens.append(c)
        elif c == u' ' or c == u'\t':
            close_token(en_token)
        else:
            en_token.append(c)
    if tokens:
        yield tokens

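# For illustration (a sketch, assuming hanzi_util classifies '。' as a
# terminator, '，' as punctuation, and the hanzi below as Chinese characters):
# splitting the line u'我用Python写的，真的。' yields one sentence
#   ['我', '用', 'Python', '写', '的', '，', '真', '的', '。']
# i.e. the English word stays a single token while each hanzi and each
# punctuation mark is its own token.
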
def process(input):
    for line in input:
        for sentence in split_into_sentences(line.strip()):
            yield sentence

def print_sentence(sentence):
    s = u' '.join(sentence)
    print(s)

if __name__ == "__main__":
    for fn in sys.argv[1:]:
        with codecs.open(fn, 'r', 'utf-8') as f:
            for sentence in process(f):
                print_sentence(sentence)
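
# Example run (a sketch; "corpus.txt" is a hypothetical UTF-8 file containing
# the single line "今天天气不错。明天呢？", and hanzi_util is assumed to treat
# '。' and '？' as sentence terminators):
#
#   $ python3 hanzi_prep.py corpus.txt
#   今 天 天 气 不 错 。
#   明 天 呢 ？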