forked from felixonmars/fcitx5-pinyin-zhwiki
-
Notifications
You must be signed in to change notification settings - Fork 2
/
convert.py
executable file
·264 lines (224 loc) · 9.31 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Usage:
# convert.py input_filename
# input_filename is a file of Wikipedia article titles, one title per line.
import logging
import re
import sys
import opencc
from pypinyin import lazy_pinyin, pinyin, Style, load_single_dict, load_phrases_dict
from pypinyin.core import PHRASES_DICT, PINYIN_DICT
# Require at least 3 characters
_MINIMUM_LEN = 3
_LIST_PAGE_ENDINGS = [
'列表',
'对照表',
]
_LOG_EVERY = 100000
_PINYIN_SEPARATOR = '\''
_HANZI_RE = re.compile('^[\u4e00-\u9fa5]+$')
_TO_SIMPLIFIED_CHINESE = opencc.OpenCC('t2s.json')
_TO_TRADITIONAL_CHINESE = opencc.OpenCC('s2t.json')
_PINYIN_FIXES = {
'n': 'en', # https://github.com/felixonmars/fcitx5-pinyin-zhwiki/issues/13
}
logging.basicConfig(level=logging.INFO)
def is_good_title(title, previous_title=None):
if not _HANZI_RE.match(title):
return False
# Skip single character & too long pages
if len(title) < _MINIMUM_LEN:
return False
# Skip list pages
if title.endswith(tuple(_LIST_PAGE_ENDINGS)):
return False
if previous_title and \
len(previous_title) >= 4 and \
title.startswith(previous_title):
return False
return True
def log_count(count):
logging.info(f'{count} words generated')
def process(convert_title, title_to_line, convert_titles):
previous_title = None
result_count = 0
with open(sys.argv[1]) as f:
titles = []
for line in f:
# 日本动画很多以`!``。`结尾,移除这些符号
title = convert_title(line.strip().rstrip('!。'))
titles.append(title)
titles = convert_titles(titles)
for title in titles:
if is_good_title(title, previous_title):
line = title_to_line(title)
if line is not None:
print(line)
result_count += 1
if result_count % _LOG_EVERY == 0:
log_count(result_count)
previous_title = title
log_count(result_count)
def flat_phrases(phrases):
"""
@see https://zhuanlan.zhihu.com/p/66930500
"""
lines = []
def process(arr, index=0):
if index >= len(phrases):
lines.append(arr.copy())
return
for it in phrases[index]:
arr.append(it)
process(arr, index+1)
arr.pop()
process([])
return [' '.join(line) for line in lines]
def main():
if len(sys.argv) > 2 and sys.argv[2] == '--rime':
load_luna_dict()
def title_to_line(title):
phrases = pinyin(title, style=Style.NORMAL, heteronym=True)
return '\n'.join((f'{title}\t{phrase}' for phrase in flat_phrases(phrases)))
def convert_titles(titles):
map = {}
map_traditional = {}
for indexed_title in enumerate(titles):
index = indexed_title[0]
title = indexed_title[1]
title_simple = _TO_SIMPLIFIED_CHINESE.convert(title)
if title != title_simple:
# 和简体不同, 则title一定是繁体字
# 但这个繁体不一定是正确的繁体, 一般是存在多条错误的繁体+一条正确的繁体, 这里使用OpenCC筛选出正确的繁体, 优先用它
if title_simple in map_traditional:
# 如果title_simple已添加, 则只有当前title是OpenCC认可的title_simple的繁体形式时, 才替换之前添加的内容
title_traditional = _TO_TRADITIONAL_CHINESE.convert(title_simple)
if title_traditional == title:
map_traditional[title_simple] = indexed_title
else:
map_traditional[title_simple] = indexed_title
else:
# 和简体相同, 则title大概率是简体, 用OpenCC将它转成繁体
title_traditional = _TO_TRADITIONAL_CHINESE.convert(title)
map[title] = (index, title_traditional)
# map_traditional是原始文本中的繁体, map是OpenCC转换出来的繁体
# 我们认为map_traditional的准确率更高, 用它覆盖map
map.update(map_traditional)
# 依index排序, 保证词条顺序不变
return (title for index, title in sorted(map.values(), key=lambda it: it[0]))
process(
convert_title=lambda it: it,
title_to_line=title_to_line,
convert_titles=convert_titles,
)
else:
def title_to_line(title):
pinyin = [_PINYIN_FIXES.get(item, item) for item in lazy_pinyin(title)]
pinyin = _PINYIN_SEPARATOR.join(pinyin)
if pinyin == title:
logging.info(
f'Failed to convert to Pinyin. Ignoring: {pinyin}')
return None
return '\t'.join([title, pinyin, '0'])
process(
convert_title=lambda it: _TO_SIMPLIFIED_CHINESE.convert(it),
title_to_line=title_to_line,
convert_titles=lambda it: list(dict.fromkeys(it)),
)
# example:
# 於 wu 0%
PATTERN_RIME_DICT_ITEM = re.compile(r'^(?P<word>\w+)\t(?P<pinyin>[a-z ]+)(\t(?P<percent>[\d.]+)%)?$')
class Dict(dict):
def __setattr__(self, key, value):
self[key] = value
def __getattr__(self, key):
return self[key]
def load_luna_dict():
single_dict = {}
phrases_dict = {}
# 朙月拼音是专为繁体字设计的字典, 里面的简体字被看成"被大陸簡化字借用字形的傳承字"标注的是"古音"
# 直接用来处理带简体字的zhwiki效果惨不忍睹(-_-#), 这里使用opencc尝试规避该问题
luna_dict = {}
luna_dict_simple = {}
with open('./rime-luna-pinyin/luna_pinyin.dict.yaml', mode='r') as f:
for line in f:
match = PATTERN_RIME_DICT_ITEM.match(line)
if match:
item = Dict(match.groupdict())
# item中的words字段进用来debug时追踪item的来源
word = item['word']
item.pop('word')
item.words = word
item.percent = float(item.percent) if item.percent is not None else 100
if luna_dict.get(word) is None:
luna_dict[word] = [item]
else:
# 多音字
luna_dict[word].append(item)
word_simple = _TO_SIMPLIFIED_CHINESE.convert(word)
if word != word_simple:
item_simple = Dict(item)
if luna_dict_simple.get(word_simple) is None:
luna_dict_simple[word_simple] = [item_simple]
else:
# 多繁转一简后同音的情况, 此时应该将词频累加
for exist_item in luna_dict_simple[word_simple]:
if exist_item.pinyin == item_simple.pinyin:
exist_item.percent += item_simple.percent
exist_item.words += item_simple.words
# logging.info(f'exist_item: {exist_item}')
break
else:
luna_dict_simple[word_simple].append(item_simple)
# 使用简体字的注音覆盖繁体字的注音, 则那些"被大陸簡化字借用字形的傳承字"的注音大多会被覆盖掉...
luna_dict.update(luna_dict_simple)
for (word, items) in luna_dict.items():
for item in items:
if item.percent < 5:
# 排除低频词
continue
if len(word) == 1:
codePoint = ord(word)
if single_dict.get(codePoint) is None:
single_dict[codePoint] = item.pinyin
else:
single_dict[codePoint] = f'{single_dict[codePoint]},{item.pinyin}'
else:
w = item.pinyin.split(' ')
if phrases_dict.get(word) is None:
phrases_dict[word] = [[it] for it in w]
elif len(phrases_dict[word]) == len(w):
for i in range(len(w)):
phrases_dict[word][i].append(w[i])
else:
logging.warn(f'invalid pinyin: {word} -> {item}')
# 移除内置单字词典的多音字
for (word, pinyins) in PINYIN_DICT.items():
pinyin_list = pinyins.split(',')
if len(pinyin_list) > 1:
PINYIN_DICT[word] = pinyin_list[0]
# 移除内置词组的多音字
for (word, phrases) in PHRASES_DICT.items():
for p in phrases:
while (len(p) > 1):
p.pop()
# 加载luna词典
load_single_dict(single_dict)
load_phrases_dict(phrases_dict)
def test_load_luna_dict():
load_luna_dict()
print(lazy_pinyin('長月達平'))
print(pinyin('長月達平', style=Style.NORMAL, heteronym=True))
print(pinyin('一个女大学生的情思', style=Style.NORMAL, heteronym=True))
def test_flat_phrases():
print(flat_phrases([
['a', 'b', 'c'],
['1', '2', '3'],
['fuck', 'you'],
]))
if __name__ == '__main__':
if len(sys.argv) > 1:
main()
else:
test_load_luna_dict()