Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

添加中英文混输特性 #5

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@ pinyin.py
Example:

from pinyin import PinYin

test = PinYin()
test.load_word()
test.hanzi2pinyin(string='钓鱼岛是中国的')

test = PinYin()

Out:

test.hanzi2pinyin(string='钓鱼岛是中国的')
['diao', 'yu', 'dao', 'shi', 'zhong', 'guo', 'de']
['diao', 'yu', 'dao', 'shi', 'zhong', 'guo', 'de']
test.hanzi2pinyin_split(string='钓鱼岛是中国的', split="-")
diao-yu-dao-shi-zhong-guo-de

test.hanzi2pinyin(string='hello world 123')
out: [u'helloworld123']
test.hanzi2pinyin_split(string='hello world 123', split="_")
out: helloworld123

test.hanzi2pinyin(string='hello 中国 123')
out: [u'hello', 'zhong', 'guo', u'123']
test.hanzi2pinyin_split(string='hello 中国 123', split="_")
out: hello_zhong_guo_123
34 changes: 26 additions & 8 deletions pinyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class PinYin(object):
# Set up an empty lookup table, remember the dictionary file path, and load
# the dictionary immediately so callers do not have to call load_word().
# NOTE(review): this text is a diff view with indentation stripped; confirm
# the exact layout against the real pinyin.py.
def __init__(self, dict_file='word.data'):
# Lookup table that load_word() fills from the dictionary file.
self.word_dict = {}
# Path that load_word() checks for existence before reading.
self.dict_file = dict_file

# Load the dictionary as part of construction.
self.load_word()

def load_word(self):
if not os.path.exists(self.dict_file):
Expand All @@ -31,19 +31,30 @@ def load_word(self):
line = f_line.split(' ')
self.word_dict[line[0]] = line[1]


# Convert `string` into a list of items: one lowercase pinyin syllable per
# character that has a multi-character dictionary entry, and one joined chunk
# per consecutive run of characters whose looked-up value is a single
# character (e.g. letters and digits that fall back to themselves).
# Byte strings are decoded as UTF-8 first; ASCII spaces are removed before
# conversion, so runs separated only by spaces are merged into one chunk.
# NOTE(review): this span is a diff view with +/- markers and indentation
# stripped. The two `for char in string...` headers and the
# `result.append(...)` / `word = ...` pair look like the removed and added
# versions of the same statements -- confirm against the real pinyin.py.
def hanzi2pinyin(self, string=""):
# Final output list.
result = []
# Pending run of single-character values waiting to be joined.
alnum = []

# Python 2: work on unicode so that iteration yields whole characters.
if not isinstance(string, unicode):
string = string.decode("utf-8")
for char in string:

# Iterate over the input with every ASCII space removed.
for char in string.replace(' ', ''):
# Dictionary key is the uppercase hex code point with no "0x" prefix.
# NOTE(review): load_word() stores the first field of each line as the key,
# so word.data rows written as "0x30 0" would not match this key; such
# characters would then fall back to `char` below -- verify.
key = '%X' % ord(char)
result.append(self.word_dict.get(key, char).split()[0][:-1].lower())
# Fall back to the character itself when it has no dictionary entry, then
# keep only the first whitespace-separated token of the value.
# NOTE(review): if `char` is unmapped whitespace other than ' ' (e.g. a tab),
# .split() returns an empty list and [0] raises IndexError -- confirm.
word = self.word_dict.get(key, char).split()[0]
if len(word) == 1:
# Every pinyin entry carries a tone mark, so its length is greater than 1;
# a length-1 value is therefore treated as a non-pinyin character.
alnum.append(word)
else:
# A pinyin syllable ends the current run: flush the run as one item first.
if alnum:
words = ''.join(alnum)
alnum = []
result.append(words)
# Drop the last character (the tone mark, per the comment above) and lowercase.
result.append(word[:-1].lower())
# Flush a run that reaches the end of the input.
if alnum:
result.append(''.join(alnum))

return result


def hanzi2pinyin_split(self, string="", split=""):
result = self.hanzi2pinyin(string=string)
if split == "":
Expand All @@ -54,8 +65,15 @@ def hanzi2pinyin_split(self, string="", split=""):

# Demo run when the module is executed as a script (Python 2 print statements):
# converts a pure-Chinese, a pure-ASCII and a mixed sample string and prints
# both the list form and the joined form.
# NOTE(review): this span is a diff view with +/- markers and indentation
# stripped; confirm which lines survive in the real pinyin.py.
if __name__ == "__main__":
test = PinYin()
# NOTE(review): __init__ already calls load_word(), so this looks like the
# removed side of the diff; if kept, the dictionary would be loaded twice.
test.load_word()
# Sample 1: Chinese characters only.
string = "钓鱼岛是中国的"
print "in: %s" % string
print "out: %s" % str(test.hanzi2pinyin(string=string))
print "out: %s" % test.hanzi2pinyin_split(string=string, split="-")
print "out: %s" % test.hanzi2pinyin_split(string=string, split="_")
# Sample 2: ASCII letters, spaces and digits only.
string = "hello world 123"
print "in: %s" % string
print "out: %s" % str(test.hanzi2pinyin(string=string))
print "out: %s" % test.hanzi2pinyin_split(string=string, split="_")
# Sample 3: ASCII text mixed with Chinese characters.
string = "hello 中国 123"
print "in: %s" % string
print "out: %s" % str(test.hanzi2pinyin(string=string))
print "out: %s" % test.hanzi2pinyin_split(string=string, split="_")
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,4 @@
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules'
]

)
62 changes: 62 additions & 0 deletions word.data
Original file line number Diff line number Diff line change
@@ -1,3 +1,65 @@
0x30 0
0x31 1
0x32 2
0x33 3
0x34 4
0x35 5
0x36 6
0x37 7
0x38 8
0x39 9
0x41 A
0x42 B
0x43 C
0x44 D
0x45 E
0x46 F
0x47 G
0x48 H
0x49 I
0x4a J
0x4b K
0x4c L
0x4d M
0x4e N
0x4f O
0x50 P
0x51 Q
0x52 R
0x53 S
0x54 T
0x55 U
0x56 V
0x57 W
0x58 X
0x59 Y
0x5a Z
0x61 a
0x62 b
0x63 c
0x64 d
0x65 e
0x66 f
0x67 g
0x68 h
0x69 i
0x6a j
0x6b k
0x6c l
0x6d m
0x6e n
0x6f o
0x70 p
0x71 q
0x72 r
0x73 s
0x74 t
0x75 u
0x76 v
0x77 w
0x78 x
0x79 y
0x7a z
3400 QIU1
3401 TIAN3 TIAN4
3404 KUA4
Expand Down