This repository has been archived by the owner on Feb 9, 2023. It is now read-only.
Showing 5 changed files with 71 additions and 50 deletions.
@@ -1,17 +1,26 @@
Pinyin
======
------

Smart Chinese-to-Pinyin converter.

Install
=======

Getting Started
---------------

pip install smart_pinyin

Usage
=====
-----

>>> from pinyin import Pinyin
>>> pinyin = Pinyin()
>>> " ".join(pinyin.get_pinyin("银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深"))
u'yin hang hang zhang pan wei bo zhang le yi tou wu hei de bai fa \uff0c shui jiao shui de hen wan , dao heng hen shen'
>>> ' '.join(pinyin.get_pinyin('银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深', failure=''))
u'yin hang hang zhang pan wei bo zhang le yi tou wu hei de bai fa \uff0c shui jiao shui de hen wan , dao heng hen shen'

>>> for i in pinyin.get_pinyin_all('自行车'): print list(i)
[u'zi', u'hang', u'che']
[u'zi', u'hang', u'ju']
[u'zi', u'heng', u'che']
[u'zi', u'heng', u'ju']
[u'zi', u'xing', u'che']
[u'zi', u'xing', u'ju']
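
get_pinyin_all enumerates every combination of candidate readings for the input, so the result can grow quickly for ambiguous text; the implementation below accepts a max_return argument to cap the enumeration. A hedged sketch, assuming the candidate ordering shown in the listing above:

>>> for i in pinyin.get_pinyin_all('自行车', max_return=2): print list(i)
[u'zi', u'hang', u'che']
[u'zi', u'hang', u'ju']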
@@ -1,72 +1,59 @@
# -*- coding: utf-8 -*-

import re
import jieba
from collections import defaultdict
from itertools import product, islice
from os.path import join, abspath, dirname

__all__ = ["Pinyin"]
from pinyin.config import FILE_WORDS, FILE_WORD, FILE_TERM, FILE_USER_DICT, CHINESE_RE
from pinyin.utils import Singleton

BASE = abspath(dirname(__file__))
ZH_CN_RE = re.compile(ur'^[\u4e00-\u9fa5]+$')
__all__ = ["Pinyin"]


class Pinyin(object):

    __metaclass__ = Singleton

    def __init__(self):

        self.word2pinyins = defaultdict(list)
        for line in open(join(BASE, "data/words.dic")):
            pinyin, words = line.strip().decode('utf-8').split(" ", 1)
        self.word_to_pinyins = defaultdict(list)
        for line in open(FILE_WORDS):
            pinyin, words = line.strip().decode("utf-8").split()
            for item in words:
                self.word2pinyins[item].append(pinyin)
                self.word_to_pinyins[item].append(pinyin)

        self.word2pinyin = {}
        for l in open(join(BASE, "data/word.dic")):
            word, pinyin = l.strip().decode('utf-8').split(',')
            self.word2pinyin[word] = pinyin
        self.word_to_pinyin = {}
        for line in open(FILE_WORD):
            word, pinyin = line.strip().decode("utf-8").split(",")
            self.word_to_pinyin[word] = pinyin

        self.term2pinyin = {}
        for line in open(join(BASE, "data/term.dic")):
            term, pinyin = line.strip().decode('utf-8').split('#')
            self.term2pinyin[term] = pinyin.split('@')
        self.term_to_pinyin = {}
        for line in open(FILE_TERM):
            term, pinyin = line.strip().decode("utf-8").split("#")
            self.term_to_pinyin[term] = pinyin.split("@")

        jieba.initialize()
        jieba.load_userdict(join(BASE, "data/user_dict.dic"))
        jieba.load_userdict(FILE_USER_DICT)

    def _pinyin(self, term):
        pinyin_list = self.term2pinyin.get(term, None)
    def _pinyin(self, term, failure=None):
        pinyin_list = self.term_to_pinyin.get(term, None)
        if not pinyin_list:
            pinyin_list = [self.word2pinyin.get(word, word) for word in term]
            pinyin_list = [self.word_to_pinyin.get(word, word if failure is None else failure) for word in term]
        return pinyin_list

    def get_pinyin(self, text):
    def get_pinyin(self, text, failure=None):
        term_list = jieba.cut(text, cut_all=False)
        pinyin_list_iter = (self._pinyin(term) if ZH_CN_RE.match(term) else [term] for term in term_list)
        pinyin_list_iter = (
            self._pinyin(term, failure) if CHINESE_RE.match(term) else [term if failure is None else failure]
            for term in term_list
        )
        return [pinyin for pinyin_list in pinyin_list_iter for pinyin in pinyin_list]

    def get_pinyin_all(self, text, max_return=None):
    def get_pinyin_all(self, text, max_return=None, failure=None):
        if not isinstance(text, unicode):
            text = text.decode('utf-8', 'ignore')
        rs = [self.word2pinyins.get(word, [word]) for word in text]
            text = text.decode("utf-8", "ignore")
        rs = [self.word_to_pinyins.get(word, [word if failure is None else failure]) for word in text]
        pinyin_all_iter = product(*rs)
        if max_return is not None:
            pinyin_all_iter = islice(pinyin_all_iter, 0, max_return)
        return pinyin_all_iter


if __name__ == "__main__":
    pinyin = Pinyin()
    text = "银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深"

    print "# get_pinyin(%s)" % text
    for i in pinyin.get_pinyin(text):
        print i,
    print "\n"

    print "# get_pinyin_all(%s)" % text
    for i in pinyin.get_pinyin_all(text, max_return=8):
        print ' '.join(i),
    print "\n"
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

import re
from os.path import join, abspath, dirname


DB_DIR = abspath(dirname(__file__))

CHINESE_RE = re.compile(ur'^[\u4e00-\u9fa5]+$')

FILE_WORDS = join(DB_DIR, "data/words.dic")
FILE_WORD = join(DB_DIR, "data/word.dic")
FILE_TERM = join(DB_DIR, "data/term.dic")
FILE_USER_DICT = join(DB_DIR, "data/user_dict.dic")
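
CHINESE_RE is anchored on both ends and only matches strings made up entirely of characters in the CJK Unified Ideographs range (U+4E00 to U+9FA5); get_pinyin uses it to decide whether a segmented term should be converted or passed through. A quick illustration, assuming the module is importable as pinyin.config:

# -*- coding: utf-8 -*-
from pinyin.config import CHINESE_RE

print bool(CHINESE_RE.match(u"自行车"))      # True: every character is a CJK ideograph
print bool(CHINESE_RE.match(u"pinyin"))      # False: ASCII only
print bool(CHINESE_RE.match(u"道行, 很深"))  # False: punctuation breaks the anchored match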
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-


class Singleton(type):

    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]
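
Because Singleton is a metaclass, any class that declares it via __metaclass__ (as Pinyin does above) is constructed once per process and reused afterwards, which avoids reloading the dictionaries on every Pinyin() call. A small illustration with a hypothetical Demo class, assuming the Python 2 metaclass syntax used in this commit:

from pinyin.utils import Singleton


class Demo(object):
    __metaclass__ = Singleton  # route construction through Singleton.__call__

    def __init__(self):
        print "initialising"  # runs only for the first Demo() call


a = Demo()
b = Demo()
print a is b  # True: the second call returns the cached instance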