Skip to content
This repository has been archived by the owner on Feb 9, 2023. It is now read-only.

Commit

Permalink
add failure params & restruct code
Browse files Browse the repository at this point in the history
  • Loading branch information
mapix committed May 24, 2014
1 parent fd6a634 commit 09ddda1
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 50 deletions.
21 changes: 15 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
Pinyin
======
------

Smart Chinese-to-Pinyin converter.

Install
=======

Getting Started
---------------

pip install smart_pinyin

Usage
=====
-----

>>> from pinyin import Pinyin
>>> pinyin = Pinyin()
>>> " ".join(pinyin.get_pinyin("银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深"))
u'yin hang hang zhang pan wei bo zhang le yi tou wu hei de bai fa \uff0c shui jiao shui de hen wan , dao heng hen shen'
>>> ' '.join(pinyin.get_pinyin('银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深', failure=''))
>>> u'yin hang hang zhang pan wei bo zhang le yi tou wu hei de bai fa \uff0c shui jiao shui de hen wan , dao heng hen shen'

>>> for i in pinyin.get_pinyin_all('自行车'): print list(i)
[u'zi', u'hang', u'che']
[u'zi', u'hang', u'ju']
[u'zi', u'heng', u'che']
[u'zi', u'heng', u'ju']
[u'zi', u'xing', u'che']
[u'zi', u'xing', u'ju']
71 changes: 29 additions & 42 deletions pinyin/__init__.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,59 @@
# -*- coding: utf-8 -*-

import re
import jieba
from collections import defaultdict
from itertools import product, islice
from os.path import join, abspath, dirname

__all__ = ["Pinyin"]
from pinyin.config import FILE_WORDS, FILE_WORD, FILE_TERM, FILE_USER_DICT, CHINESE_RE
from pinyin.utils import Singleton

BASE = abspath(dirname(__file__))
ZH_CN_RE = re.compile(ur'^[\u4e00-\u9fa5]+$')
__all__ = ["Pinyin"]


# Chinese-text-to-pinyin converter backed by three dictionary files and the
# jieba segmenter.
# NOTE(review): this listing appears to interleave pre- and post-commit lines;
# the `word2*` / `term2*` / `BASE` / `ZH_CN_RE` statements look like the older
# spelling of the `word_to_*` / `term_to_*` / `FILE_*` / `CHINESE_RE`
# statements that follow them -- confirm against the repository.
class Pinyin(object):

# Python 2 metaclass hook: instance creation goes through Singleton.__call__.
__metaclass__ = Singleton

# Load the three lookup tables from disk and prepare jieba.
# NOTE(review): the files passed to open() are never explicitly closed.
def __init__(self):

self.word2pinyins = defaultdict(list)
for line in open(join(BASE, "data/words.dic")):
pinyin, words = line.strip().decode('utf-8').split(" ", 1)
# character -> list of every pinyin reading listed for it.
self.word_to_pinyins = defaultdict(list)
for line in open(FILE_WORDS):
# Each line is split on whitespace into exactly two fields: a pinyin and
# a run of characters; more than two fields would fail to unpack.
pinyin, words = line.strip().decode("utf-8").split()
for item in words:
self.word2pinyins[item].append(pinyin)
self.word_to_pinyins[item].append(pinyin)

self.word2pinyin = {}
for l in open(join(BASE, "data/word.dic")):
word, pinyin = l.strip().decode('utf-8').split(',')
self.word2pinyin[word] = pinyin
# key -> single pinyin, read from comma-separated "key,pinyin" lines.
self.word_to_pinyin = {}
for line in open(FILE_WORD):
word, pinyin = line.strip().decode("utf-8").split(",")
self.word_to_pinyin[word] = pinyin

self.term2pinyin = {}
for line in open(join(BASE, "data/term.dic")):
term, pinyin = line.strip().decode('utf-8').split('#')
self.term2pinyin[term] = pinyin.split('@')
# term -> list of pinyin; lines look like "term#p1@p2@...".
self.term_to_pinyin = {}
for line in open(FILE_TERM):
term, pinyin = line.strip().decode("utf-8").split("#")
self.term_to_pinyin[term] = pinyin.split("@")

jieba.initialize()
jieba.load_userdict(join(BASE, "data/user_dict.dic"))
jieba.load_userdict(FILE_USER_DICT)

# Return the list of pinyin for one segmented term.
# A hit in the term table wins; otherwise each element of `term` is looked up
# individually, and an element with no entry is kept as-is when `failure` is
# None or replaced by `failure` otherwise.
def _pinyin(self, term):
pinyin_list = self.term2pinyin.get(term, None)
def _pinyin(self, term, failure=None):
pinyin_list = self.term_to_pinyin.get(term, None)
# `not pinyin_list` also covers an empty list stored in the table.
if not pinyin_list:
pinyin_list = [self.word2pinyin.get(word, word) for word in term]
pinyin_list = [self.word_to_pinyin.get(word, word if failure is None else failure) for word in term]
return pinyin_list

# Segment `text` with jieba (cut_all=False) and return one flat list of pinyin.
# Terms that do not match CHINESE_RE are passed through unchanged when
# `failure` is None, or replaced by `failure` otherwise.
def get_pinyin(self, text):
def get_pinyin(self, text, failure=None):
term_list = jieba.cut(text, cut_all=False)
pinyin_list_iter = (self._pinyin(term) if ZH_CN_RE.match(term) else [term] for term in term_list)
pinyin_list_iter = (
self._pinyin(term, failure) if CHINESE_RE.match(term) else [term if failure is None else failure]
for term in term_list
)
# Flatten the per-term lists into a single list.
return [pinyin for pinyin_list in pinyin_list_iter for pinyin in pinyin_list]

# Return an iterator over every combination of per-character readings
# (the Cartesian product), limited to the first `max_return` combinations
# when `max_return` is not None. Characters with no entry contribute a
# single candidate: themselves, or `failure` when it is given.
def get_pinyin_all(self, text, max_return=None):
def get_pinyin_all(self, text, max_return=None, failure=None):
# Byte strings are decoded as UTF-8, dropping undecodable bytes.
if not isinstance(text, unicode):
text = text.decode('utf-8', 'ignore')
rs = [self.word2pinyins.get(word, [word]) for word in text]
text = text.decode("utf-8", "ignore")
# .get() is used so that misses do not insert keys into the defaultdict.
rs = [self.word_to_pinyins.get(word, [word if failure is None else failure]) for word in text]
pinyin_all_iter = product(*rs)
if max_return is not None:
pinyin_all_iter = islice(pinyin_all_iter, 0, max_return)
return pinyin_all_iter



# Demo run: convert a sample sentence and print the results (Python 2 print
# statements; a trailing comma keeps the output on the same line).
if __name__ == "__main__":
pinyin = Pinyin()
text = "银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深"

# Single best reading, one item per printed token.
print "# get_pinyin(%s)" % text
for i in pinyin.get_pinyin(text):
print i,
print "\n"

# At most eight of the candidate reading combinations.
print "# get_pinyin_all(%s)" % text
for i in pinyin.get_pinyin_all(text, max_return=8):
print ' '.join(i),
print "\n"
14 changes: 14 additions & 0 deletions pinyin/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

import re
from os.path import join, abspath, dirname


# Directory that contains this module; the bundled dictionaries live under
# its "data" sub-directory.
DB_DIR = abspath(dirname(__file__))

# Matches a non-empty string made up solely of characters in U+4E00..U+9FA5.
# Spelled with the plain `u` prefix: the `ur` prefix is a syntax error on
# Python 3, while `u'\u4e00'` produces the very same pattern on Python 2 and
# on Python 3.3+ (the pattern contains no other backslashes).
CHINESE_RE = re.compile(u'^[\u4e00-\u9fa5]+$')

# Dictionary files loaded by the converter.
FILE_WORDS = join(DB_DIR, "data/words.dic")          # pinyin -> characters
FILE_WORD = join(DB_DIR, "data/word.dic")            # "key,pinyin" lines
FILE_TERM = join(DB_DIR, "data/term.dic")            # "term#p1@p2..." lines
FILE_USER_DICT = join(DB_DIR, "data/user_dict.dic")  # passed to jieba.load_userdict
11 changes: 11 additions & 0 deletions pinyin/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-


class Singleton(type):
    """Metaclass that builds each of its classes at most once.

    The first call to a class constructs the instance with the supplied
    arguments and remembers it; every later call returns that same object
    and ignores its arguments.
    """

    # Shared registry: class object -> its sole instance.
    _instances = {}

    def __call__(cls, *args, **kwargs):
        registry = cls._instances
        if cls in registry:
            return registry[cls]
        created = super(Singleton, cls).__call__(*args, **kwargs)
        registry[cls] = created
        return created
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

setup(
name='smart_pinyin',
version='0.3.0',
description='Smart Chinese-to-Pinyin converter.)',
version='0.3.1',
description='Smart Chinese-to-Pinyin converter.',
author='mapix',
author_email='mapix.me@gmail.com',
url='https://github.com/mapix/pinyin',
Expand Down

0 comments on commit 09ddda1

Please sign in to comment.