This repository has been archived by the owner on Feb 9, 2023. It is now read-only.
Showing 5 changed files with 71 additions and 50 deletions.
@@ -1,17 +1,26 @@
Pinyin
======
------

Smart Chinese-to-Pinyin converter.

Install
=======

Getting Started
---------------

pip install smart_pinyin

Usage
=====
-----

>>> from pinyin import Pinyin
>>> pinyin = Pinyin()
>>> " ".join(pinyin.get_pinyin("银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深"))
u'yin hang hang zhang pan wei bo zhang le yi tou wu hei de bai fa \uff0c shui jiao shui de hen wan , dao heng hen shen'
>>> ' '.join(pinyin.get_pinyin('银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深', failure=''))
u'yin hang hang zhang pan wei bo zhang le yi tou wu hei de bai fa \uff0c shui jiao shui de hen wan , dao heng hen shen'

>>> for i in pinyin.get_pinyin_all('自行车'): print list(i)
[u'zi', u'hang', u'che']
[u'zi', u'hang', u'ju']
[u'zi', u'heng', u'che']
[u'zi', u'heng', u'ju']
[u'zi', u'xing', u'che']
[u'zi', u'xing', u'ju']
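
get_pinyin_all enumerates every combination of candidate readings for the input, so the result can grow quickly for ambiguous text; the implementation below accepts a max_return argument to cap the enumeration. A hedged sketch, assuming the candidate ordering shown in the listing above:

>>> for i in pinyin.get_pinyin_all('自行车', max_return=2): print list(i)
[u'zi', u'hang', u'che']
[u'zi', u'hang', u'ju']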
@@ -1,72 +1,59 @@
# -*- coding: utf-8 -*-

import re
import jieba
from collections import defaultdict
from itertools import product, islice
from os.path import join, abspath, dirname

__all__ = ["Pinyin"]
from pinyin.config import FILE_WORDS, FILE_WORD, FILE_TERM, FILE_USER_DICT, CHINESE_RE
from pinyin.utils import Singleton

BASE = abspath(dirname(__file__))
ZH_CN_RE = re.compile(ur'^[\u4e00-\u9fa5]+$')
__all__ = ["Pinyin"]


class Pinyin(object):

    __metaclass__ = Singleton

    def __init__(self):

        self.word2pinyins = defaultdict(list)
        for line in open(join(BASE, "data/words.dic")):
            pinyin, words = line.strip().decode('utf-8').split(" ", 1)
        self.word_to_pinyins = defaultdict(list)
        for line in open(FILE_WORDS):
            pinyin, words = line.strip().decode("utf-8").split()
            for item in words:
                self.word2pinyins[item].append(pinyin)
                self.word_to_pinyins[item].append(pinyin)

        self.word2pinyin = {}
        for l in open(join(BASE, "data/word.dic")):
            word, pinyin = l.strip().decode('utf-8').split(',')
            self.word2pinyin[word] = pinyin
        self.word_to_pinyin = {}
        for line in open(FILE_WORD):
            word, pinyin = line.strip().decode("utf-8").split(",")
            self.word_to_pinyin[word] = pinyin

        self.term2pinyin = {}
        for line in open(join(BASE, "data/term.dic")):
            term, pinyin = line.strip().decode('utf-8').split('#')
            self.term2pinyin[term] = pinyin.split('@')
        self.term_to_pinyin = {}
        for line in open(FILE_TERM):
            term, pinyin = line.strip().decode("utf-8").split("#")
            self.term_to_pinyin[term] = pinyin.split("@")

        jieba.initialize()
        jieba.load_userdict(join(BASE, "data/user_dict.dic"))
        jieba.load_userdict(FILE_USER_DICT)

    def _pinyin(self, term):
        pinyin_list = self.term2pinyin.get(term, None)
    def _pinyin(self, term, failure=None):
        pinyin_list = self.term_to_pinyin.get(term, None)
        if not pinyin_list:
            pinyin_list = [self.word2pinyin.get(word, word) for word in term]
            pinyin_list = [self.word_to_pinyin.get(word, word if failure is None else failure) for word in term]
        return pinyin_list

    def get_pinyin(self, text):
    def get_pinyin(self, text, failure=None):
        term_list = jieba.cut(text, cut_all=False)
        pinyin_list_iter = (self._pinyin(term) if ZH_CN_RE.match(term) else [term] for term in term_list)
        pinyin_list_iter = (
            self._pinyin(term, failure) if CHINESE_RE.match(term) else [term if failure is None else failure]
            for term in term_list
        )
        return [pinyin for pinyin_list in pinyin_list_iter for pinyin in pinyin_list]

    def get_pinyin_all(self, text, max_return=None):
    def get_pinyin_all(self, text, max_return=None, failure=None):
        if not isinstance(text, unicode):
            text = text.decode('utf-8', 'ignore')
        rs = [self.word2pinyins.get(word, [word]) for word in text]
            text = text.decode("utf-8", "ignore")
        rs = [self.word_to_pinyins.get(word, [word if failure is None else failure]) for word in text]
        pinyin_all_iter = product(*rs)
        if max_return is not None:
            pinyin_all_iter = islice(pinyin_all_iter, 0, max_return)
        return pinyin_all_iter


if __name__ == "__main__":
    pinyin = Pinyin()
    text = "银行行长潘玮柏长了一头乌黑的白发, 睡觉睡的很晚, 道行很深"

    print "# get_pinyin(%s)" % text
    for i in pinyin.get_pinyin(text):
        print i,
    print "\n"

    print "# get_pinyin_all(%s)" % text
    for i in pinyin.get_pinyin_all(text, max_return=8):
        print ' '.join(i),
    print "\n"
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

import re
from os.path import join, abspath, dirname


DB_DIR = abspath(dirname(__file__))

CHINESE_RE = re.compile(ur'^[\u4e00-\u9fa5]+$')

FILE_WORDS = join(DB_DIR, "data/words.dic")
FILE_WORD = join(DB_DIR, "data/word.dic")
FILE_TERM = join(DB_DIR, "data/term.dic")
FILE_USER_DICT = join(DB_DIR, "data/user_dict.dic")
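
CHINESE_RE is anchored on both ends and only matches strings made up entirely of characters in the CJK Unified Ideographs range (U+4E00 to U+9FA5); get_pinyin uses it to decide whether a segmented term should be converted or passed through. A quick illustration, assuming the module is importable as pinyin.config:

# -*- coding: utf-8 -*-
from pinyin.config import CHINESE_RE

print bool(CHINESE_RE.match(u"自行车"))      # True: every character is a CJK ideograph
print bool(CHINESE_RE.match(u"pinyin"))      # False: ASCII only
print bool(CHINESE_RE.match(u"道行, 很深"))  # False: punctuation breaks the anchored match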
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-


class Singleton(type):

    _instances = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._instances[cls]
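
Because Singleton is a metaclass, any class that declares it via __metaclass__ (as Pinyin does above) is constructed once per process and reused afterwards, which avoids reloading the dictionaries on every Pinyin() call. A small illustration with a hypothetical Demo class, assuming the Python 2 metaclass syntax used in this commit:

from pinyin.utils import Singleton


class Demo(object):
    __metaclass__ = Singleton  # route construction through Singleton.__call__

    def __init__(self):
        print "initialising"  # runs only for the first Demo() call


a = Demo()
b = Demo()
print a is b  # True: the second call returns the cached instance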