diff --git a/jieba/__init__.py b/jieba/__init__.py index 17358a6c..079e193c 100644 --- a/jieba/__init__.py +++ b/jieba/__init__.py @@ -20,11 +20,10 @@ else: _replace_file = os.rename -_get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), - os.path.dirname(__file__), path)) _get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path)) -DEFAULT_DICT = _get_module_path("dict.txt") +DEFAULT_DICT = None +DEFAULT_DICT_NAME = "dict.txt" log_console = logging.StreamHandler(sys.stderr) default_logger = logging.getLogger(__name__) @@ -54,7 +53,10 @@ class Tokenizer(object): def __init__(self, dictionary=DEFAULT_DICT): self.lock = threading.RLock() - self.dictionary = _get_abs_path(dictionary) + if dictionary == DEFAULT_DICT: + self.dictionary = dictionary + else: + self.dictionary = _get_abs_path(dictionary) self.FREQ = {} self.total = 0 self.user_word_tag_tab = {} @@ -65,10 +67,11 @@ def __init__(self, dictionary=DEFAULT_DICT): def __repr__(self): return '' % self.dictionary - def gen_pfdict(self, f_name): + def gen_pfdict(self, f): lfreq = {} ltotal = 0 - with open(f_name, 'rb') as f: + f_name = resolve_filename(f) + with f: for lineno, line in enumerate(f, 1): try: line = line.strip().decode('utf-8') @@ -105,7 +108,7 @@ def initialize(self, dictionary=None): if self.initialized: return - default_logger.debug("Building prefix dict from %s ..." % abs_path) + default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary')) t1 = time.time() if self.cache_file: cache_file = self.cache_file @@ -122,7 +125,8 @@ def initialize(self, dictionary=None): tmpdir = os.path.dirname(cache_file) load_from_cache_fail = True - if os.path.isfile(cache_file) and os.path.getmtime(cache_file) > os.path.getmtime(abs_path): + if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or + os.path.getmtime(cache_file) > os.path.getmtime(abs_path)): default_logger.debug( "Loading model from cache %s" % cache_file) try: @@ -136,7 +140,7 @@ def initialize(self, dictionary=None): wlock = DICT_WRITING.get(abs_path, threading.RLock()) DICT_WRITING[abs_path] = wlock with wlock: - self.FREQ, self.total = self.gen_pfdict(abs_path) + self.FREQ, self.total = self.gen_pfdict(self.get_dict_file()) default_logger.debug( "Dumping model to file cache %s" % cache_file) try: @@ -343,8 +347,11 @@ def _lcut_all(self, sentence): def _lcut_for_search_no_hmm(self, sentence): return self.lcut_for_search(sentence, False) - def get_abs_path_dict(self): - return _get_abs_path(self.dictionary) + def get_dict_file(self): + if self.dictionary == DEFAULT_DICT: + return get_module_res(DEFAULT_DICT_NAME) + else: + return open(self.dictionary, 'rb') def load_userdict(self, f): ''' @@ -363,14 +370,17 @@ def load_userdict(self, f): ''' self.check_initialized() if isinstance(f, string_types): + f_name = f f = open(f, 'rb') + else: + f_name = resolve_filename(f) for lineno, ln in enumerate(f, 1): line = ln.strip() if not isinstance(line, text_type): try: line = line.decode('utf-8').lstrip('\ufeff') except UnicodeDecodeError: - raise ValueError('dictionary file %s must be utf-8' % f.name) + raise ValueError('dictionary file %s must be utf-8' % f_name) if not line: continue # match won't be None because there's at least one character @@ -494,7 +504,7 @@ def set_dictionary(self, dictionary_path): lcut_for_search = dt.lcut_for_search del_word = dt.del_word get_DAG = dt.get_DAG -get_abs_path_dict = dt.get_abs_path_dict +get_dict_file = dt.get_dict_file initialize = dt.initialize load_userdict = dt.load_userdict set_dictionary = dt.set_dictionary diff --git a/jieba/_compat.py b/jieba/_compat.py index 018bec6e..ef1812ad 100644 --- a/jieba/_compat.py +++ b/jieba/_compat.py @@ -1,6 +1,15 @@ # -*- coding: utf-8 -*- +import os import sys +try: + import pkg_resources + get_module_res = lambda *res: pkg_resources.resource_stream(__name__, + os.path.join(*res)) +except ImportError: + get_module_res = lambda *res: open(os.path.normpath(os.path.join( + os.getcwd(), os.path.dirname(__file__), *res)), 'rb') + PY2 = sys.version_info[0] == 2 default_encoding = sys.getfilesystemencoding() @@ -29,3 +38,9 @@ def strdecode(sentence): except UnicodeDecodeError: sentence = sentence.decode('gbk', 'ignore') return sentence + +def resolve_filename(f): + try: + return f.name + except AttributeError: + return repr(f) diff --git a/jieba/finalseg/__init__.py b/jieba/finalseg/__init__.py index 95bb718c..5d4a968c 100644 --- a/jieba/finalseg/__init__.py +++ b/jieba/finalseg/__init__.py @@ -1,8 +1,8 @@ from __future__ import absolute_import, unicode_literals import re import os -import marshal import sys +import pickle from .._compat import * MIN_FLOAT = -3.14e100 @@ -21,24 +21,9 @@ def load_model(): - _curpath = os.path.normpath( - os.path.join(os.getcwd(), os.path.dirname(__file__))) - - start_p = {} - abs_path = os.path.join(_curpath, PROB_START_P) - with open(abs_path, 'rb') as f: - start_p = marshal.load(f) - - trans_p = {} - abs_path = os.path.join(_curpath, PROB_TRANS_P) - with open(abs_path, 'rb') as f: - trans_p = marshal.load(f) - - emit_p = {} - abs_path = os.path.join(_curpath, PROB_EMIT_P) - with open(abs_path, 'rb') as f: - emit_p = marshal.load(f) - + start_p = pickle.load(get_module_res("finalseg", PROB_START_P)) + trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P)) + emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P)) return start_p, trans_p, emit_p if sys.platform.startswith("java"): diff --git a/jieba/finalseg/prob_emit.p b/jieba/finalseg/prob_emit.p index 6552b5a4..87dbaea0 100644 Binary files a/jieba/finalseg/prob_emit.p and b/jieba/finalseg/prob_emit.p differ diff --git a/jieba/finalseg/prob_start.p b/jieba/finalseg/prob_start.p index 4f7c5edd..f0644e26 100644 Binary files a/jieba/finalseg/prob_start.p and b/jieba/finalseg/prob_start.p differ diff --git a/jieba/finalseg/prob_trans.p b/jieba/finalseg/prob_trans.p index d6d36a32..40f1b205 100644 Binary files a/jieba/finalseg/prob_trans.p and b/jieba/finalseg/prob_trans.p differ diff --git a/jieba/posseg/__init__.py b/jieba/posseg/__init__.py index 724d7e82..429ba69d 100644 --- a/jieba/posseg/__init__.py +++ b/jieba/posseg/__init__.py @@ -3,7 +3,7 @@ import re import sys import jieba -import marshal +import pickle from .._compat import * from .viterbi import viterbi @@ -23,36 +23,17 @@ re_eng1 = re.compile('^[a-zA-Z0-9]$', re.U) -def load_model(f_name): - _curpath = os.path.normpath( - os.path.join(os.getcwd(), os.path.dirname(__file__))) +def load_model(): # For Jython - start_p = {} - abs_path = os.path.join(_curpath, PROB_START_P) - with open(abs_path, 'rb') as f: - start_p = marshal.load(f) - - trans_p = {} - abs_path = os.path.join(_curpath, PROB_TRANS_P) - with open(abs_path, 'rb') as f: - trans_p = marshal.load(f) - - emit_p = {} - abs_path = os.path.join(_curpath, PROB_EMIT_P) - with open(abs_path, 'rb') as f: - emit_p = marshal.load(f) - - state = {} - abs_path = os.path.join(_curpath, CHAR_STATE_TAB_P) - with open(abs_path, 'rb') as f: - state = marshal.load(f) - f.closed - - return state, start_p, trans_p, emit_p, result + start_p = pickle.load(get_module_res("posseg", PROB_START_P)) + trans_p = pickle.load(get_module_res("posseg", PROB_TRANS_P)) + emit_p = pickle.load(get_module_res("posseg", PROB_EMIT_P)) + state = pickle.load(get_module_res("posseg", CHAR_STATE_TAB_P)) + return state, start_p, trans_p, emit_p if sys.platform.startswith("java"): - char_state_tab_P, start_P, trans_P, emit_P, word_tag_tab = load_model() + char_state_tab_P, start_P, trans_P, emit_P = load_model() else: from .char_state_tab import P as char_state_tab_P from .prob_start import P as start_P @@ -89,7 +70,7 @@ class POSTokenizer(object): def __init__(self, tokenizer=None): self.tokenizer = tokenizer or jieba.Tokenizer() - self.load_word_tag(self.tokenizer.get_abs_path_dict()) + self.load_word_tag(self.tokenizer.get_dict_file()) def __repr__(self): return '' % self.tokenizer @@ -102,11 +83,12 @@ def __getattr__(self, name): def initialize(self, dictionary=None): self.tokenizer.initialize(dictionary) - self.load_word_tag(self.tokenizer.get_abs_path_dict()) + self.load_word_tag(self.tokenizer.get_dict_file()) - def load_word_tag(self, f_name): + def load_word_tag(self, f): self.word_tag_tab = {} - with open(f_name, "rb") as f: + f_name = resolve_filename(f) + with f: for lineno, line in enumerate(f, 1): try: line = line.strip().decode("utf-8") diff --git a/jieba/posseg/char_state_tab.p b/jieba/posseg/char_state_tab.p index 600b68c9..14bc2c03 100644 Binary files a/jieba/posseg/char_state_tab.p and b/jieba/posseg/char_state_tab.p differ diff --git a/jieba/posseg/prob_emit.p b/jieba/posseg/prob_emit.p index 3bf54e8a..ce9d43e3 100644 Binary files a/jieba/posseg/prob_emit.p and b/jieba/posseg/prob_emit.p differ diff --git a/jieba/posseg/prob_start.p b/jieba/posseg/prob_start.p index b9d2f1d6..5e6f44a4 100644 Binary files a/jieba/posseg/prob_start.p and b/jieba/posseg/prob_start.p differ diff --git a/jieba/posseg/prob_trans.p b/jieba/posseg/prob_trans.p index 4f67f0c8..805fdda4 100644 Binary files a/jieba/posseg/prob_trans.p and b/jieba/posseg/prob_trans.p differ