-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
executable file
·84 lines (77 loc) · 2.88 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import contextlib, sys
class DummyFile(object):
def write(self, x): pass
@contextlib.contextmanager
def nostdout():
save_stdout = sys.stdout
sys.stdout = DummyFile()
yield
sys.stdout = save_stdout
def reporthook(t):
"""https://github.com/tqdm/tqdm"""
last_b = [0]
def inner(b=1, bsize=1, tsize=None):
"""
b: int, optionala
Number of blocks just transferred [default: 1].
bsize: int, optional
Size of each block (in tqdm units) [default: 1].
tsize: int, optional
Total size (in tqdm units). If [default: None] remains unchanged.
"""
if tsize is not None:
t.total = tsize
t.update((b - last_b[0]) * bsize)
last_b[0] = b
return inner
def get_tokenizer(tokenizer):
if callable(tokenizer):
return tokenizer
if tokenizer == "spacy":
try:
import spacy
spacy_en = spacy.load('en_core_web_sm')
print("now we are loading the spacy tokenizer")
return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
except ImportError:
print("Please install SpaCy and the SpaCy English tokenizer. "
"See the docs at https://spacy.io for more information.")
raise
except AttributeError:
print("Please install SpaCy and the SpaCy English tokenizer. "
"See the docs at https://spacy.io for more information.")
raise
elif tokenizer == "moses":
try:
from nltk.tokenize.moses import MosesTokenizer
moses_tokenizer = MosesTokenizer()
return moses_tokenizer.tokenize
except ImportError:
print("Please install NLTK. "
"See the docs at http://nltk.org for more information.")
raise
except LookupError:
print("Please install the necessary NLTK corpora. "
"See the docs at http://nltk.org for more information.")
raise
elif tokenizer == 'revtok':
try:
import revtok
return revtok.tokenize
except ImportError:
print("Please install revtok.")
raise
elif tokenizer == 'subword':
try:
import revtok
return lambda x: revtok.tokenize(x, decap=True)
except ImportError:
print("Please install revtok.")
raise
raise ValueError("Requested tokenizer {}, valid choices are a "
"callable that takes a single string as input, "
"\"revtok\" for the revtok reversible tokenizer, "
"\"subword\" for the revtok caps-aware tokenizer, "
"\"spacy\" for the SpaCy English tokenizer, or "
"\"moses\" for the NLTK port of the Moses tokenization "
"script.".format(tokenizer))