Skip to content

Commit b190ef0

Browse files
cclauss authored and kpe committed
Add to_unicode_utf8() to text_encoder.py (tensorflow#1321)
1 parent 20ac4f2 commit b190ef0

File tree

13 files changed

+17
-63
lines changed

13 files changed

+17
-63
lines changed

tensor2tensor/data_generators/cnn_dailymail.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,6 @@
2424
import os
2525
import random
2626
import tarfile
27-
import six
2827
from tensor2tensor.data_generators import generator_utils
2928
from tensor2tensor.data_generators import problem
3029
from tensor2tensor.data_generators import text_encoder
@@ -157,10 +156,7 @@ def fix_run_on_sents(line):
157156
summary = []
158157
reading_highlights = False
159158
for line in tf.gfile.Open(story_file, "rb"):
160-
if six.PY2:
161-
line = unicode(line.strip(), "utf-8")
162-
else:
163-
line = line.strip().decode("utf-8")
159+
line = text_encoder.to_unicode_utf8(line.strip())
164160
line = fix_run_on_sents(line)
165161
if not line:
166162
continue

tensor2tensor/data_generators/cola.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import problem
2726
from tensor2tensor.data_generators import text_encoder
@@ -83,10 +82,7 @@ def _maybe_download_corpora(self, tmp_dir):
8382

8483
def example_generator(self, filename):
8584
for line in tf.gfile.Open(filename, "rb"):
86-
if six.PY2:
87-
line = unicode(line.strip(), "utf-8")
88-
else:
89-
line = line.strip().decode("utf-8")
85+
line = text_encoder.to_unicode_utf8(line.strip())
9086
_, label, _, sent = line.split("\t")
9187
yield {
9288
"inputs": sent,

tensor2tensor/data_generators/mrpc.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from __future__ import print_function
2121

2222
import os
23-
import six
2423
from tensor2tensor.data_generators import generator_utils
2524
from tensor2tensor.data_generators import problem
2625
from tensor2tensor.data_generators import text_encoder
@@ -95,10 +94,7 @@ def download_file(tdir, filepath, url):
9594
def example_generator(self, filename, dev_ids, dataset_split):
9695
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
9796
if idx == 0: continue # skip header
98-
if six.PY2:
99-
line = unicode(line.strip(), "utf-8")
100-
else:
101-
line = line.strip().decode("utf-8")
97+
line = text_encoder.to_unicode_utf8(line.strip())
10298
l, id1, id2, s1, s2 = line.split("\t")
10399
is_dev = [id1, id2] in dev_ids
104100
if dataset_split == problem.DatasetSplit.TRAIN and is_dev:

tensor2tensor/data_generators/multinli.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import lm1b
2726
from tensor2tensor.data_generators import problem
@@ -87,10 +86,7 @@ def example_generator(self, filename):
8786
label_list = self.class_labels(data_dir=None)
8887
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
8988
if idx == 0: continue # skip header
90-
if six.PY2:
91-
line = unicode(line.strip(), "utf-8")
92-
else:
93-
line = line.strip().decode("utf-8")
89+
line = text_encoder.to_unicode_utf8(line.strip())
9490
split_line = line.split("\t")
9591
# Works for both splits even though dev has some extra human labels.
9692
s1, s2 = split_line[8:10]

tensor2tensor/data_generators/qnli.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import problem
2726
from tensor2tensor.data_generators import text_encoder
@@ -85,10 +84,7 @@ def example_generator(self, filename):
8584
label_list = self.class_labels(data_dir=None)
8685
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
8786
if idx == 0: continue # skip header
88-
if six.PY2:
89-
line = unicode(line.strip(), "utf-8")
90-
else:
91-
line = line.strip().decode("utf-8")
87+
line = text_encoder.to_unicode_utf8(line.strip())
9288
_, s1, s2, l = line.split("\t")
9389
inputs = [s1, s2]
9490
l = label_list.index(l)

tensor2tensor/data_generators/quora_qpairs.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import problem
2726
from tensor2tensor.data_generators import text_encoder
@@ -84,10 +83,7 @@ def example_generator(self, filename):
8483
skipped = 0
8584
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
8685
if idx == 0: continue # skip header
87-
if six.PY2:
88-
line = unicode(line.strip(), "utf-8")
89-
else:
90-
line = line.strip().decode("utf-8")
86+
line = text_encoder.to_unicode_utf8(line.strip())
9187
split_line = line.split("\t")
9288
if len(split_line) < 6:
9389
skipped += 1

tensor2tensor/data_generators/rte.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import problem
2726
from tensor2tensor.data_generators import text_encoder
@@ -85,10 +84,7 @@ def example_generator(self, filename):
8584
label_list = self.class_labels(data_dir=None)
8685
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
8786
if idx == 0: continue # skip header
88-
if six.PY2:
89-
line = unicode(line.strip(), "utf-8")
90-
else:
91-
line = line.strip().decode("utf-8")
87+
line = text_encoder.to_unicode_utf8(line.strip())
9288
_, s1, s2, l = line.split("\t")
9389
inputs = [s1, s2]
9490
l = label_list.index(l)

tensor2tensor/data_generators/scitail.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import lm1b
2726
from tensor2tensor.data_generators import problem
@@ -83,10 +82,7 @@ def _maybe_download_corpora(self, tmp_dir):
8382
def example_generator(self, filename):
8483
label_list = self.class_labels(data_dir=None)
8584
for line in tf.gfile.Open(filename, "rb"):
86-
if six.PY2:
87-
line = unicode(line.strip(), "utf-8")
88-
else:
89-
line = line.strip().decode("utf-8")
85+
line = text_encoder.to_unicode_utf8(line.strip())
9086
split_line = line.split("\t")
9187
s1, s2 = split_line[:2]
9288
l = label_list.index(split_line[2])

tensor2tensor/data_generators/sst_binary.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import problem
2726
from tensor2tensor.data_generators import text_encoder
@@ -84,10 +83,7 @@ def _maybe_download_corpora(self, tmp_dir):
8483
def example_generator(self, filename):
8584
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
8685
if idx == 0: continue # skip header
87-
if six.PY2:
88-
line = unicode(line.strip(), "utf-8")
89-
else:
90-
line = line.strip().decode("utf-8")
86+
line = text_encoder.to_unicode_utf8(line.strip())
9187
sent, label = line.split("\t")
9288
yield {
9389
"inputs": sent,

tensor2tensor/data_generators/stanford_nli.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import lm1b
2726
from tensor2tensor.data_generators import problem
@@ -84,10 +83,7 @@ def example_generator(self, filename):
8483
label_list = self.class_labels(data_dir=None)
8584
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
8685
if idx == 0: continue # skip header
87-
if six.PY2:
88-
line = unicode(line.strip(), "utf-8")
89-
else:
90-
line = line.strip().decode("utf-8")
86+
line = text_encoder.to_unicode_utf8(line.strip())
9187
split_line = line.split("\t")
9288
# Works for both splits even though dev has some extra human labels.
9389
s1, s2 = split_line[5:7]

tensor2tensor/data_generators/text_encoder.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ def to_unicode_ignore_errors(s):
9898
return to_unicode(s, ignore_errors=True)
9999

100100

101+
def to_unicode_utf8(s):
102+
return unicode(s, "utf-8") if six.PY2 else s.decode("utf-8")
103+
104+
101105
def strip_ids(ids, ids_to_strip):
102106
"""Strip ids_to_strip from the end ids."""
103107
ids = list(ids)

tensor2tensor/data_generators/wiki_revision_utils.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,12 @@
2727
import re
2828
import subprocess
2929

30-
import six
31-
3230
from tensor2tensor.data_generators import generator_utils
3331
from tensor2tensor.data_generators import text_encoder
3432

3533
import tensorflow as tf
3634

3735

38-
def to_unicode(s):
39-
return unicode(s, "utf-8") if six.PY2 else s.decode("utf-8")
40-
41-
4236
def include_revision(revision_num, skip_factor=1.1):
4337
"""Decide whether to include a revision.
4438
@@ -118,7 +112,7 @@ def get_title(page):
118112
assert start_pos != -1
119113
assert end_pos != -1
120114
start_pos += len("<title>")
121-
return to_unicode(page[start_pos:end_pos])
115+
return text_encoder.to_unicode_utf8(page[start_pos:end_pos])
122116

123117

124118
def get_id(page):
@@ -257,7 +251,7 @@ def get_text(revision, strip=True):
257251
ret = revision[end_tag_pos:end_pos]
258252
if strip:
259253
ret = strip_text(ret)
260-
ret = to_unicode(ret)
254+
ret = text_encoder.to_unicode_utf8(ret)
261255
return ret
262256

263257

tensor2tensor/data_generators/wnli.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import os
2323
import zipfile
24-
import six
2524
from tensor2tensor.data_generators import generator_utils
2625
from tensor2tensor.data_generators import problem
2726
from tensor2tensor.data_generators import text_encoder
@@ -88,10 +87,7 @@ def _maybe_download_corpora(self, tmp_dir):
8887
def example_generator(self, filename):
8988
for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
9089
if idx == 0: continue # skip header
91-
if six.PY2:
92-
line = unicode(line.strip(), "utf-8")
93-
else:
94-
line = line.strip().decode("utf-8")
90+
line = text_encoder.to_unicode_utf8(line.strip())
9591
_, s1, s2, l = line.split("\t")
9692
inputs = [s1, s2]
9793
yield {

0 commit comments

Comments
 (0)