Skip to content

Commit

Permalink
Remove unnecessary <s> and </s> tokens. And add a test.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 246080224
  • Loading branch information
TensorFlow Datasets Team authored and copybara-github committed May 1, 2019
1 parent dbda817 commit 2429381
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 9 deletions.
15 changes: 6 additions & 9 deletions tensorflow_datasets/text/cnn_dailymail.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,6 @@ def _subset_filenames(dl_paths, split):

DM_SINGLE_CLOSE_QUOTE = u'\u2019' # unicode
DM_DOUBLE_CLOSE_QUOTE = u'\u201d'
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
# acceptable ways to end a sentence
END_TOKENS = ['.', '!', '?', '...', "'", '`', '"',
DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ')']
Expand Down Expand Up @@ -201,31 +199,31 @@ def fix_missing_period(line):

# Make abstract into a single string, putting <s> and </s> tags around
# the sentences.
abstract = ' '.join(['%s %s %s' % (SENTENCE_START, sent,
SENTENCE_END) for sent in highlights])
abstract = ' '.join(highlights)

return article, abstract


class CnnDailymail(tfds.core.GeneratorBasedBuilder):
"""CNN/DailyMail non-anonymized summarization dataset."""
# 0.0.2 is like 0.0.1 but without special tokens <s> and </s>.
BUILDER_CONFIGS = [
CnnDailymailConfig(
name='plain_text',
version='0.0.1',
version='0.0.2',
description='Plain text',
),
CnnDailymailConfig(
name='bytes',
version='0.0.1',
version='0.0.2',
description=('Uses byte-level text encoding with '
'`tfds.features.text.ByteTextEncoder`'),
text_encoder_config=tfds.features.text.TextEncoderConfig(
encoder=tfds.features.text.ByteTextEncoder()),
),
CnnDailymailConfig(
name='subwords32k',
version='0.0.1',
version='0.0.2',
description=('Uses `tfds.features.text.SubwordTextEncoder` with '
'32k vocab size'),
text_encoder_config=tfds.features.text.TextEncoderConfig(
Expand Down Expand Up @@ -260,8 +258,7 @@ def _split_generators(self, dl_manager):
# Generate shared vocabulary
# maybe_build_from_corpus uses SubwordTextEncoder if that's configured
self.info.features[_ARTICLE].maybe_build_from_corpus(
self._vocab_text_gen(train_files),
reserved_tokens=[SENTENCE_START, SENTENCE_END])
self._vocab_text_gen(train_files))
encoder = self.info.features[_ARTICLE].encoder
# Use maybe_set_encoder because the encoder may have been restored from
# package data.
Expand Down
32 changes: 32 additions & 0 deletions tensorflow_datasets/text/cnn_dailymail_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,29 @@
from __future__ import division
from __future__ import print_function

import tempfile

import tensorflow_datasets.testing as tfds_test
from tensorflow_datasets.text import cnn_dailymail


_STORY_FILE = b"""Some article.
This is some article text.
@highlight
highlight text
@highlight
Highlight two
@highlight
highlight Three
"""


class CnnDailymailTest(tfds_test.DatasetBuilderTestCase):
DATASET_CLASS = cnn_dailymail.CnnDailymail
SPLITS = {
Expand All @@ -35,6 +54,19 @@ class CnnDailymailTest(tfds_test.DatasetBuilderTestCase):
'train_urls': 'all_train.txt',
'val_urls': 'all_val.txt'}

def test_get_art_abs(self):
with tempfile.NamedTemporaryFile(delete=True) as f:
f.write(_STORY_FILE)
f.flush()
article, abstract = cnn_dailymail._get_art_abs(f.name)
self.assertEqual('some article. this is some article text.',
article)
# This is a bit weird, but the original code at
# https://github.com/abisee/cnn-dailymail/ adds space before period
# for abstracts and we retain this behavior.
self.assertEqual('highlight text . highlight two . highlight three .',
abstract)


if __name__ == '__main__':
tfds_test.test_main()

0 comments on commit 2429381

Please sign in to comment.