From 242938110e9c51271883c9b2331869c8e487ac8a Mon Sep 17 00:00:00 2001 From: TensorFlow Datasets Team Date: Tue, 30 Apr 2019 22:03:08 -0700 Subject: [PATCH] Remove unnecessary <s> and </s> tokens. And add a test. PiperOrigin-RevId: 246080224 --- tensorflow_datasets/text/cnn_dailymail.py | 15 ++++----- .../text/cnn_dailymail_test.py | 32 +++++++++++++++++++ 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/tensorflow_datasets/text/cnn_dailymail.py b/tensorflow_datasets/text/cnn_dailymail.py index 310f929b97f..93442244013 100644 --- a/tensorflow_datasets/text/cnn_dailymail.py +++ b/tensorflow_datasets/text/cnn_dailymail.py @@ -145,8 +145,6 @@ def _subset_filenames(dl_paths, split): DM_SINGLE_CLOSE_QUOTE = u'\u2019' # unicode DM_DOUBLE_CLOSE_QUOTE = u'\u201d' -SENTENCE_START = '' -SENTENCE_END = '' # acceptable ways to end a sentence END_TOKENS = ['.', '!', '?', '...', "'", '`', '"', DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ')'] @@ -201,23 +199,23 @@ def fix_missing_period(line): # Make abstract into a single string, putting <s> and </s> tags around # the sentences. - abstract = ' '.join(['%s %s %s' % (SENTENCE_START, sent, - SENTENCE_END) for sent in highlights]) + abstract = ' '.join(highlights) return article, abstract class CnnDailymail(tfds.core.GeneratorBasedBuilder): """CNN/DailyMail non-anonymized summarization dataset.""" + # 0.0.2 is like 0.0.1 but without special tokens <s> and </s>.
BUILDER_CONFIGS = [ CnnDailymailConfig( name='plain_text', - version='0.0.1', + version='0.0.2', description='Plain text', ), CnnDailymailConfig( name='bytes', - version='0.0.1', + version='0.0.2', description=('Uses byte-level text encoding with ' '`tfds.features.text.ByteTextEncoder`'), text_encoder_config=tfds.features.text.TextEncoderConfig( @@ -225,7 +223,7 @@ class CnnDailymail(tfds.core.GeneratorBasedBuilder): ), CnnDailymailConfig( name='subwords32k', - version='0.0.1', + version='0.0.2', description=('Uses `tfds.features.text.SubwordTextEncoder` with ' '32k vocab size'), text_encoder_config=tfds.features.text.TextEncoderConfig( @@ -260,8 +258,7 @@ def _split_generators(self, dl_manager): # Generate shared vocabulary # maybe_build_from_corpus uses SubwordTextEncoder if that's configured self.info.features[_ARTICLE].maybe_build_from_corpus( - self._vocab_text_gen(train_files), - reserved_tokens=[SENTENCE_START, SENTENCE_END]) + self._vocab_text_gen(train_files)) encoder = self.info.features[_ARTICLE].encoder # Use maybe_set_encoder because the encoder may have been restored from # package data. diff --git a/tensorflow_datasets/text/cnn_dailymail_test.py b/tensorflow_datasets/text/cnn_dailymail_test.py index ea0677748f2..a4c4240ca83 100644 --- a/tensorflow_datasets/text/cnn_dailymail_test.py +++ b/tensorflow_datasets/text/cnn_dailymail_test.py @@ -18,10 +18,29 @@ from __future__ import division from __future__ import print_function +import tempfile + import tensorflow_datasets.testing as tfds_test from tensorflow_datasets.text import cnn_dailymail +_STORY_FILE = b"""Some article. +This is some article text. 
+ +@highlight + +highlight text + +@highlight + +Highlight two + +@highlight + +highlight Three +""" + + class CnnDailymailTest(tfds_test.DatasetBuilderTestCase): DATASET_CLASS = cnn_dailymail.CnnDailymail SPLITS = { @@ -35,6 +54,19 @@ class CnnDailymailTest(tfds_test.DatasetBuilderTestCase): 'train_urls': 'all_train.txt', 'val_urls': 'all_val.txt'} + def test_get_art_abs(self): + with tempfile.NamedTemporaryFile(delete=True) as f: + f.write(_STORY_FILE) + f.flush() + article, abstract = cnn_dailymail._get_art_abs(f.name) + self.assertEqual('some article. this is some article text.', + article) + # This is a bit weird, but the original code at + # https://github.com/abisee/cnn-dailymail/ adds space before period + # for abstracts and we retain this behavior. + self.assertEqual('highlight text . highlight two . highlight three .', + abstract) + if __name__ == '__main__': tfds_test.test_main()