From 563485bf95f5c9fd066f2874019ea1e08d3c9770 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Sun, 30 Aug 2020 03:19:57 -0700
Subject: [PATCH] [tests] fix typos in inputs (#6818)

---
 tests/test_tokenization_bart.py | 14 +++++++-------
 tests/test_tokenization_t5.py   | 16 ++++++++--------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tests/test_tokenization_bart.py b/tests/test_tokenization_bart.py
index 59fe1786dab7..bbd448b24ac1 100644
--- a/tests/test_tokenization_bart.py
+++ b/tests/test_tokenization_bart.py
@@ -69,12 +69,12 @@ def default_tokenizer_fast(self):
 
     @require_torch
     def test_prepare_seq2seq_batch(self):
-        src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."]
+        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
         tgt_text = [
             "Summary of the text.",
             "Another summary.",
         ]
-        expected_src_tokens = [0, 250, 251, 17818, 13, 32933, 21645, 1258, 4, 2]
+        expected_src_tokens = [0, 250, 251, 17818, 13, 39186, 1938, 4, 2]
 
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
             batch = tokenizer.prepare_seq2seq_batch(
@@ -82,8 +82,8 @@ def test_prepare_seq2seq_batch(self):
             )
             self.assertIsInstance(batch, BatchEncoding)
 
-            self.assertEqual((2, 10), batch.input_ids.shape)
-            self.assertEqual((2, 10), batch.attention_mask.shape)
+            self.assertEqual((2, 9), batch.input_ids.shape)
+            self.assertEqual((2, 9), batch.attention_mask.shape)
             result = batch.input_ids.tolist()[0]
             self.assertListEqual(expected_src_tokens, result)
             # Test that special tokens are reset
@@ -91,7 +91,7 @@ def test_prepare_seq2seq_batch(self):
     # Test Prepare Seq
     @require_torch
     def test_seq2seq_batch_empty_target_text(self):
-        src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."]
+        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
             batch = tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt")
             # check if input_ids are returned and no labels
@@ -102,7 +102,7 @@ def test_seq2seq_batch_empty_target_text(self):
 
     @require_torch
     def test_seq2seq_batch_max_target_length(self):
-        src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."]
+        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
         tgt_text = [
             "Summary of the text.",
             "Another summary.",
@@ -131,7 +131,7 @@ def test_seq2seq_batch_not_longer_than_maxlen(self):
 
     @require_torch
     def test_special_tokens(self):
-        src_text = ["A long paragraph for summrization."]
+        src_text = ["A long paragraph for summarization."]
         tgt_text = [
             "Summary of the text.",
         ]
diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py
index 130680a57d73..05424ab834da 100644
--- a/tests/test_tokenization_t5.py
+++ b/tests/test_tokenization_t5.py
@@ -120,12 +120,12 @@ def test_eos_treatment(self):
 
     def test_prepare_seq2seq_batch(self):
         tokenizer = self.t5_base_tokenizer
-        src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."]
+        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
         tgt_text = [
             "Summary of the text.",
             "Another summary.",
         ]
-        expected_src_tokens = [71, 307, 8986, 21, 4505, 51, 52, 1707, 5, tokenizer.eos_token_id]
+        expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, tokenizer.eos_token_id]
         batch = tokenizer.prepare_seq2seq_batch(
             src_text,
             tgt_texts=tgt_text,
@@ -135,15 +135,15 @@ def test_prepare_seq2seq_batch(self):
         result = list(batch.input_ids.numpy()[0])
         self.assertListEqual(expected_src_tokens, result)
-        self.assertEqual((2, 10), batch.input_ids.shape)
-        self.assertEqual((2, 10), batch.attention_mask.shape)
+        self.assertEqual((2, 9), batch.input_ids.shape)
+        self.assertEqual((2, 9), batch.attention_mask.shape)
 
         # Test that special tokens are reset
         self.assertEqual(tokenizer.prefix_tokens, [])
 
     def test_empty_target_text(self):
         tokenizer = self.t5_base_tokenizer
-        src_text = ["A long paragraph for summrization.", "Another paragraph for summrization."]
+        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
         batch = tokenizer.prepare_seq2seq_batch(src_text, return_tensors=FRAMEWORK)
         # check if input_ids are returned and no decoder_input_ids
         self.assertIn("input_ids", batch)
@@ -153,7 +153,7 @@ def test_empty_target_text(self):
 
     def test_max_target_length(self):
         tokenizer = self.t5_base_tokenizer
-        src_text = ["A short paragraph for summrization.", "Another short paragraph for summrization."]
+        src_text = ["A short paragraph for summarization.", "Another short paragraph for summarization."]
         tgt_text = [
             "Summary of the text.",
             "Another summary.",
@@ -180,9 +180,9 @@ def test_outputs_not_longer_than_maxlen(self):
 
     def test_eos_in_input(self):
         tokenizer = self.t5_base_tokenizer
-        src_text = ["A long paragraph for summrization. </s>"]
+        src_text = ["A long paragraph for summarization. </s>"]
         tgt_text = ["Summary of the text. </s>"]
-        expected_src_tokens = [71, 307, 8986, 21, 4505, 51, 52, 1707, 5, 1]
+        expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, 1]
         expected_tgt_tokens = [0, 20698, 13, 8, 1499, 5, 1]
 
         batch = tokenizer.prepare_seq2seq_batch(src_text, tgt_texts=tgt_text, return_tensors=FRAMEWORK)
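
A quick way to see why the expected shapes in this patch drop from (2, 10) to (2, 9) is to encode both spellings and compare lengths: the misspelled "summrization" is split into one more subword piece than "summarization". The following is a minimal sketch only, not part of the patch; it assumes the transformers library (with sentencepiece for T5) is installed and that the tests' default tokenizers are facebook/bart-large and t5-base, as the helper names default_tokenizer and t5_base_tokenizer suggest.

# Sketch only -- not part of the patch above.
from transformers import BartTokenizer, T5Tokenizer

bart = BartTokenizer.from_pretrained("facebook/bart-large")
t5 = T5Tokenizer.from_pretrained("t5-base")

for text in ["A long paragraph for summrization.", "A long paragraph for summarization."]:
    # The misspelled word breaks into an extra subword piece, so the encoded
    # sequence for the typo'd sentence is one id longer than for the fixed one.
    print(repr(text), len(bart.encode(text)), len(t5.encode(text)))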