Skip to content

Commit

Permalink
fix: Fixed CodeGenTokenizationTest::test_truncation failing test (#32850)
Browse files Browse the repository at this point in the history

* Fixed failing CodeGenTokenizationTest::test_truncation.

* [run_slow] Codegen

* [run_slow] codegen
  • Loading branch information
Sai-Suraj-27 authored Aug 27, 2024
1 parent 9578c25 commit 3bf6dd8
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions tests/models/codegen/test_tokenization_codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,12 @@ def test_truncation(self):
tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

text = "\nif len_a > len_b:\n result = a\nelse:\n result = b\n\n\n\n#"
expected_trucated_text = "\nif len_a > len_b: result = a\nelse: result = b"
expected_truncated_text = "\nif len_a > len_b:\n result = a\nelse:\n result = b"

input_ids = tokenizer.encode(text)
truncation_pattern = ["^#", re.escape("<|endoftext|>"), "^'''", '^"""', "\n\n\n"]
decoded_text = tokenizer.decode(input_ids, truncate_before_pattern=truncation_pattern)
self.assertEqual(decoded_text, expected_trucated_text)
self.assertEqual(decoded_text, expected_truncated_text)
# TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR

# tokenizer has no padding token
Expand Down

0 comments on commit 3bf6dd8

Please sign in to comment.