
🚨🚨🚨 [SPM] Finish fix spm models 🚨🚨🚨 #25224

Merged · 36 commits · Aug 17, 2023

Commits
c99c130
fix EVERYTHING
ArthurZucker Aug 1, 2023
acf31e2
more fixes
ArthurZucker Aug 1, 2023
7305aff
⚗️⚗️ Tokenizer magic ⚗️⚗️
ArthurZucker Aug 1, 2023
01b8347
wrong value but test passes for the TODO
ArthurZucker Aug 1, 2023
b9ddbbb
update
ArthurZucker Aug 1, 2023
83af718
updat
ArthurZucker Aug 1, 2023
0babe38
safe protobuf import?
ArthurZucker Aug 1, 2023
0fdf51e
style
ArthurZucker Aug 1, 2023
2d197a1
non gated repo
ArthurZucker Aug 1, 2023
e9c7a72
update
ArthurZucker Aug 1, 2023
94964cd
fixup
ArthurZucker Aug 1, 2023
cc9ddcf
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Aug 1, 2023
45cae43
Update src/transformers/models/llama/tokenization_llama.py
ArthurZucker Aug 2, 2023
53557a9
Update src/transformers/models/llama/tokenization_llama.py
ArthurZucker Aug 2, 2023
e049d11
Update tests/models/t5/test_tokenization_t5.py
ArthurZucker Aug 2, 2023
b64b2d2
nits
ArthurZucker Aug 2, 2023
cb95361
fix t5 too
ArthurZucker Aug 2, 2023
a86bf78
use assert equal
ArthurZucker Aug 2, 2023
913cd1d
fix llama decoding
ArthurZucker Aug 2, 2023
ef28574
nits on t5
ArthurZucker Aug 2, 2023
4f65261
fixup
ArthurZucker Aug 2, 2023
ad7f8c6
only remove the prefix space, not other spaces
ArthurZucker Aug 2, 2023
76d00cc
more deconding tests and more todos
ArthurZucker Aug 2, 2023
9cb92b6
fix CI as well
ArthurZucker Aug 2, 2023
204153f
fixup
ArthurZucker Aug 2, 2023
9f37103
skip failing test on CI (its tf its ok)
ArthurZucker Aug 2, 2023
700ee64
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Aug 3, 2023
4b5315b
skip test_subword_regularization_tokenizer that is also crashing on t…
ArthurZucker Aug 3, 2023
a4ed16f
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Aug 16, 2023
e7906c2
update llama
ArthurZucker Aug 17, 2023
ad33c97
revert good fixes
ArthurZucker Aug 17, 2023
f890882
fixup
ArthurZucker Aug 17, 2023
b7f98bc
empty
ArthurZucker Aug 17, 2023
bb79083
explain why we need to encode with an additional token
ArthurZucker Aug 17, 2023
3f8ac96
better warning?
ArthurZucker Aug 17, 2023
4249986
nits
ArthurZucker Aug 17, 2023
87 changes: 59 additions & 28 deletions src/transformers/models/llama/tokenization_llama.py
@@ -25,6 +25,7 @@

import sentencepiece as spm

+from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging

@@ -71,9 +72,10 @@ class LlamaTokenizer(PreTrainedTokenizer):
Args:
vocab_file (`str`):
Path to the vocabulary file.
-legacy (`bool`, *optional*, defaults to `True`):
-Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
-which includes fixes to properly handle tokens that appear after special tokens. A simple example:
+legacy (`bool`, *optional*):
+Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
+and #25224, which include fixes to properly handle tokens that appear after special tokens. A simple
+example:

- `legacy=True`:
```python
@@ -91,8 +93,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
-Checkout the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
-more details.
+Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.

"""

@@ -112,6 +113,7 @@ def __init__(
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
+spaces_between_special_tokens=False,
legacy=None,
**kwargs,
):
@@ -129,22 +131,42 @@ def __init__(
add_eos_token=add_eos_token,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+spaces_between_special_tokens=spaces_between_special_tokens,
legacy=legacy,
**kwargs,
)
if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
" read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly."
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it"
" means, and thouroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565"
)
legacy = True

self.legacy = legacy
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
-self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-self.sp_model.Load(vocab_file)
+self.sp_model = self.get_spm_processor()

+self.unk_token_length = len(self.sp_model.encode(str(self.unk_token)))

+# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
+def get_spm_processor(self):
+    tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+    with open(self.vocab_file, "rb") as f:
+        sp_model = f.read()
+    model_pb2 = import_protobuf()
+    model = model_pb2.ModelProto.FromString(sp_model)
+    if not self.legacy:
+        normalizer_spec = model_pb2.NormalizerSpec()
+        normalizer_spec.add_dummy_prefix = False
+        model.normalizer_spec.MergeFrom(normalizer_spec)
+    sp_model = model.SerializeToString()
+    tokenizer.LoadFromSerializedProto(sp_model)
+    return tokenizer
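
For intuition, here is a minimal standalone sketch of what this protobuf patch changes, assuming a local SentencePiece file `tokenizer.model` (the path and the exact pieces printed are illustrative):

```python
import sentencepiece as spm

from transformers.convert_slow_tokenizer import import_protobuf

model_pb2 = import_protobuf()
with open("tokenizer.model", "rb") as f:
    model = model_pb2.ModelProto.FromString(f.read())

# Default: the normalizer inserts a dummy prefix (SPIECE_UNDERLINE) on encode.
sp_default = spm.SentencePieceProcessor()
sp_default.LoadFromSerializedProto(model.SerializeToString())
print(sp_default.encode("Hey", out_type=str))   # e.g. ['▁He', 'y']

# Patched: with `add_dummy_prefix` off, the text is encoded exactly as given.
model.normalizer_spec.add_dummy_prefix = False
sp_patched = spm.SentencePieceProcessor()
sp_patched.LoadFromSerializedProto(model.SerializeToString())
print(sp_patched.encode("Hey", out_type=str))   # e.g. ['H', 'e', 'y']
```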

def __getstate__(self):
state = self.__dict__.copy()
Expand All @@ -170,33 +192,38 @@ def get_vocab(self):

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
-# Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
-# the beginning of the text
-if not self.legacy:
-    text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
-return super().tokenize(text, **kwargs)
+"""
+Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
+first token is special.
+"""
+if self.legacy or len(text) == 0:
+    return super().tokenize(text, **kwargs)
+
+tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
+
+if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
+    tokens = tokens[1:]
+return tokens
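
A hedged usage sketch of the non-legacy path; the checkpoint name is an illustrative assumption and the exact pieces depend on the vocabulary:

```python
from transformers import LlamaTokenizer

# "huggyllama/llama-7b" stands in for any Llama checkpoint with a slow tokenizer.
tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)

tokenizer.tokenize("Hello")     # e.g. ['▁Hello'] (the prefix token is added)
tokenizer.tokenize("<s>Hello")  # e.g. ['<s>', 'H', 'ello'] (no '▁' injected around the special token)
```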

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.

-Since the sentencepiece internal model always adds a SPIECE_UNDERLINE, at the beginning of the provided text,
-we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
-function is called with specials tokens: the input is split on the special tokens, and each subsequence is
-passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
-the extra `SPIECE_UNDERLINE` prepended.
+We deactivated the `add_dummy_prefix` option, so the sentencepiece internals always strip any leading
+SPIECE_UNDERLINE. For example, `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type=str)` gives
+`['H', 'e', 'y']` instead of `['▁He', 'y']`. We therefore always encode `f"{unk_token}text"` and strip the
+`unk_token` pieces afterwards. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`:
+`self.tokenizer.sp_model.encode("<unk> Hey", out_type=str)[4:]`.
"""
-if not self.legacy:
-    is_first = text.startswith(SPIECE_UNDERLINE)
-    if is_first:
-        text = text[1:]
-tokens = self.sp_model.encode(text, out_type=str)
-if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
-    tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
-return tokens
+if self.legacy:
+    return self.sp_model.encode(text, out_type=str)
+
+unk_token_length = len(self.sp_model.encode(str(self.unk_token)))
+text = self.unk_token + text
+tokens = self.sp_model.encode(text, out_type=str)
+
+return tokens[unk_token_length:]
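
The `unk_token` trick can be checked directly on the underlying `sp_model`; a sketch, assuming `tokenizer` is a non-legacy instance as above:

```python
# "<unk>" is encoded into a few pieces of its own; everything after them is the
# encoding of the actual text, with any leading space preserved.
pieces = tokenizer.sp_model.encode("<unk> Hey", out_type=str)
unk_len = len(tokenizer.sp_model.encode(str(tokenizer.unk_token), out_type=str))
print(pieces[unk_len:])  # e.g. ['▁Hey']
```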

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
@@ -209,13 +236,17 @@ def _convert_id_to_token(self, index):

def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
+# since we manually add the prefix space, we have to remove it when decoding
+if len(tokens) > 0 and tokens[0].startswith(SPIECE_UNDERLINE):
+    tokens[0] = tokens[0][1:]

current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
-if not prev_is_special and i != 0:
+if not prev_is_special and i != 0 and self.legacy:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
76 changes: 50 additions & 26 deletions src/transformers/models/t5/tokenization_t5.py
@@ -23,6 +23,7 @@

import sentencepiece as spm

+from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer


@@ -106,9 +107,10 @@ class T5Tokenizer(PreTrainedTokenizer):

- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
-legacy (`bool`, *optional*, defaults to `True`):
+legacy (`bool`, *optional*):
Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
-which includes fixes to properly handle tokens that appear after special tokens. A simple example:
+and #25224, which include fixes to properly handle tokens that appear after special tokens. A simple
+example:

- `legacy=True`:
```python
@@ -126,8 +128,7 @@ class T5Tokenizer(PreTrainedTokenizer):
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
-Checkout the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
-more details.
+Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.

Attributes:
sp_model (`SentencePieceProcessor`):
@@ -165,8 +166,11 @@ def __init__(
)
if legacy is None:
logger.warning_once(
f"You are using the default legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
" read the related pull request available at https://github.com/huggingface/transformers/pull/24565, and set the legacy attribute accordingly."
f"You are using the default legacy behaviour of the {self.__class__}. If you see this, DO NOT PANIC! This is"
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it"
" means, and thouroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565"
)
legacy = True

@@ -187,8 +191,21 @@ def __init__(
self.vocab_file = vocab_file
self._extra_ids = extra_ids

-self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-self.sp_model.Load(vocab_file)
+self.sp_model = self.get_spm_processor()

+def get_spm_processor(self):
+    tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+    with open(self.vocab_file, "rb") as f:
+        sp_model = f.read()
+    model_pb2 = import_protobuf()
+    model = model_pb2.ModelProto.FromString(sp_model)
+    if not self.legacy:
+        normalizer_spec = model_pb2.NormalizerSpec()
+        normalizer_spec.add_dummy_prefix = False
+        model.normalizer_spec.MergeFrom(normalizer_spec)
+    sp_model = model.SerializeToString()
+    tokenizer.LoadFromSerializedProto(sp_model)
+    return tokenizer

@staticmethod
def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
@@ -332,32 +349,37 @@ def __setstate__(self, d):
self.sp_model.Load(self.vocab_file)

def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
-# Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
-# the beginning of the text
-if not self.legacy:
-    text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
-return super().tokenize(text, **kwargs)
+"""
+Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
+first token is special.
+"""
+if self.legacy or len(text) == 0:
+    return super().tokenize(text, **kwargs)
+
+tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
+
+if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
+    tokens = tokens[1:]
+return tokens
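
This is the behaviour exercised by the `legacy` docstring example above; mirroring it end to end (ids taken from that example, `t5-base` as used there):

```python
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
tokenizer.encode("Hello <extra_id_0>.")    # [8774, 32099, 5, 1] (no stray space id `[3]`)
tokenizer.tokenize("Hello <extra_id_0>.")  # e.g. ['▁Hello', '<extra_id_0>', '.']
```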

def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.

-Since the sentencepiece internal model always adds a SPIECE_UNDERLINE, at the beginning of the provided text,
-we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
-function is called with specials tokens: the input is split on the special tokens, and each subsequence is
-passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
-the extra `SPIECE_UNDERLINE` prepended.
+We deactivated the `add_dummy_prefix` option, so the sentencepiece internals always strip any leading
+SPIECE_UNDERLINE. For example, `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type=str)` gives
+`['H', 'e', 'y']` instead of `['▁He', 'y']`. We therefore always encode `f"{unk_token}text"` and strip the
+`unk_token` pieces afterwards. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`:
+`self.tokenizer.sp_model.encode("<unk> Hey", out_type=str)[4:]`.
"""
-if not self.legacy:
-    is_first = text.startswith(SPIECE_UNDERLINE)
-    if is_first:
-        text = text[1:]
-tokens = self.sp_model.encode(text, out_type=str)
-if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
-    tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
-return tokens
+if self.legacy:
+    return self.sp_model.encode(text, out_type=str)
+
+unk_token_length = len(self.sp_model.encode(str(self.unk_token)))
+text = self.unk_token + text
+tokens = self.sp_model.encode(text, out_type=str)
+
+return tokens[unk_token_length:]

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
@@ -378,6 +400,8 @@ def _convert_id_to_token(self, index):
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
+# since we manually add the prefix space, we have to remove it
+if len(tokens) > 0:
+    tokens[0] = tokens[0].lstrip(SPIECE_UNDERLINE)
out_string = ""
prev_is_special = False
for token in tokens: