Skip to content

Commit 05ce995

Browse files
committed
Add T5XXLPreprocessor and remove T5XXLTokenizer
1 parent dcf3ec6 commit 05ce995

File tree

4 files changed

+164
-104
lines changed

4 files changed

+164
-104
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Copyright 2024 The KerasNLP Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import keras
15+
16+
from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker
17+
from keras_nlp.src.models.preprocessor import Preprocessor
18+
from keras_nlp.src.models.t5.t5_tokenizer import T5Tokenizer
19+
from keras_nlp.src.utils.keras_utils import (
20+
convert_inputs_to_list_of_tensor_segments,
21+
)
22+
23+
24+
class T5XXLPreprocessor(Preprocessor):
    """Preprocessing layer that tokenizes and packs a single text segment.

    Wraps a `T5Tokenizer` and a `StartEndPacker` to turn raw strings into
    `"token_ids"` / `"padding_mask"` dictionaries of a fixed length.

    Args:
        tokenizer: a `T5Tokenizer` instance used to convert text to ids.
        sequence_length: int, the packed output length. Defaults to 256.
        add_start_token: bool, whether to prepend the start token.
            Defaults to `False`.
        add_end_token: bool, whether to append the end token.
            Defaults to `True`.
    """

    tokenizer_cls = T5Tokenizer

    def __init__(
        self,
        tokenizer,
        sequence_length=256,
        add_start_token=False,
        add_end_token=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        self.add_start_token = add_start_token
        self.add_end_token = add_end_token

    def build(self, input_shape):
        # Packer creation is deferred to `build()` so that tokenizer assets
        # are guaranteed to be loaded when restoring a saved model.
        tokenizer = self.tokenizer
        self.packer = StartEndPacker(
            start_value=tokenizer.start_token_id,
            end_value=tokenizer.end_token_id,
            pad_value=tokenizer.pad_token_id,
            sequence_length=self.sequence_length,
            return_padding_mask=True,
        )
        self.built = True

    def call(
        self,
        x,
        y=None,
        sample_weight=None,
        sequence_length=None,
    ):
        # Normalize the input into a list of tensor segments; this model
        # only supports single-segment inputs.
        segments = convert_inputs_to_list_of_tensor_segments(x)
        if len(segments) != 1:
            raise ValueError(
                "T5XXL requires each input feature to contain only "
                f"one segment, but received {len(segments)}. If you are "
                "using T5XXL for a multi-segment classification task, "
                "please refer to classification models like BERT or RoBERTa."
            )
        # A per-call `sequence_length` overrides the layer default.
        token_ids, padding_mask = self.packer(
            self.tokenizer(segments[0]),
            sequence_length=sequence_length or self.sequence_length,
            add_start_value=self.add_start_token,
            add_end_value=self.add_end_token,
        )
        features = {
            "token_ids": token_ids,
            "padding_mask": padding_mask,
        }
        return keras.utils.pack_x_y_sample_weight(features, y, sample_weight)

    def get_config(self):
        # Serialize the constructor arguments on top of the base config so
        # the layer round-trips through `from_config()`.
        config = super().get_config()
        config["sequence_length"] = self.sequence_length
        config["add_start_token"] = self.add_start_token
        config["add_end_token"] = self.add_end_token
        return config
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Copyright 2024 The KerasNLP Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import os
15+
16+
import pytest
17+
18+
from keras_nlp.src.models.stable_diffusion_v3.t5_xxl_preprocessor import (
19+
T5XXLPreprocessor,
20+
)
21+
from keras_nlp.src.models.t5.t5_tokenizer import T5Tokenizer
22+
from keras_nlp.src.tests.test_case import TestCase
23+
24+
25+
class T5XXLPreprocessorTest(TestCase):
    """Tests for `T5XXLPreprocessor`.

    NOTE(review): the original class was named `GemmaPreprocessorTest`, a
    copy-paste leftover from the Gemma preprocessor tests; renamed to match
    the class under test.
    """

    def setUp(self):
        # A tiny SentencePiece vocab checked into the test data directory.
        self.tokenizer = T5Tokenizer(
            proto=os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm")
        )
        self.init_kwargs = {
            "tokenizer": self.tokenizer,
            "sequence_length": 10,
        }
        self.input_data = ["the quick brown fox"]

    def test_preprocessor_basics(self):
        # End token (id 1) is appended by default; output is padded to
        # `sequence_length` with matching padding mask.
        self.run_preprocessing_layer_test(
            cls=T5XXLPreprocessor,
            init_kwargs=self.init_kwargs,
            input_data=self.input_data,
            expected_output={
                "token_ids": [[4, 9, 5, 7, 1, 0, 0, 0, 0, 0]],
                "padding_mask": [[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]],
            },
        )

    def test_no_start_end_token(self):
        # With both special tokens disabled, only raw token ids remain.
        input_data = ["the quick brown fox"] * 4
        preprocessor = T5XXLPreprocessor(
            tokenizer=self.tokenizer,
            sequence_length=8,
            add_start_token=False,
            add_end_token=False,
        )
        x = preprocessor(input_data)
        self.assertAllEqual(x["token_ids"], [[4, 9, 5, 7, 0, 0, 0, 0]] * 4)
        self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)

    def test_sequence_length_override(self):
        # A call-time `sequence_length` truncates; the end token still fits.
        input_data = "the quick brown fox"
        preprocessor = T5XXLPreprocessor(**self.init_kwargs)
        x = preprocessor(input_data, sequence_length=4)
        self.assertAllEqual(x["token_ids"], [4, 9, 5, 1])

    @pytest.mark.kaggle_key_required
    @pytest.mark.extra_large
    def test_all_presets(self):
        self.skipTest("TODO")
        for preset in T5XXLPreprocessor.presets:
            self.run_preset_test(
                cls=T5XXLPreprocessor,
                preset=preset,
                input_data=self.input_data,
            )

keras_nlp/src/models/stable_diffusion_v3/t5_xxl_tokenizer.py

Lines changed: 0 additions & 35 deletions
This file was deleted.

keras_nlp/src/models/stable_diffusion_v3/t5_xxl_tokenizer_test.py

Lines changed: 0 additions & 69 deletions
This file was deleted.

0 commit comments

Comments
 (0)