Modify ProcessorTesterMixin for better generalization #32637

Merged: yonigozlan merged 3 commits into huggingface:main from yonigozlan:modify-ProcessorTesterMixin-for-better-generalization on Aug 13, 2024
Changes from 2 commits

Commits (3):
- dc5e360: Add padding="max_length" to tokenizer kwargs and change crop_size to … (yonigozlan)
- e8064ef: remove crop_size argument in align processor tests to be coherent wit… (yonigozlan)
- 3c85fcb: Add pad_token when loading tokenizer if needed, change test override … (yonigozlan)
@@ -126,7 +126,9 @@ def test_tokenizer_defaults_preserved_by_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117)
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"

         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)
@@ -141,8 +143,10 @@ def test_tokenizer_defaults_preserved_by_kwargs(self):
     def test_image_processor_defaults_preserved_by_image_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", crop_size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117)
+        image_processor = self.get_component("image_processor", size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"

         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)
@@ -159,31 +163,37 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer", max_length=117)
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"

         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)
         input_str = "lower newer"
         image_input = self.prepare_image_inputs()

-        inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=112)
+        inputs = processor(
+            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
+        )
         self.assertEqual(len(inputs["input_ids"][0]), 112)

     @require_torch
     @require_vision
     def test_kwargs_overrides_default_image_processor_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor", crop_size=(234, 234))
-        tokenizer = self.get_component("tokenizer", max_length=117)
+        image_processor = self.get_component("image_processor", size=(234, 234))
+        tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"

Review comment (on the pad_token fallback above): I see this recurring in all tests, maybe we should add a pad token when saving the dummy tokenizer, so it's loaded with a pad id.
Reply: +1
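A possible shape for that suggestion, sketched as a hypothetical helper (the function name and the explicit save step are assumptions, not part of this diff):

```python
# Hypothetical helper, not part of this PR: give the dummy tokenizer a pad token
# once, before it is saved, so every test loads it with a pad id and the per-test
# "[TEST_PAD]" fallback becomes unnecessary.
def ensure_pad_token(tokenizer, save_dir):
    if tokenizer.pad_token is None:
        # same sentinel value as used in the diff above
        tokenizer.add_special_tokens({"pad_token": "[TEST_PAD]"})
    tokenizer.save_pretrained(save_dir)  # reloads will now carry a pad id
    return tokenizer
```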

         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)

         input_str = "lower newer"
         image_input = self.prepare_image_inputs()

-        inputs = processor(text=input_str, images=image_input, crop_size=[224, 224])
+        inputs = processor(text=input_str, images=image_input, size=[224, 224])
         self.assertEqual(len(inputs["pixel_values"][0][0]), 224)

     @require_torch
@@ -193,6 +203,8 @@ def test_unstructured_kwargs(self):
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         image_processor = self.get_component("image_processor")
         tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"

         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)
@@ -203,7 +215,7 @@ def test_unstructured_kwargs(self):
             text=input_str,
             images=image_input,
             return_tensors="pt",
-            crop_size={"height": 214, "width": 214},
+            size={"height": 214, "width": 214},
             padding="max_length",
             max_length=76,
         )
@@ -218,6 +230,8 @@ def test_unstructured_kwargs_batched(self):
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         image_processor = self.get_component("image_processor")
         tokenizer = self.get_component("tokenizer")
+        if not tokenizer.pad_token:
+            tokenizer.pad_token = "[TEST_PAD]"

         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
         self.skip_processor_without_typed_kwargs(processor)
@@ -228,7 +242,7 @@ def test_unstructured_kwargs_batched(self):
             text=input_str,
             images=image_input,
             return_tensors="pt",
-            crop_size={"height": 214, "width": 214},
+            size={"height": 214, "width": 214},
             padding="longest",
             max_length=76,
         )
@@ -254,8 +268,8 @@ def test_doubly_passed_kwargs(self):
         _ = processor(
             text=input_str,
             images=image_input,
-            images_kwargs={"crop_size": {"height": 222, "width": 222}},
-            crop_size={"height": 214, "width": 214},
+            images_kwargs={"size": {"height": 222, "width": 222}},
+            size={"height": 214, "width": 214},
         )

     @require_torch
@@ -275,7 +289,7 @@ def test_structured_kwargs_nested(self):
         # Define the kwargs for each modality
         all_kwargs = {
             "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"crop_size": {"height": 214, "width": 214}},
+            "images_kwargs": {"size": {"height": 214, "width": 214}},
             "text_kwargs": {"padding": "max_length", "max_length": 76},
         }

@@ -303,7 +317,7 @@ def test_structured_kwargs_nested_from_dict(self):
         # Define the kwargs for each modality
         all_kwargs = {
             "common_kwargs": {"return_tensors": "pt"},
-            "images_kwargs": {"crop_size": {"height": 214, "width": 214}},
+            "images_kwargs": {"size": {"height": 214, "width": 214}},
             "text_kwargs": {"padding": "max_length", "max_length": 76},
         }

Review comment (on the tokenizer init kwargs): nit: it's a bit redundant to set max_length both when initializing the tokenizer and when calling it. IMO we can remove it from the init because we're testing kwargs at call time. Or we could even explicitly init with padding="longest" and test whether the call kwargs override the defaults, as the test name suggests.

Reply: +1 on removing it from the init. Checking that the kwargs override the defaults is also a good idea - we should add a new test to isolate and check this behaviour.
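A rough sketch of what that follow-up test could look like (the test name and the padding="longest" init default are illustrative assumptions, modeled on the tests in this diff):

```python
# Hypothetical follow-up test, not part of this PR: init the tokenizer with one
# padding default and check that the value passed at call time wins.
@require_torch
@require_vision
def test_tokenizer_call_kwargs_override_init_kwargs(self):
    if "image_processor" not in self.processor_class.attributes:
        self.skipTest(f"image_processor attribute not present in {self.processor_class}")
    image_processor = self.get_component("image_processor")
    tokenizer = self.get_component("tokenizer", padding="longest")  # default set at init
    if not tokenizer.pad_token:
        tokenizer.pad_token = "[TEST_PAD]"

    processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
    self.skip_processor_without_typed_kwargs(processor)

    input_str = "lower newer"
    image_input = self.prepare_image_inputs()

    # padding="max_length" passed here should win over the "longest" default from init
    inputs = processor(
        text=input_str, images=image_input, return_tensors="pt", padding="max_length", max_length=112
    )
    self.assertEqual(len(inputs["input_ids"][0]), 112)
```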