From 9d046675a1524e2d10b321bd6e72e985561db272 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 19 Apr 2024 19:17:25 +0200 Subject: [PATCH] text: reduce lang. specific `mecab-ko` (#2456) * text: reduce lang. specific `mecab-ko` * mecab-ko for testing * Add skips for TestSacreBLEUScore for ko-mecab --------- Co-authored-by: daniel.stancl --- requirements/text.txt | 2 -- requirements/text_test.txt | 3 +++ tests/unittests/text/test_sacre_bleu.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/requirements/text.txt b/requirements/text.txt index e902bf83aa4..09ebc9b8f97 100644 --- a/requirements/text.txt +++ b/requirements/text.txt @@ -6,7 +6,5 @@ tqdm >=4.41.0, <4.67.0 regex >=2021.9.24, <=2023.12.25 transformers >4.4.0, <4.40.0 mecab-python3 >=1.0.6, <1.1.0 -mecab-ko >=1.0.0, <1.1.0 -mecab-ko-dic >=1.0.0, <1.1.0 ipadic >=1.0.0, <1.1.0 sentencepiece >=0.2.0, <0.3.0 diff --git a/requirements/text_test.txt b/requirements/text_test.txt index 94a9807e769..ebf39259506 100644 --- a/requirements/text_test.txt +++ b/requirements/text_test.txt @@ -6,3 +6,6 @@ rouge-score >0.1.0, <=0.1.2 bert_score ==0.3.13 huggingface-hub <0.23 sacrebleu >=2.3.0, <2.5.0 + +mecab-ko >=1.0.0, <1.1.0 +mecab-ko-dic >=1.0.0, <1.1.0 diff --git a/tests/unittests/text/test_sacre_bleu.py b/tests/unittests/text/test_sacre_bleu.py index f9d8b853779..dbf72269a3e 100644 --- a/tests/unittests/text/test_sacre_bleu.py +++ b/tests/unittests/text/test_sacre_bleu.py @@ -16,6 +16,7 @@ from typing import Sequence import pytest +from lightning_utilities.core.imports import RequirementCache from torch import Tensor, tensor from torchmetrics.functional.text.sacre_bleu import AVAILABLE_TOKENIZERS, _TokenizersLiteral, sacre_bleu_score from torchmetrics.text.sacre_bleu import SacreBLEUScore @@ -51,6 +52,8 @@ class TestSacreBLEUScore(TextTester): @pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False]) def test_bleu_score_class(self, ddp, preds, targets, tokenize, lowercase): """Test class implementation of metric.""" + if _should_skip_tokenizer(tokenize): + pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed") if tokenize == "flores200": pytest.skip("flores200 tests are flaky") # TODO: figure out why @@ -68,6 +71,9 @@ def test_bleu_score_class(self, ddp, preds, targets, tokenize, lowercase): def test_bleu_score_functional(self, preds, targets, tokenize, lowercase): """Test functional implementation of metric.""" + if _should_skip_tokenizer(tokenize): + pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed") + metric_args = {"tokenize": tokenize, "lowercase": lowercase} original_sacrebleu = partial(_reference_sacre_bleu, tokenize=tokenize, lowercase=lowercase) @@ -81,6 +87,9 @@ def test_bleu_score_functional(self, preds, targets, tokenize, lowercase): def test_bleu_score_differentiability(self, preds, targets, tokenize, lowercase): """Test the differentiability of the metric, according to its `is_differentiable` attribute.""" + if _should_skip_tokenizer(tokenize): + pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed") + metric_args = {"tokenize": tokenize, "lowercase": lowercase} self.run_differentiability_test( @@ -122,6 +131,7 @@ def test_tokenize_ja_mecab(): assert sacrebleu(preds, targets) == _reference_sacre_bleu(preds, targets, tokenize="ja-mecab", lowercase=False) +@pytest.mark.skipif(not RequirementCache("mecab-ko"), reason="this test requires `mecab-ko` package to be installed") def test_tokenize_ko_mecab(): """Test that `ja-mecab` tokenizer works on a Japanese text in alignment with the SacreBleu implementation.""" sacrebleu = SacreBLEUScore(tokenize="ko-mecab") @@ -134,3 +144,7 @@ def test_tokenize_ko_mecab(): def test_equivalence_of_available_tokenizers_and_annotation(): """Test equivalence of SacreBLEU available tokenizers and corresponding type annotation.""" assert set(AVAILABLE_TOKENIZERS) == set(_TokenizersLiteral.__args__) + + +def _should_skip_tokenizer(tokenizer: _TokenizersLiteral) -> bool: + return tokenizer == "ko-mecab" and not RequirementCache("mecab-ko")