Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

text: reduce lang. specific mecab-ko #2456

Merged
merged 8 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions requirements/text.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,5 @@ tqdm >=4.41.0, <4.67.0
regex >=2021.9.24, <=2023.12.25
transformers >4.4.0, <4.40.0
mecab-python3 >=1.0.6, <1.1.0
mecab-ko >=1.0.0, <1.1.0
mecab-ko-dic >=1.0.0, <1.1.0
ipadic >=1.0.0, <1.1.0
sentencepiece >=0.2.0, <0.3.0
3 changes: 3 additions & 0 deletions requirements/text_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ rouge-score >0.1.0, <=0.1.2
bert_score ==0.3.13
huggingface-hub <0.23
sacrebleu >=2.3.0, <2.5.0

mecab-ko >=1.0.0, <1.1.0
mecab-ko-dic >=1.0.0, <1.1.0
14 changes: 14 additions & 0 deletions tests/unittests/text/test_sacre_bleu.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from typing import Sequence

import pytest
from lightning_utilities.core.imports import RequirementCache
from torch import Tensor, tensor
from torchmetrics.functional.text.sacre_bleu import AVAILABLE_TOKENIZERS, _TokenizersLiteral, sacre_bleu_score
from torchmetrics.text.sacre_bleu import SacreBLEUScore
Expand Down Expand Up @@ -51,6 +52,8 @@ class TestSacreBLEUScore(TextTester):
@pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False])
def test_bleu_score_class(self, ddp, preds, targets, tokenize, lowercase):
"""Test class implementation of metric."""
if _should_skip_tokenizer(tokenize):
pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed")
if tokenize == "flores200":
pytest.skip("flores200 tests are flaky") # TODO: figure out why

Expand All @@ -68,6 +71,9 @@ def test_bleu_score_class(self, ddp, preds, targets, tokenize, lowercase):

def test_bleu_score_functional(self, preds, targets, tokenize, lowercase):
"""Test functional implementation of metric."""
if _should_skip_tokenizer(tokenize):
pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed")

metric_args = {"tokenize": tokenize, "lowercase": lowercase}
original_sacrebleu = partial(_reference_sacre_bleu, tokenize=tokenize, lowercase=lowercase)

Expand All @@ -81,6 +87,9 @@ def test_bleu_score_functional(self, preds, targets, tokenize, lowercase):

def test_bleu_score_differentiability(self, preds, targets, tokenize, lowercase):
"""Test the differentiability of the metric, according to its `is_differentiable` attribute."""
if _should_skip_tokenizer(tokenize):
pytest.skip(reason="`ko-mecab` tokenizer requires `mecab-ko` package to be installed")

metric_args = {"tokenize": tokenize, "lowercase": lowercase}

self.run_differentiability_test(
Expand Down Expand Up @@ -122,6 +131,7 @@ def test_tokenize_ja_mecab():
assert sacrebleu(preds, targets) == _reference_sacre_bleu(preds, targets, tokenize="ja-mecab", lowercase=False)


@pytest.mark.skipif(not RequirementCache("mecab-ko"), reason="this test requires `mecab-ko` package to be installed")
def test_tokenize_ko_mecab():
"""Test that `ko-mecab` tokenizer works on a Korean text in alignment with the SacreBleu implementation."""
sacrebleu = SacreBLEUScore(tokenize="ko-mecab")
Expand All @@ -134,3 +144,7 @@ def test_tokenize_ko_mecab():
def test_equivalence_of_available_tokenizers_and_annotation():
    """Check that the runtime tokenizer list and the tokenizer type annotation name the same set of tokenizers."""
    available = set(AVAILABLE_TOKENIZERS)
    annotated = set(_TokenizersLiteral.__args__)
    # Compare as sets: ordering and duplicates are irrelevant to the equivalence being tested.
    assert available == annotated


def _should_skip_tokenizer(tokenizer: _TokenizersLiteral) -> bool:
    """Return ``True`` when ``tokenizer`` is ``"ko-mecab"`` and the ``mecab-ko`` requirement is not satisfied.

    Any other tokenizer yields ``False`` without checking installed packages.
    """
    if tokenizer != "ko-mecab":
        return False
    # Only the `ko-mecab` tokenizer is gated on the `mecab-ko` requirement here.
    return not RequirementCache("mecab-ko")
Loading