-
Notifications
You must be signed in to change notification settings - Fork 17
/
GPT2TokenizerTest.java
34 lines (28 loc) · 1.22 KB
/
GPT2TokenizerTest.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
package ai.tunib.tokenizer;
import java.util.List;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/**
 * Checks {@code GPT2Tokenizer.encode} and {@code GPT2Tokenizer.decode} against fixed
 * text / token-ID pairs.
 *
 * <p>Every test obtains its tokenizer through its own call to
 * {@code GPT2Tokenizer.fromPretrained("tokenizers/gpt2")}.
 * NOTE(review): how that location is resolved (classpath resource vs. file system) is not
 * visible from this file - confirm against {@code GPT2Tokenizer}.
 */
public class GPT2TokenizerTest {

    /** Location passed to {@code GPT2Tokenizer.fromPretrained} by every test. */
    private static final String PRETRAINED_LOCATION = "tokenizers/gpt2";

    /** Sentence fixture: the text side of the sentence pair. */
    private static final String SENTENCE_TEXT = "Hello my name is Kevin.";

    /** Sentence fixture: the token IDs expected for {@link #SENTENCE_TEXT}. */
    private static final List<Integer> SENTENCE_TOKEN_IDS =
            List.of(15496, 616, 1438, 318, 7939, 13);

    /** Single-word fixture: the text side. */
    private static final String WORD_TEXT = "interesting";

    /** Single-word fixture: the one token ID expected for {@link #WORD_TEXT}. */
    private static final List<Integer> WORD_TOKEN_IDS = List.of(47914);

    /** Encoding the sentence must produce exactly the expected ID sequence. */
    @Test
    public void testEncoding() {
        List<Integer> actualIds = loadTokenizer().encode(SENTENCE_TEXT);

        Assertions.assertEquals(SENTENCE_TOKEN_IDS, actualIds);
    }

    /** Decoding the sentence's ID sequence must reproduce the original text. */
    @Test
    public void testDecoding() {
        String actualText = loadTokenizer().decode(SENTENCE_TOKEN_IDS);

        Assertions.assertEquals(SENTENCE_TEXT, actualText);
    }

    /** Encoding the single word must produce exactly one expected ID. */
    @Test
    public void testLongWord() {
        List<Integer> actualIds = loadTokenizer().encode(WORD_TEXT);

        Assertions.assertEquals(WORD_TOKEN_IDS, actualIds);
    }

    /** Requests a tokenizer for {@link #PRETRAINED_LOCATION}; called once per test. */
    private static GPT2Tokenizer loadTokenizer() {
        return GPT2Tokenizer.fromPretrained(PRETRAINED_LOCATION);
    }
}