Empty file added scratchgpt/dataloader.py
57 changes: 16 additions & 41 deletions scratchgpt/main.py
@@ -10,19 +10,21 @@
from tqdm import tqdm
from ptflops import get_model_complexity_info

+from scratchgpt.tokenizer.char_tokenizer import CharTokenizer
+

from .metering import AverageValueMeter


torch.manual_seed(1337)

DEVICE: str = "cuda" if torch.cuda.is_available() else "cpu"
-BATCH_SIZE = 64
-BLOCK_SIZE = 256
+BATCH_SIZE = 32
+BLOCK_SIZE = 32
MAX_ITERS = 5000
-LEARNING_RATE = 3e-4
+LEARNING_RATE = 3e-3
EVAL_INTERVAL = 500
-N_EMBED = 384
+N_EMBED = 48
NUM_HEADS = 6
NUM_BLOCKS = 6

@@ -38,9 +40,6 @@ def parse_args():


def print_model_complexity(model: nn.Module):
-    batch_size = 1
-    block_size = BLOCK_SIZE
-
    input_shape = (BLOCK_SIZE,)

    flops, params = get_model_complexity_info(model, input_shape, print_per_layer_stat=True, as_strings=True)
@@ -56,7 +55,7 @@ def __init__(self, embedding_size: int, block_size: int, head_size: int) -> None:
        self._key = nn.Linear(embedding_size, head_size, bias=False)
        self._query = nn.Linear(embedding_size, head_size, bias=False)
        self._value = nn.Linear(embedding_size, head_size, bias=False)
-        self._dropout_factor = .6
+        self._dropout_factor = .4
        self._dropout = nn.Dropout(self._dropout_factor)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

@@ -81,7 +80,7 @@ def forward(self, context: Tensor) -> Tensor:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads: int, embedding_size: int, block_size: int, head_size: int) -> None:
        super().__init__()
-        self._dropout_factor = .6
+        self._dropout_factor = .4
        self._heads = nn.ModuleList(Head(embedding_size, block_size, head_size) for _ in range(num_heads))
        self._proj = nn.Linear(embedding_size, embedding_size)
        self._dropout = nn.Dropout(self._dropout_factor)
@@ -97,7 +96,7 @@ class FeedFoward(nn.Module):
    def __init__(self, embedding_size: int) -> None:
        super().__init__()
        self._ffwd_multipler = 4
-        self._dropout = .6
+        self._dropout = .4

        self._net = nn.Sequential(
            nn.Linear(embedding_size, embedding_size * self._ffwd_multipler),
@@ -175,27 +174,6 @@ def load_dataset(path: io.TextIOWrapper) -> str:
    return path.read()


-def get_vocab(text: str) -> list[str]:
-    chars = sorted(list(set(text)))
-    return chars
-
-
-def str_to_int(chars: list[str]) -> dict[str, int]:
-    return {char:idx for idx, char in enumerate(chars)}
-
-
-def int_to_str(chars: list[str]) -> dict[int, str]:
-    return {idx: char for idx, char in enumerate(chars)}
-
-
-def encode(text: str, mapping: dict[str, int]) -> list[int]:
-    return [mapping[char] for char in text]
-
-
-def decode(encoding: list[int], mapping: dict[int, str]) -> str:
-    return ''.join(mapping[v] for v in encoding)
-
-
def get_batch(block_size: int, batch_size: int, data: Tensor) -> tuple[Tensor, Tensor]:
    indices = torch.randint(len(data) - block_size, (batch_size,))
    batch = torch.stack([data[i:i+block_size] for i in indices])
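
The get_batch hunk above is cut off by the collapsed diff, but the signature pins down the shape contract. A quick sketch of the expected behavior, assuming the hidden tail of the function stacks the one-step-shifted targets and returns both tensors:

import torch

# Stand-in for the encoded corpus; assumes get_batch (defined above)
# returns (batch, targets), as its tuple[Tensor, Tensor] annotation says.
data = torch.arange(1000, dtype=torch.long)
batch, targets = get_batch(block_size=32, batch_size=4, data=data)
print(batch.shape)    # torch.Size([4, 32])
print(targets.shape)  # torch.Size([4, 32])
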
@@ -209,23 +187,20 @@ def main():

    text = load_dataset(args.train_file)

-    chars = get_vocab(text)
-
-    vocab_size = len(chars)
-    print(f"{chars=}\n{vocab_size=}")
+    tokenizer = CharTokenizer(text)

-    encoding_mapping = str_to_int(chars)
-    decoding_mapping = int_to_str(chars)
+    print(f"{tokenizer.vocabulary=}\n{tokenizer.vocab_size=}")

-    data = torch.tensor(encode(text, encoding_mapping), dtype=torch.long).to(DEVICE)
+    tokenized_data = tokenizer.encode(text)
+    data = torch.tensor(tokenized_data, dtype=torch.long).to(DEVICE)

    train_split: float = 0.9
    train_size = int(train_split * len(text))

    train_data = data[:train_size]
    val_data = data[train_size:]

-    model = BigramLanguageModel(NUM_HEADS, vocab_size, N_EMBED, BLOCK_SIZE, NUM_BLOCKS)
+    model = BigramLanguageModel(NUM_HEADS, tokenizer.vocab_size, N_EMBED, BLOCK_SIZE, NUM_BLOCKS)

    model = model.to(DEVICE)

@@ -248,7 +223,7 @@

        model.eval()
        with torch.no_grad():
-            val_batch, val_targets = get_batch(BLOCK_SIZE, BATCH_SIZE, train_data)
+            val_batch, val_targets = get_batch(BLOCK_SIZE, BATCH_SIZE, val_data)
            _, val_loss = model(val_batch, val_targets)
            val_average_loss.add(val_loss.item())

@@ -262,7 +237,7 @@

    context = torch.zeros((1,1), dtype=torch.long).to(DEVICE)
    generated = model.generate(context, max_new_tokens=500)
-    first_batch_trained = decode(generated[0].tolist(), decoding_mapping)
+    first_batch_trained = tokenizer.decode(generated[0].tolist())
    print(first_batch_trained)


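Taken together, the main() hunks reduce to a small pattern: construct the tokenizer from the raw text, encode the corpus once, and let the tokenizer report the vocabulary size to the model. A condensed sketch of that flow, with a toy corpus standing in for args.train_file and a print standing in for model construction:

import torch

from scratchgpt.tokenizer.char_tokenizer import CharTokenizer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

text = "hello world"  # toy corpus standing in for args.train_file
tokenizer = CharTokenizer(text)

# Encode once, then split into train/val tensors, as main() does above.
data = torch.tensor(tokenizer.encode(text), dtype=torch.long).to(DEVICE)
train_size = int(0.9 * len(text))
train_data, val_data = data[:train_size], data[train_size:]

# vocab_size now comes straight from the tokenizer instead of a local variable.
print(tokenizer.vocab_size, train_data.shape, val_data.shape)
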
22 changes: 22 additions & 0 deletions scratchgpt/tokenizer/base_tokenizer.py
@@ -0,0 +1,22 @@
from abc import abstractmethod, ABC


class Tokenizer(ABC):

    @abstractmethod
    def encode(self, text: str) -> list[int]:
        """Convert a string into a sequence of token IDs."""

    @abstractmethod
    def decode(self, encoding: list[int]) -> str:
        """Convert a sequence of token IDs back into a string."""

    @property
    @abstractmethod
    def vocab_size(self) -> int:
        """Return the size of the vocabulary."""

    @property
    @abstractmethod
    def vocabulary(self) -> list[str]:
        """Return the learned vocabulary."""
41 changes: 41 additions & 0 deletions scratchgpt/tokenizer/char_tokenizer.py
@@ -0,0 +1,41 @@
from typing import override
from .base_tokenizer import Tokenizer


def get_vocab(text: str) -> list[str]:
    chars = sorted(list(set(text)))
    return chars


def str_to_int(chars: list[str]) -> dict[str, int]:
    return {char: idx for idx, char in enumerate(chars)}


def int_to_str(chars: list[str]) -> dict[int, str]:
    return {idx: char for idx, char in enumerate(chars)}


class CharTokenizer(Tokenizer):

    def __init__(self, text: str) -> None:
        self._vocabulary = get_vocab(text)
        self._encoding_mapping = str_to_int(self._vocabulary)
        self._decoding_mapping = int_to_str(self._vocabulary)

    @property
    @override
    def vocab_size(self) -> int:
        return len(self._vocabulary)

    @property
    @override
    def vocabulary(self) -> list[str]:
        return self._vocabulary

    @override
    def encode(self, text: str) -> list[int]:
        return [self._encoding_mapping[char] for char in text]

    @override
    def decode(self, encoding: list[int]) -> str:
        return ''.join(self._decoding_mapping[v] for v in encoding)
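
A quick usage sketch of the new class, using only the code above. Note that the vocabulary is fixed at construction time, so encode() raises KeyError for any character the construction text did not contain:

from scratchgpt.tokenizer.char_tokenizer import CharTokenizer

corpus = "hello world"
tokenizer = CharTokenizer(corpus)

print(tokenizer.vocabulary)  # [' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']
print(tokenizer.vocab_size)  # 8

ids = tokenizer.encode("hello")
print(ids)  # [3, 2, 4, 4, 5]
assert tokenizer.decode(ids) == "hello"  # lossless round trip
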