
Commit b37d2b5

Generate tokens works, LLM seems sane!
1 parent b95628b

File tree: 9 files changed, +85 and -199 lines

autoencoder/generate_tokens.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+import torch
+import logging
+import argparse
+from resource_loader import ResourceLoader
+
+def main(gpt_ckpt_dir: str, prompt: str):
+    resourceloader = ResourceLoader(
+        dataset='shakespeare_char',
+        gpt_ckpt_dir=gpt_ckpt_dir,
+        device='cpu',
+        mode="prepare",
+    )
+    enc_fxn, dec_fxn = resourceloader.load_tokenizer()
+    tokens = torch.Tensor([enc_fxn(prompt)]).long()
+    logging.info(tokens)
+    generated = resourceloader.transformer.generate(
+        idx=tokens,
+        max_new_tokens=100,
+    )
+    generated = dec_fxn(generated.squeeze().tolist())
+    print(generated)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gpt_ckpt_dir', type=str, default='')
+    parser.add_argument('--prompt', type=str, help='Try "def run(" or "oh romeo!"')
+    args = parser.parse_args()
+    main(**vars(args))

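A brief usage sketch, not part of the diff: the checkpoint directory name below is a placeholder, and the direct call assumes the working directory is autoencoder/ so the resource_loader import resolves.

    # Hypothetical invocation, equivalent to:
    #   python generate_tokens.py --gpt_ckpt_dir out-shakespeare-char --prompt "oh romeo!"
    from generate_tokens import main

    main(gpt_ckpt_dir='out-shakespeare-char', prompt='oh romeo!')  # placeholder checkpoint dir
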
autoencoder/resource_loader.py

Lines changed: 1 addition & 5 deletions
@@ -177,9 +177,5 @@ def load_tokenizer(self):
             encode = lambda s: [stoi[c] for c in s]
             decode = lambda l: ''.join([itos[i] for i in l])
         else:
-            # ok let's assume gpt-2 encodings by default
-            print("No meta.pkl found, assuming GPT-2 encodings...")
-            enc = tiktoken.get_encoding("gpt2")
-            encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
-            decode = lambda l: enc.decode(l)
+            raise DeprecationWarning('must load from dataset dir')
         return encode, decode

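An aside, not part of the diff: DeprecationWarning is an Exception subclass, so the raise above makes load_tokenizer fail outright when no dataset tokenizer metadata is found, rather than emitting a warning. A hypothetical caller-side sketch, mirroring the resourceloader name used in generate_tokens.py:

    # Hypothetical handling of the missing-tokenizer case.
    try:
        encode, decode = resourceloader.load_tokenizer()
    except DeprecationWarning as err:
        raise SystemExit(f"tokenizer metadata not found: {err}")
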
transformer/data/openwebtext/prepare.py

Lines changed: 0 additions & 80 deletions
This file was deleted.

transformer/data/openwebtext/readme.md

Lines changed: 0 additions & 15 deletions
This file was deleted.

transformer/data/shakespeare/prepare.py

Lines changed: 0 additions & 33 deletions
This file was deleted.

transformer/data/shakespeare/readme.md

Lines changed: 0 additions & 9 deletions
This file was deleted.

transformer/data/shakespeare_char/prepare.py

Lines changed: 22 additions & 5 deletions
@@ -8,6 +8,7 @@
 import pickle
 import requests
 import numpy as np
+import pandas as pd
 
 # download the tiny shakespeare dataset
 input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
@@ -18,6 +19,22 @@
 
 with open(input_file_path, 'r') as f:
     data = f.read()
+
+# Add in some Python code training data so the model learns both Shakespare and Python
+df = pd.read_parquet(
+    "hf://datasets/matlok/python-text-copilot-training-instruct-ai-research-2024-02-10/schema/train-0022-qwen-agent-qwen_agent.parquet"
+)
+python_code = '\n###\n'.join(df['code'].dropna().astype(str))
+python_code = python_code.encode('ascii', 'ignore').decode()  # there's a few non-ascii characters but I don't want to deal with them
+
+train_split = python_code[:int(len(python_code) * 0.9)]
+val_split = python_code[int(len(python_code) * 0.9):]
+
+# Add the train split to the beginning, and then the val split at the end, so that
+# the code below to create the train/val splits works as expected.
+# In the industry, this is what we call a "insane awful hack".
+data = train_split + data + val_split
+
 print(f"length of dataset in characters: {len(data):,}")
 
 # get all the unique characters that occur in this text
@@ -60,9 +77,9 @@ def decode(l):
 with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
     pickle.dump(meta, f)
 
-# length of dataset in characters: 1115394
+# length of dataset in characters: 1,217,175
 # all the unique characters:
-# !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
-# vocab size: 65
-# train has 1003854 tokens
-# val has 111540 tokens
+# !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+# vocab size: 96
+# train has 1,095,457 tokens
+# val has 121,718 tokens

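To make the ordering trick concrete, a short sketch, not part of the diff. It assumes the code further down in prepare.py splits `data` roughly 90/10 by character position, which is what the reported train/val token counts suggest:

    # data = python_train + shakespeare + python_val
    # train = data[:0.9 * n]  -> starts with the Python "train" text
    # val   = data[0.9 * n:]  -> ends with the Python "val" text
    # so both resulting splits contain some Python code as well as Shakespeare.
    python_code = "def run():\n    return 1\n"   # stand-in for the parquet text
    shakespeare = "O Romeo, Romeo! wherefore art thou Romeo?\n"
    train_split = python_code[:int(len(python_code) * 0.9)]
    val_split = python_code[int(len(python_code) * 0.9):]
    data = train_split + shakespeare + val_split
    n = len(data)
    train_data, val_data = data[:int(n * 0.9)], data[int(n * 0.9):]
    assert train_data.startswith(train_split) and val_data.endswith(val_split)
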
transformer/model.py

Lines changed: 7 additions & 6 deletions
@@ -260,7 +260,7 @@ def from_pretrained(cls, model_type, override_args=None):
 
         return model
 
-    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
+    def configure_optimizers(self, weight_decay, learning_rate, betas):
         # start with all of the candidate parameters
         param_dict = {pn: p for pn, p in self.named_parameters()}
         # filter out those that do not require grad
@@ -277,12 +277,13 @@ def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
         num_nodecay_params = sum(p.numel() for p in nodecay_params)
         print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
         print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
+
         # Create AdamW optimizer and use the fused version if it is available
-        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
-        use_fused = fused_available and device_type == 'cuda'
-        extra_args = dict(fused=True) if use_fused else dict()
-        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
-        print(f"using fused AdamW: {use_fused}")
+        # fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
+        # use_fused = fused_available and device_type == 'cuda'
+        # extra_args = dict(fused=True) if use_fused else dict()
+        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, fused=False)
+        # print(f"using fused AdamW: {use_fused}")
 
         return optimizer
 
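Not shown in the diff: any nanoGPT-style training script that calls this method would need the matching signature change. A hypothetical call site, with placeholder hyperparameter values:

    # Hypothetical caller after this commit; the device_type argument is no longer accepted.
    optimizer = model.configure_optimizers(
        weight_decay=1e-1,
        learning_rate=6e-4,
        betas=(0.9, 0.95),
    )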