26 changes: 25 additions & 1 deletion .gitignore
@@ -7,6 +7,30 @@ __pycache__/
*.pkl
*.pt
*.pyc
*.onnx
*.pdf
input.txt
env/
venv/

# Google Cloud service account keys (sensitive!)
*service-account*.json
*gcloud*.json
*key*.json
my-project-*.json

# Other sensitive files
.env
.env.*
credentials.json
config.json

# OCR output directories (generated content)
ocr_free_pdf*/
ocr_google_vision_pdf/206-*/
ocr_google_vision_pdf/consolidated_extracted_text.txt
ocr_google_vision_pdf/consolidation_summary.txt
batch_ocr_free_pdf*/
batch_google_vision*/
*_extracted/
test_extraction/
86 changes: 86 additions & 0 deletions VIRTUAL_ENV.md
@@ -0,0 +1,86 @@
# nanoGPT Virtual Environment Setup

## Create and Activate Virtual Environment

```bash
python3 -m venv nanogpt-env
source nanogpt-env/bin/activate
pip install --upgrade pip
```
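
To double-check from inside Python that the environment is active, note that `sys.prefix` diverges from `sys.base_prefix` inside a venv; a minimal sketch:

```python
import sys

# Inside an activated venv, sys.prefix points at the environment,
# while sys.base_prefix still points at the base interpreter.
print(f"virtual env active: {sys.prefix != sys.base_prefix}")
print(f"sys.prefix: {sys.prefix}")
```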

## Install Dependencies

```bash
pip install torch numpy transformers datasets tiktoken wandb tqdm
```

## Verify Installation

```bash
python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
```
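
Since the training commands below pass `--device=mps`, it is also worth confirming that PyTorch's MPS backend is usable on your machine; a minimal check (the CPU fallback here is an assumption, not something nanoGPT requires):

```python
import torch

# Check the Apple Silicon (MPS) backend used by the --device=mps flag below.
print(f"MPS built: {torch.backends.mps.is_built()}")
print(f"MPS available: {torch.backends.mps.is_available()}")

# Pick a sensible device either way.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"using device: {device}")
```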

## Shakespeare Dataset

### Prepare Data
```bash
python data/shakespeare/prepare.py
```
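
Assuming the standard nanoGPT behavior of `data/shakespeare/prepare.py`, the text is encoded with the GPT-2 BPE tokenizer (via `tiktoken`) and the token ids are written as `uint16` to `train.bin` and `val.bin`; a quick sketch to sanity-check the output:

```python
import numpy as np
import tiktoken

# train.bin holds GPT-2 BPE token ids stored as uint16.
enc = tiktoken.get_encoding("gpt2")
train = np.memmap("data/shakespeare/train.bin", dtype=np.uint16, mode="r")
print(f"train tokens: {len(train):,}")
print(enc.decode(train[:50].tolist()))  # first few tokens, decoded back to text
```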

### Train
```bash
python train.py config/finetune_shakespeare.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-shakespeare --start="ROMEO:" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```

## Shakespeare Character-Level Dataset

### Prepare Data
```bash
python data/shakespeare_char/prepare.py
```
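
The character-level prepare script additionally writes a `meta.pkl` holding the vocabulary and the char-to-int mappings that `sample.py` uses for decoding; a quick way to inspect it:

```python
import pickle

# meta.pkl stores vocab_size plus the stoi/itos mappings for this dataset.
with open("data/shakespeare_char/meta.pkl", "rb") as f:
    meta = pickle.load(f)
print(f"vocab size: {meta['vocab_size']}")
print("characters:", repr("".join(sorted(meta["stoi"]))))
```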

### Train
```bash
python train.py config/train_shakespeare_char.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-shakespeare-char --start="How are you?" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```

## Catholic Bible Character-Level Dataset

### Prepare Data
```bash
python data/catholic_bible/prepare.py
```
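
`data/catholic_bible/prepare.py` is specific to this fork; below is a minimal sketch of what such a char-level prepare script typically looks like, modeled on `data/shakespeare_char/prepare.py` (the `input.txt` filename and 90/10 split are assumptions):

```python
import os
import pickle

import numpy as np

# Sketch of a char-level prepare script, modeled on data/shakespeare_char/prepare.py.
data_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(data_dir, "input.txt"), "r", encoding="utf-8") as f:
    data = f.read()

# Build the character vocabulary and the char<->int mappings.
chars = sorted(set(data))
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

# 90/10 train/val split, encoded as uint16 ids.
n = len(data)
train_ids = np.array([stoi[c] for c in data[: int(n * 0.9)]], dtype=np.uint16)
val_ids = np.array([stoi[c] for c in data[int(n * 0.9):]], dtype=np.uint16)
train_ids.tofile(os.path.join(data_dir, "train.bin"))
val_ids.tofile(os.path.join(data_dir, "val.bin"))

# meta.pkl lets sample.py decode generated ids back into characters.
with open(os.path.join(data_dir, "meta.pkl"), "wb") as f:
    pickle.dump({"vocab_size": len(chars), "itos": itos, "stoi": stoi}, f)
```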

### Train
```bash
python train.py config/train_catholic_bible_char.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-catholic-bible-char --start="In the beginning" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```

## OCR Google Vision Character-Level Dataset

**Note:** The OCR data is prepared in the separate `ocr-to-training-data` repository; place the prepared training data in `data/ocr_google_vision_pdf/`.
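
Before training, it can help to confirm the expected files are in place (assuming the usual nanoGPT char-dataset layout of `train.bin`, `val.bin`, and `meta.pkl`):

```python
from pathlib import Path

# Check for the files train.py and sample.py will look for.
data_dir = Path("data/ocr_google_vision_pdf")
for name in ("train.bin", "val.bin", "meta.pkl"):
    path = data_dir / name
    print(f"{path}: {'ok' if path.exists() else 'MISSING'}")
```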

### Train
```bash
python train.py config/train_ocr_vision_char.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-ocr-vision-char --start="Where was Kennedy?" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```
37 changes: 37 additions & 0 deletions config/train_catholic_bible_char.py
@@ -0,0 +1,37 @@
# train a character-level model on Catholic Bible text
# dataset from Catholic Public Domain Version (CPDV)

out_dir = 'out-catholic-bible-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too often

# we expect to overfit on this dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'catholic-bible-char'
wandb_run_name = 'mini-gpt'

dataset = 'catholic_bible'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # baby networks can afford a slightly higher learning rate
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
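# tokens per iteration = gradient_accumulation_steps * batch_size * block_size
#                      = 1 * 64 * 256 = 16,384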

warmup_iters = 100 # potentially not even necessary at this scale

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model
37 changes: 37 additions & 0 deletions config/train_ocr_vision_char.py
@@ -0,0 +1,37 @@
# train a character-level model on OCR extracted text
# dataset from Google Cloud Vision OCR results

out_dir = 'out-ocr-vision-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too often

# we expect to overfit on this dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'ocr-vision-char'
wandb_run_name = 'mini-gpt'

dataset = 'ocr_google_vision_pdf'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # baby networks can afford a slightly higher learning rate
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
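# tokens per iteration = gradient_accumulation_steps * batch_size * block_size
#                      = 1 * 64 * 256 = 16,384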

warmup_iters = 100 # potentially not even necessary at this scale

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model