26 changes: 25 additions & 1 deletion .gitignore
@@ -7,6 +7,30 @@ __pycache__/
*.pkl
*.pt
*.pyc
*.onnx
*.pdf
input.txt
env/
venv/

# Google Cloud service account keys (sensitive!)
*service-account*.json
*gcloud*.json
*key*.json
my-project-*.json

# Other sensitive files
.env
.env.*
credentials.json
config.json

# OCR output directories (generated content)
ocr_free_pdf*/
ocr_google_vision_pdf/206-*/
ocr_google_vision_pdf/consolidated_extracted_text.txt
ocr_google_vision_pdf/consolidation_summary.txt
batch_ocr_free_pdf*/
batch_google_vision*/
*_extracted/
test_extraction/
86 changes: 86 additions & 0 deletions VIRTUAL_ENV.md
@@ -0,0 +1,86 @@
# nanoGPT Virtual Environment Setup

## Create and Activate Virtual Environment

```bash
python3 -m venv nanogpt-env
source nanogpt-env/bin/activate
pip install --upgrade pip
```
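
To double-check from inside Python that the environment is active, note that `sys.prefix` diverges from `sys.base_prefix` inside a venv; a minimal sketch:

```python
import sys

# Inside an activated venv, sys.prefix points at the environment,
# while sys.base_prefix still points at the base interpreter.
print(f"virtual env active: {sys.prefix != sys.base_prefix}")
print(f"sys.prefix: {sys.prefix}")
```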

## Install Dependencies

```bash
pip install torch numpy transformers datasets tiktoken wandb tqdm
```

## Verify Installation

```bash
python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
```
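
Since the training commands below pass `--device=mps`, it is also worth confirming that PyTorch's MPS backend is usable on your machine; a minimal check (the CPU fallback here is an assumption, not something nanoGPT requires):

```python
import torch

# Check the Apple Silicon (MPS) backend used by the --device=mps flag below.
print(f"MPS built: {torch.backends.mps.is_built()}")
print(f"MPS available: {torch.backends.mps.is_available()}")

# Pick a sensible device either way.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"using device: {device}")
```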

## Shakespeare Dataset

### Prepare Data
```bash
python data/shakespeare/prepare.py
```
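
Assuming the standard nanoGPT behavior of `data/shakespeare/prepare.py`, the text is encoded with the GPT-2 BPE tokenizer (via `tiktoken`) and the token ids are written as `uint16` to `train.bin` and `val.bin`; a quick sketch to sanity-check the output:

```python
import numpy as np
import tiktoken

# train.bin holds GPT-2 BPE token ids stored as uint16.
enc = tiktoken.get_encoding("gpt2")
train = np.memmap("data/shakespeare/train.bin", dtype=np.uint16, mode="r")
print(f"train tokens: {len(train):,}")
print(enc.decode(train[:50].tolist()))  # first few tokens, decoded back to text
```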

### Train
```bash
python train.py config/finetune_shakespeare.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-shakespeare --start="ROMEO:" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```

## Shakespeare Character-Level Dataset

### Prepare Data
```bash
python data/shakespeare_char/prepare.py
```
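
The character-level prepare script additionally writes a `meta.pkl` holding the vocabulary and the char-to-int mappings that `sample.py` uses for decoding; a quick way to inspect it:

```python
import pickle

# meta.pkl stores vocab_size plus the stoi/itos mappings for this dataset.
with open("data/shakespeare_char/meta.pkl", "rb") as f:
    meta = pickle.load(f)
print(f"vocab size: {meta['vocab_size']}")
print("characters:", repr("".join(sorted(meta["stoi"]))))
```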

### Train
```bash
python train.py config/train_shakespeare_char.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-shakespeare-char --start="How are you?" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```

## Catholic Bible Character-Level Dataset

### Prepare Data
```bash
python data/catholic_bible/prepare.py
```
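
`data/catholic_bible/prepare.py` is specific to this fork; below is a minimal sketch of what such a char-level prepare script typically looks like, modeled on `data/shakespeare_char/prepare.py` (the `input.txt` filename and 90/10 split are assumptions):

```python
import os
import pickle

import numpy as np

# Sketch of a char-level prepare script, modeled on data/shakespeare_char/prepare.py.
data_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(data_dir, "input.txt"), "r", encoding="utf-8") as f:
    data = f.read()

# Build the character vocabulary and the char<->int mappings.
chars = sorted(set(data))
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

# 90/10 train/val split, encoded as uint16 ids.
n = len(data)
train_ids = np.array([stoi[c] for c in data[: int(n * 0.9)]], dtype=np.uint16)
val_ids = np.array([stoi[c] for c in data[int(n * 0.9):]], dtype=np.uint16)
train_ids.tofile(os.path.join(data_dir, "train.bin"))
val_ids.tofile(os.path.join(data_dir, "val.bin"))

# meta.pkl lets sample.py decode generated ids back into characters.
with open(os.path.join(data_dir, "meta.pkl"), "wb") as f:
    pickle.dump({"vocab_size": len(chars), "itos": itos, "stoi": stoi}, f)
```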

### Train
```bash
python train.py config/train_catholic_bible_char.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-catholic-bible-char --start="In the beginning" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```

## OCR Google Vision Character-Level Dataset

**Note:** The OCR data is prepared in the separate `ocr-to-training-data` repository; place the prepared training data in `data/ocr_google_vision_pdf/`.
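
Before training, it can help to confirm the expected files are in place (assuming the usual nanoGPT char-dataset layout of `train.bin`, `val.bin`, and `meta.pkl`):

```python
from pathlib import Path

# Check for the files train.py and sample.py will look for.
data_dir = Path("data/ocr_google_vision_pdf")
for name in ("train.bin", "val.bin", "meta.pkl"):
    path = data_dir / name
    print(f"{path}: {'ok' if path.exists() else 'MISSING'}")
```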

### Train
```bash
python train.py config/train_ocr_vision_char.py --device=mps --compile=False
```

### Generate Sample
```bash
python sample.py --out_dir=out-ocr-vision-char --start="Where was Kennedy?" --num_samples=1 --max_new_tokens=100 --temperature=0.8
```
37 changes: 37 additions & 0 deletions config/train_catholic_bible_char.py
@@ -0,0 +1,37 @@
# train a character-level model on Catholic Bible text
# dataset from Catholic Public Domain Version (CPDV)

out_dir = 'out-catholic-bible-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too often

# we expect to overfit on this dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'catholic-bible-char'
wandb_run_name = 'mini-gpt'

dataset = 'catholic_bible'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # baby networks can afford a slightly higher learning rate
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
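# tokens per iteration = gradient_accumulation_steps * batch_size * block_size
#                      = 1 * 64 * 256 = 16,384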

warmup_iters = 100 # potentially not even necessary at this scale

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model
37 changes: 37 additions & 0 deletions config/train_ocr_vision_char.py
@@ -0,0 +1,37 @@
# train a character-level model on OCR extracted text
# dataset from Google Cloud Vision OCR results

out_dir = 'out-ocr-vision-char'
eval_interval = 250 # keep frequent because we'll overfit
eval_iters = 200
log_interval = 10 # don't print too often

# we expect to overfit on this dataset, so only save when val improves
always_save_checkpoint = False

wandb_log = False # override via command line if you like
wandb_project = 'ocr-vision-char'
wandb_run_name = 'mini-gpt'

dataset = 'ocr_google_vision_pdf'
gradient_accumulation_steps = 1
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # baby networks can afford a slightly higher learning rate
max_iters = 5000
lr_decay_iters = 5000 # make equal to max_iters usually
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small
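# tokens per iteration = gradient_accumulation_steps * batch_size * block_size
#                      = 1 * 64 * 256 = 16,384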

warmup_iters = 100 # potentially not even necessary at this scale

# on macbook also add
# device = 'cpu' # run on cpu only
# compile = False # do not torch compile the model