Update rp effective seq len sampling
mzio committed Sep 19, 2024
1 parent 90abd34 commit 50aabea
Showing 2 changed files with 70 additions and 1 deletion.
@@ -5,7 +5,7 @@ dataset:
       - redpajama/train.json[50000]
     eval_data:
       - redpajama/train.json
-    num_train_samples: 10000 # (8 * 2500) * (1024) = 20M
+    num_train_samples: 10000 # (8 * 2500) * (2048) = 20M
     max_train_samples: 20000
     max_eval_num: 1000
     max_length: 32768
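Note on the token-count comment above: 8 * 2500 * 2048 = 40,960,000, roughly 41M tokens, while the old chunk size gives 8 * 2500 * 1024 = 20,480,000, roughly 20M. The trailing "= 20M" therefore appears to be a leftover from the 1024 setting rather than a recomputed total.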
@@ -0,0 +1,69 @@
dataset:
  name: redpajama_sample_contig
  dataset_config:
    train_data:
      - redpajama/train.json[50000]
    eval_data:
      - redpajama/train.json
    num_train_samples: 10000 # (8 * 2500) * (2048) = 20M
    max_train_samples: 20000
    max_eval_num: 1000
    max_length: 32768
    min_length: 2048
    chat_template: llama-3
    chunk_size: 2048 # sequence length for distilling
    seed: 42
    cache_dir: '/scr-ssd/mzhang/data/long-llm/long-llm/' # Change this to where you want to save
    load_from_cache_file: true
    esl_model_config: base_llama3_8b
    filter_by_esl: true
    dataloaders_dir: '/scr-ssd/mzhang/projects/lolcats/src/dataloaders'

  pretrained_model_config:
    pretrained_model_name_or_path: 'meta-llama/Meta-Llama-3-8B'
    cache_dir: '/scr-ssd/mzhang/models/llama3' # Set this to where you want to save checkpoint weights
  preprocess_config: null
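filter_by_esl and esl_model_config gate which RedPajama documents get sampled, based on their effective sequence length; the actual logic lives in the dataloaders under dataloaders_dir. As a rough, hypothetical sketch, assuming ESL here boils down to a tokenized-length check against min_length / max_length before chunking (function and variable names below are illustrative, not lolcats' API):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')

def keep_by_esl(text: str, min_length: int = 2048, max_length: int = 32768) -> bool:
    """Keep a document only if its tokenized length lies in [min_length, max_length]."""
    n_tokens = len(tokenizer(text)['input_ids'])
    return min_length <= n_tokens <= max_length

# Surviving documents would then be split into chunk_size-token (2048)
# segments, matching the distillation sequence length above.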

dataloader:
  batch_size: 1
  num_workers: 2
  drop_last: false
  pin_memory: true
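These four keys map directly onto PyTorch DataLoader arguments; a minimal sketch, assuming a train_dataset built from the config above:

from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,       # assumed: the chunked RedPajama dataset from above
    batch_size=1,        # one 2048-token chunk per device step
    num_workers=2,       # background workers for loading/collation
    drop_last=False,     # keep the final partial batch
    pin_memory=True,     # page-locked host memory for faster GPU transfers
)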

optimizer:
  optim: adamw_torch_fused
  lr: 1e-4
  weight_decay: 0.0
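adamw_torch_fused is the HuggingFace-style name for PyTorch's fused AdamW kernel; the equivalent direct construction (with model assumed in scope) would be:

import torch

optimizer = torch.optim.AdamW(
    model.parameters(),  # assumed: the Llama 3 8B model loaded above
    lr=1e-4,
    weight_decay=0.0,
    fused=True,          # single fused CUDA kernel for the update step
)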

lr_scheduler:
  lr_scheduler_type: reduce_lr_on_plateau
  mode: min
  factor: 0.1
  patience: 10
  min_lr: 0.00001
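reduce_lr_on_plateau corresponds to torch.optim.lr_scheduler.ReduceLROnPlateau: the learning rate is cut by a factor of 0.1 after 10 evaluations without improvement in the monitored (minimized) metric, with a floor of 1e-5. A sketch:

from torch.optim.lr_scheduler import ReduceLROnPlateau

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                              patience=10, min_lr=1e-5)
# Stepped with the monitored metric after each evaluation, e.g.:
# scheduler.step(eval_loss)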

trainer: # HuggingFace Trainer-like arguments
  name: default_lm
  bf16: true
  train_split: train
  val_split: validation
  num_train_epochs: 2
  gradient_accumulation_steps: 8
  seed: 42
  batch_size: 1
  load_best_model_at_end: true
  greater_is_better: false
  metric_for_best_model: eval/loss # eval/rouge/geometric_mean
  logging_steps: 100
  evaluation_strategy: steps
  max_steps: -1
  eval_steps: 100
  max_eval_batches: null
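With batch_size 1 and gradient_accumulation_steps 8, each optimizer update sees 8 sequences, i.e. 8 * 2048 = 16,384 tokens at chunk_size 2048. A bare-bones sketch of that accumulation pattern (illustrative only, not the default_lm trainer itself; model, train_loader, and optimizer as above):

import torch

accum = 8  # gradient_accumulation_steps
for step, batch in enumerate(train_loader):
    with torch.autocast('cuda', dtype=torch.bfloat16):  # bf16: true
        loss = model(**batch).loss / accum  # normalize across accumulated steps
    loss.backward()
    if (step + 1) % accum == 0:
        optimizer.step()
        optimizer.zero_grad()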

finetune:
  method: lora
  kwargs:
    r: 8
    lora_alpha: 16 # 32
    lora_dropout: 0 # 0.05
    target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
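The finetune block describes a standard LoRA adapter over the four attention projections; in PEFT terms, a sketch of the equivalent setup (not necessarily how lolcats wires it internally):

from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,             # low-rank dimension
    lora_alpha=16,   # scaling; effective scale = lora_alpha / r = 2
    lora_dropout=0.0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, lora_config)  # wraps the base model with adapters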
