local

Life-0-1 · Sep 19, 2022 · 20bc05c · 20bc05c
1 parent cdc7c47
commit 20bc05c
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 9 deletions.
diff --git a/configs/13B_deduped.yml b/configs/13B_deduped.yml
@@ -19,9 +19,8 @@
    "output_layer_parallelism": "column",
 
    # these should provide some speedup but take a while to build; set to true if desired
-   "scaled-upper-triang-masked-softmax-fusion": true,
-   "bias-gelu-fusion": true,
-
+   "scaled-upper-triang-masked-softmax-fusion": false,
+   "bias-gelu-fusion": false,
    # optimizer settings
    "optimizer": {
      "type": "Adam",
@@ -105,5 +104,5 @@
   "wandb_project": "pythia",
   "wandb_group": "13B dedupe",
   "launcher": "srun",
-  "deepspeed_mpi": true
+  "deepspeed_mpi": false 
 }
diff --git a/debug_srun.sh b/debug_srun.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name="dash-neox"
-#SBATCH --partition=compute-od-gpu
+#SBATCH --partition=gpu
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=1          # Crucial - only 1 task per dist per node!
 #SBATCH --cpus-per-task=32           # Number of cores per tasks
@@ -10,6 +10,7 @@
 #SBATCH --output=%x_%j.out  # Set this to the path where Slurm should write stdout
 #SBATCH --error=%x_%j.out  # Set this to the path where Slurm should write stderr (here the same file as stdout)
 #SBATCH --exclusive
+#SBATCH --comment neox
 
 module load intelmpi
 
@@ -53,10 +54,12 @@ TRAIN_PATH=/fsx/dashiell/gpt-neox
 export TORCHELASTIC_ERROR_FILE=$TRAIN_PATH/tmp/torch-elastic-error.json
 
 # Env setup
-source /fsx/dashiell/miniconda3/conda/bin/activate neox
+source /fsx/dashiell/miniconda3/bin/activate neox
 cd $TRAIN_PATH
 
-srun python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \
+which python
+which nvcc
+srun --comment neox python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \
     --conf_dir configs 13B_deduped.yml 
 
 set +x
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -1,9 +1,7 @@
-git+https://github.com/EleutherAI/DeeperSpeed.git@ef9002f62c85efd727aa465d41987deb8e2e54dd#egg=deepspeed
 einops==0.3.0
 ftfy==6.0.1
 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
 lm_eval==0.2.0
-mpi4py==3.0.3
 numpy==1.22.0
 pybind11==2.6.2
 regex