
ValueError: Invalid attention arguments: AttnType.self_attn, None #59

Open

Description

@chen-lee-li

Traceback (most recent call last):
  File "/data/lee/Megatron-LM/pretrain_gpt.py", line 158, in <module>
    pretrain(train_valid_test_datasets_provider, model_provider,
  File "/data/lee/Megatron-LM/megatron/training.py", line 129, in pretrain
    model, optimizer, opt_param_scheduler = setup_model_and_optimizer(model_provider,
  File "/data/lee/Megatron-LM/megatron/training.py", line 376, in setup_model_and_optimizer
    model = get_model(model_provider_func, model_type)
  File "/data/lee/Megatron-LM/megatron/training.py", line 262, in get_model
    model = model_provider_func(
  File "/data/lee/Megatron-LM/pretrain_gpt.py", line 35, in model_provider
    model = GPTModel(
  File "/data/lee/Megatron-LM/megatron/model/gpt_model.py", line 74, in __init__
    self.language_model, self._language_model_key = get_language_model(
  File "/data/lee/Megatron-LM/megatron/model/language_model.py", line 75, in get_language_model
    language_model = TransformerLanguageModel(
  File "/data/lee/Megatron-LM/megatron/model/language_model.py", line 373, in __init__
    self.encoder = ParallelTransformer(
  File "/data/lee/Megatron-LM/megatron/model/transformer.py", line 1182, in __init__
    [build_layer(i + 1 + offset) for i in range(self.num_layers)])
  File "/data/lee/Megatron-LM/megatron/model/transformer.py", line 1182, in <listcomp>
    [build_layer(i + 1 + offset) for i in range(self.num_layers)])
  File "/data/lee/Megatron-LM/megatron/model/transformer.py", line 1130, in build_layer
    return ParallelTransformerLayer(
  File "/data/lee/Megatron-LM/megatron/model/transformer.py", line 877, in __init__
    self.self_attention = ParallelAttention(
  File "/data/lee/Megatron-LM/megatron/model/transformer.py", line 590, in __init__
    raise ValueError(f"Invalid attention arguments: {attention_type}, {self.attention_head_type}")
ValueError: Invalid attention arguments: AttnType.self_attn, None
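
The None at the end of the message is self.attention_head_type, so the guard in ParallelAttention fails because the attention-head type was never set from the command line. To find which argument populates it in your checkout (this varies between Megatron-LM forks), one way is to grep the argument definitions; the file path below is the usual location but is an assumption about this fork:

# Locate the argument that sets attention_head_type; the "." in the
# pattern matches both the flag spelling (attention-head-type) and the
# attribute spelling (attention_head_type). Adjust the path if your
# fork defines its arguments elsewhere.
grep -rn "attention.head.type" megatron/arguments.py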

The parameters and launch command are as follows. What could be causing this error?

GPUS_PER_NODE=1
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
CHECKPOINT_PATH=/data/lee/Megatron-LM/experiments/0606
VOCAB_FILE=vocab.json
MERGE_FILE=merges.txt
DATA_PATH=starcoder-abap_content_document
GPT_ARGS="--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 12 \
--global-batch-size 192 \
--lr 0.0005 \
--train-iters 150000 \
--lr-decay-iters 150000 \
--lr-decay-style cosine \
--lr-warmup-iters 2000 \
--weight-decay .1 \
--adam-beta2 .999 \
--fp16 \
--log-interval 10 \
--save-interval 2000 \
--eval-interval 200 \
--eval-iters 10"

# --finetune
TENSORBOARD_ARGS="--tensorboard-dir experiments/tensorboard"


python -m torch.distributed.launch $DISTRIBUTED_ARGS \
        pretrain_gpt.py \
        --tensor-model-parallel-size 1 \
        --pipeline-model-parallel-size 1 \
        $GPT_ARGS \
        --vocab-file $VOCAB_FILE \
        --merge-file $MERGE_FILE \
        --save $CHECKPOINT_PATH \
        --data-path $DATA_PATH \
        $TENSORBOARD_ARGS
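
If this is the bigcode-project Megatron-LM fork (the starcoder data path suggests it), that fork adds an --attention-head-type argument with the values multihead or multiquery, and omitting it leaves attention_head_type as None, which matches the traceback above. A minimal sketch of the fix, assuming that flag name, is to extend GPT_ARGS before launching:

# Hedged sketch: assumes the bigcode-style --attention-head-type flag
# exists in this checkout. "multihead" keeps standard multi-head
# attention; "multiquery" enables multi-query attention (shared
# key/value heads across query heads).
GPT_ARGS="$GPT_ARGS \
    --attention-head-type multihead"

With the flag appended, the same launch command can be rerun unchanged.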
