Describe the bug
I tried to quantize Qwen1.5-MoE-A2.7B-Chat to W4A16 for the vLLM PR vllm-project/vllm#7766, but the run fails with TypeError: forward() got multiple values for argument 'attention_mask'.
Expected behavior
Quantization completes and a W4A16 compressed checkpoint is saved without errors.
Environment
Include all relevant environment information:
- OS: Red Hat 7
- Python version: 3.8
- LLM Compressor version or commit hash: main (3fb4212f)
- ML framework version(s): torch 2.4
- Other Python package versions (vLLM, compressed-tensors, numpy, ONNX):
- Other relevant environment information: CUDA 12.1
To Reproduce
Exact steps to reproduce the behavior:
My script:
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import \
    calculate_offload_device_map

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 4096
data_path = "/path/to/your/data_file.jsonl"
pretrained_model_dir = "/path/to/your/model_directory"


def load_data(tokenizer: AutoTokenizer, batch_size=5, num_calibration_samples=512) -> Dataset:
    with open(data_path) as f:
        # omit
        pass
    return ds


if __name__ == "__main__":
    device_map = calculate_offload_device_map(
        pretrained_model_dir,
        reserve_for_hessians=False,
        num_gpus=1,
        torch_dtype="auto",
    )
    model = SparseAutoModelForCausalLM.from_pretrained(
        pretrained_model_dir, device_map=device_map, torch_dtype="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir)
    ds = load_data(tokenizer)

    # Configure the quantization algorithm to run.
    recipe = [
        GPTQModifier(
            targets="Linear",
            scheme="W4A16",
            ignore=["lm_head", "re:.*gate_proj"],
            sequential_update=True,
        ),
    ]

    # Apply quantization.
    SAVE_DIR = "/path/to/save/compressed_model"
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        save_compressed=True,
        output_dir=SAVE_DIR,
    )
    tokenizer.save_pretrained(SAVE_DIR)

    print("========== SAMPLE GENERATION ==============")
    SAMPLE_INPUT = ["Sample input for testing."]
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir)
    inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
    output = model.generate(**inputs, max_length=50)
    text_output = tokenizer.batch_decode(output)
    print(text_output)
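
For reference, here is a quick standalone check of which module names the ignore list catches, using names taken from the model architecture printed in the logs below. The is_ignored helper is my own approximation of the recipe's matching semantics, not LLM Compressor's implementation:

import re

ignore_patterns = ["lm_head", "re:.*gate_proj"]

# Example module names taken from the Qwen2MoeForCausalLM print-out in the logs.
names = [
    "lm_head",
    "model.layers.0.self_attn.q_proj",
    "model.layers.0.mlp.gate",                      # MoE router, not a gate_proj
    "model.layers.0.mlp.experts.59.gate_proj",
    "model.layers.0.mlp.shared_expert.gate_proj",
    "model.layers.0.mlp.shared_expert_gate",
]

def is_ignored(name: str) -> bool:
    # Approximation: plain entries match the module name directly (or as a
    # dotted suffix); "re:" entries are treated as regular expressions.
    for entry in ignore_patterns:
        if entry.startswith("re:"):
            if re.match(entry[len("re:"):], name):
                return True
        elif name == entry or name.endswith("." + entry):
            return True
    return False

for name in names:
    print(f"{name}: {'ignored' if is_ignored(name) else 'quantized'}")

This prints "ignored" for lm_head and both gate_proj variants and "quantized" for the rest, which is consistent with the "Skipping unquantized layer" lines in the log below.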
Errors
Full console output and traceback:
Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00, 2.20it/s]
2024-09-20T13:52:08.810665+0800 | download_model_directory | DEBUG - Model directory already exists locally.
Loading checkpoint shards: 100%|██████████| 6/6 [00:05<00:00, 1.00it/s]
2024-09-20T13:52:22.289845+0800 | infer_recipe_from_model_path | DEBUG - No recipe found in the model_path:
2024-09-20T13:52:22.648264+0800 | main | WARNING - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: False
2024-09-20T13:52:22.649009+0800 | main | INFO - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
clear_sparse_session=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_oneshot=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=Qwen1.5-MoE-W4A16-G128/runs/Sep20_13-52-22_kmaker-54-011064081248,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=3.0,
oneshot_device=cuda:0,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=Qwen1.5-MoE-W4A16-G128,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=8,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
recipe=[GPTQModifier(index=None, group=None, start=None, end=None, update=None, initialized_structure_=False, initialized_=False, finalized_=False, started_=False, ended_=False, sequential_update=True, targets='Linear', sequential_targets=None, block_size=128, quantize=True, dampening_frac=0.01, config_groups=None, ignore=['lm_head', 're:.*gate_proj'], disable_quantization_observer_epoch=None, num_calibration_steps=None, scheme='W4A16', model=None, layer_compressors_=None, compressible_layers_=None, quantization_modifier_=None)],
recipe_args=None,
remove_unused_columns=True,
report_to=[],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=Qwen1.5-MoE-W4A16-G128,
run_stages=False,
save_compressed=True,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=500,
save_strategy=steps,
save_total_limit=None,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
)
2024-09-20T13:52:22.796774+0800 | pre_initialize_structure | DEBUG - Pre-initializing structure
2024-09-20T13:52:22.796940+0800 | _check_create_state | DEBUG - Creating new State instance for compression lifecycle
2024-09-20T13:52:22.797014+0800 | _check_create_state | INFO - State created for compression lifecycle
2024-09-20T13:52:22.797048+0800 | update | DEBUG - Updating state with provided parameters: {'model': Qwen2MoeForCausalLM(
(model): Qwen2MoeModel(
(embed_tokens): Embedding(151936, 2048)
(layers): ModuleList(
(0-23): 24 x Qwen2MoeDecoderLayer(
(self_attn): Qwen2MoeSdpaAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=True)
(k_proj): Linear(in_features=2048, out_features=2048, bias=True)
(v_proj): Linear(in_features=2048, out_features=2048, bias=True)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): Qwen2MoeRotaryEmbedding()
)
(mlp): Qwen2MoeSparseMoeBlock(
(gate): Linear(in_features=2048, out_features=60, bias=False)
(experts): ModuleList(
(0-59): 60 x Qwen2MoeMLP(
(gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
(up_proj): Linear(in_features=2048, out_features=1408, bias=False)
(down_proj): Linear(in_features=1408, out_features=2048, bias=False)
(act_fn): SiLU()
)
)
(shared_expert): Qwen2MoeMLP(
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
(act_fn): SiLU()
)
(shared_expert_gate): Linear(in_features=2048, out_features=1, bias=False)
)
(input_layernorm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
(post_attention_layernorm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
)
)
(norm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
)
(lm_head): Linear(in_features=2048, out_features=151936, bias=False)
), 'teacher_model': None, 'optimizer': None, 'attach_optim_callbacks': True, 'train_data': None, 'val_data': None, 'test_data': None, 'calib_data': None, 'copy_data': True, 'start': None, 'steps_per_epoch': None, 'batches_per_step': None, 'loggers': None, 'model_log_cadence': None, 'kwargs': {'recipe': None, 'recipe_stage': None, 'recipe_args': None}}
2024-09-20T13:52:22.848239+0800 | pre_initialize_structure | INFO - Compression lifecycle structure pre-initialized for 0 modifiers
2024-09-20T13:52:22.848332+0800 | pre_initialize_structure | DEBUG - Pre-initializing structure
2024-09-20T13:52:22.848361+0800 | update | DEBUG - Updating state with provided parameters: {'model': Qwen2MoeForCausalLM(
(model): Qwen2MoeModel(
(embed_tokens): Embedding(151936, 2048)
(layers): ModuleList(
(0-23): 24 x Qwen2MoeDecoderLayer(
(self_attn): Qwen2MoeSdpaAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=True)
(k_proj): Linear(in_features=2048, out_features=2048, bias=True)
(v_proj): Linear(in_features=2048, out_features=2048, bias=True)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): Qwen2MoeRotaryEmbedding()
)
(mlp): Qwen2MoeSparseMoeBlock(
(gate): Linear(in_features=2048, out_features=60, bias=False)
(experts): ModuleList(
(0-59): 60 x Qwen2MoeMLP(
(gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
(up_proj): Linear(in_features=2048, out_features=1408, bias=False)
(down_proj): Linear(in_features=1408, out_features=2048, bias=False)
(act_fn): SiLU()
)
)
(shared_expert): Qwen2MoeMLP(
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
(act_fn): SiLU()
)
(shared_expert_gate): Linear(in_features=2048, out_features=1, bias=False)
)
(input_layernorm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
(post_attention_layernorm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
)
)
(norm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
)
(lm_head): Linear(in_features=2048, out_features=151936, bias=False)
), 'teacher_model': None, 'optimizer': None, 'attach_optim_callbacks': True, 'train_data': None, 'val_data': None, 'test_data': None, 'calib_data': None, 'copy_data': True, 'start': None, 'steps_per_epoch': None, 'batches_per_step': None, 'loggers': None, 'model_log_cadence': None, 'kwargs': {'recipe': None, 'recipe_stage': None, 'recipe_args': None}}
2024-09-20T13:52:22.861091+0800 | pre_initialize_structure | INFO - Compression lifecycle structure pre-initialized for 0 modifiers
Detected kernel version 4.9.151, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
2024-09-20T13:52:23.004311+0800 | one_shot | INFO - *** One Shot ***
2024-09-20T13:52:23.008081+0800 | initialize | DEBUG - Initializing compression lifecycle
2024-09-20T13:52:23.008160+0800 | update | DEBUG - Updating state with provided parameters: {'model': Qwen2MoeForCausalLM(
(model): Qwen2MoeModel(
(embed_tokens): Embedding(151936, 2048)
(layers): ModuleList(
(0-23): 24 x Qwen2MoeDecoderLayer(
(self_attn): Qwen2MoeSdpaAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=True)
(k_proj): Linear(in_features=2048, out_features=2048, bias=True)
(v_proj): Linear(in_features=2048, out_features=2048, bias=True)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): Qwen2MoeRotaryEmbedding()
)
(mlp): Qwen2MoeSparseMoeBlock(
(gate): Linear(in_features=2048, out_features=60, bias=False)
(experts): ModuleList(
(0-59): 60 x Qwen2MoeMLP(
(gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
(up_proj): Linear(in_features=2048, out_features=1408, bias=False)
(down_proj): Linear(in_features=1408, out_features=2048, bias=False)
(act_fn): SiLU()
)
)
(shared_expert): Qwen2MoeMLP(
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
(act_fn): SiLU()
)
(shared_expert_gate): Linear(in_features=2048, out_features=1, bias=False)
)
(input_layernorm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
(post_attention_layernorm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
)
)
(norm): Qwen2MoeRMSNorm((2048,), eps=1e-06)
)
(lm_head): Linear(in_features=2048, out_features=151936, bias=False)
), 'teacher_model': None, 'optimizer': None, 'attach_optim_callbacks': True, 'train_data': None, 'val_data': None, 'test_data': None, 'calib_data': <accelerate.data_loader.DataLoaderShard object at 0x7f9c93359e20>, 'copy_data': False, 'start': -1, 'steps_per_epoch': None, 'batches_per_step': None, 'loggers': None, 'model_log_cadence': None, 'kwargs': {'recipe': [GPTQModifier(index=None, group=None, start=None, end=None, update=None, initialized_structure_=False, initialized_=False, finalized_=False, started_=False, ended_=False, sequential_update=True, targets='Linear', sequential_targets=None, block_size=128, quantize=True, dampening_frac=0.01, config_groups=None, ignore=['lm_head', 're:.*gate_proj'], disable_quantization_observer_epoch=None, num_calibration_steps=None, scheme='W4A16', model=None, layer_compressors_=None, compressible_layers_=None, quantization_modifier_=None)], 'recipe_stage': None, 'recipe_args': None, 'accelerator': <accelerate.accelerator.Accelerator object at 0x7f9c92fceb80>, 'min_tokens_per_module': None}}
2024-09-20T13:52:23.021760+0800 | current_index | DEBUG - Setting current index: -1
2024-09-20T13:52:23.022020+0800 | from_modifiers | INFO - Creating recipe from modifiers
2024-09-20T13:52:23.057445+0800 | create_instance | DEBUG - Could not initialize recipe as a file path or zoo stub, attempting to process as a string.
2024-09-20T13:52:23.057497+0800 | create_instance | DEBUG - Input string: DEFAULT_stage:
DEFAULT_modifiers:
GPTQModifier:
sequential_update: true
targets: Linear
ignore:
- lm_head
- re:.*gate_proj
scheme: W4A16
2024-09-20T13:52:23.058327+0800 | _check_compile_recipe | DEBUG - Compiling recipe and creating modifiers for compression lifecycle
2024-09-20T13:52:27.715571+0800 | _check_compile_recipe | INFO - Recipe compiled and 1 modifiers created
2024-09-20T13:52:27.732833+0800 | on_initialize_structure | WARNING - GPTQ quantization is set to True without an active quantization modifier.
2024-09-20T13:52:27.732946+0800 | _build_quant_modifier | INFO - Building quantization modifier with args: {'targets': 'Linear', 'scheme': 'W4A16', 'ignore': ['lm_head', 're:.*gate_proj']}
2024-09-20T13:52:27.958224+0800 | _check_calibration_data | INFO - Skipping QuantizationModifier calibration, it is not required for the provided quantization config.
2024-09-20T13:52:33.744165+0800 | _check_token_distribution | DEBUG - Skipping token distribution check. No calibration data.
2024-09-20T13:52:33.944612+0800 | initialize_compression | INFO - Preparing model.layers.0 for compression
2024-09-20T13:52:33.944802+0800 | initialize_compression | INFO - Preparing model.layers.1 for compression
2024-09-20T13:52:33.944834+0800 | initialize_compression | INFO - Preparing model.layers.2 for compression
2024-09-20T13:52:33.944858+0800 | initialize_compression | INFO - Preparing model.layers.3 for compression
2024-09-20T13:52:33.944879+0800 | initialize_compression | INFO - Preparing model.layers.4 for compression
2024-09-20T13:52:33.944902+0800 | initialize_compression | INFO - Preparing model.layers.5 for compression
2024-09-20T13:52:33.944919+0800 | initialize_compression | INFO - Preparing model.layers.6 for compression
2024-09-20T13:52:33.944936+0800 | initialize_compression | INFO - Preparing model.layers.7 for compression
2024-09-20T13:52:33.944952+0800 | initialize_compression | INFO - Preparing model.layers.8 for compression
2024-09-20T13:52:33.944968+0800 | initialize_compression | INFO - Preparing model.layers.9 for compression
2024-09-20T13:52:33.944983+0800 | initialize_compression | INFO - Preparing model.layers.10 for compression
2024-09-20T13:52:33.944999+0800 | initialize_compression | INFO - Preparing model.layers.11 for compression
2024-09-20T13:52:33.945014+0800 | initialize_compression | INFO - Preparing model.layers.12 for compression
2024-09-20T13:52:33.945031+0800 | initialize_compression | INFO - Preparing model.layers.13 for compression
2024-09-20T13:52:33.945046+0800 | initialize_compression | INFO - Preparing model.layers.14 for compression
2024-09-20T13:52:33.945063+0800 | initialize_compression | INFO - Preparing model.layers.15 for compression
2024-09-20T13:52:33.945078+0800 | initialize_compression | INFO - Preparing model.layers.16 for compression
2024-09-20T13:52:33.945093+0800 | initialize_compression | INFO - Preparing model.layers.17 for compression
2024-09-20T13:52:33.945109+0800 | initialize_compression | INFO - Preparing model.layers.18 for compression
2024-09-20T13:52:33.945124+0800 | initialize_compression | INFO - Preparing model.layers.19 for compression
2024-09-20T13:52:33.945140+0800 | initialize_compression | INFO - Preparing model.layers.20 for compression
2024-09-20T13:52:33.945155+0800 | initialize_compression | INFO - Preparing model.layers.21 for compression
2024-09-20T13:52:33.945170+0800 | initialize_compression | INFO - Preparing model.layers.22 for compression
2024-09-20T13:52:33.945184+0800 | initialize_compression | INFO - Preparing model.layers.23 for compression
2024-09-20T13:52:33.945285+0800 | apply_compression | INFO - Running GPTQModifier calibration with 512 samples...
0%| | 0/512 [00:00<?, ?it/s]
0%| | 1/512 [00:00<01:47, 4.76it/s]
31%|███▏ | 161/512 [00:00<00:00, 649.11it/s]
64%|██████▍ | 329/512 [00:00<00:00, 1025.27it/s]
97%|█████████▋| 495/512 [00:00<00:00, 1241.57it/s]
100%|██████████| 512/512 [00:00<00:00, 983.32it/s]
2024-09-20T13:52:34.496359+0800 | apply_compression | INFO -
===== Compressing layer 1/24 =====
2024-09-20T13:52:35.313237+0800 | apply_compression | INFO - Calibrating model.layers.0...
100%|██████████| 512/512 [00:15<00:00, 32.43it/s]
2024-09-20T13:52:51.103512+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.self_attn.q_proj...
2024-09-20T13:52:51.674928+0800 | compress | METRIC - time 0.57
2024-09-20T13:52:51.675339+0800 | compress | METRIC - error 5.33
2024-09-20T13:52:51.675625+0800 | compress | METRIC - GPU 0 | usage: 80.41% | total memory: 44 GB
2024-09-20T13:52:51.675705+0800 | compress | METRIC - Compressed layer size: 8.09765625 MB
done
......
2024-09-20T13:53:42.418176+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.mlp.experts.59.gate_proj...
2024-09-20T13:53:42.418225+0800 | compress | DEBUG - Skipping unquantized layer model.layers.0.mlp.experts.59.gate_proj...
done
2024-09-20T13:53:42.418279+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.mlp.experts.59.up_proj...
2024-09-20T13:53:42.963659+0800 | compress | METRIC - time 0.55
2024-09-20T13:53:42.964437+0800 | compress | METRIC - error 0.65
2024-09-20T13:53:42.964647+0800 | compress | METRIC - GPU 0 | usage: 80.41% | total memory: 44 GB
2024-09-20T13:53:42.964758+0800 | compress | METRIC - Compressed layer size: 5.564453125 MB
done
2024-09-20T13:53:42.965041+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.mlp.experts.59.down_proj...
2024-09-20T13:53:43.254784+0800 | compress | METRIC - time 0.29
2024-09-20T13:53:43.255255+0800 | compress | METRIC - error 0.04
2024-09-20T13:53:43.255396+0800 | compress | METRIC - GPU 0 | usage: 80.41% | total memory: 44 GB
2024-09-20T13:53:43.255461+0800 | compress | METRIC - Compressed layer size: 5.564453125 MB
done
2024-09-20T13:53:43.255686+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.mlp.shared_expert.gate_proj...
2024-09-20T13:53:43.255744+0800 | compress | DEBUG - Skipping unquantized layer model.layers.0.mlp.shared_expert.gate_proj...
done
2024-09-20T13:53:43.255812+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.mlp.shared_expert.up_proj...
2024-09-20T13:53:43.723472+0800 | compress | METRIC - time 0.47
2024-09-20T13:53:43.723944+0800 | compress | METRIC - error 102.56
2024-09-20T13:53:43.724085+0800 | compress | METRIC - GPU 0 | usage: 80.73% | total memory: 44 GB
2024-09-20T13:53:43.724150+0800 | compress | METRIC - Compressed layer size: 22.2578125 MB
done
2024-09-20T13:53:43.724344+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.mlp.shared_expert.down_proj...
2024-09-20T13:53:44.963208+0800 | compress | METRIC - time 1.24
2024-09-20T13:53:44.964259+0800 | compress | METRIC - error 8.68
2024-09-20T13:53:44.964417+0800 | compress | METRIC - GPU 0 | usage: 81.26% | total memory: 44 GB
2024-09-20T13:53:44.964484+0800 | compress | METRIC - Compressed layer size: 22.2578125 MB
done
2024-09-20T13:53:44.964696+0800 | compress_module | INFO - Compressing model.layers.0.model.layers.0.mlp.shared_expert_gate...
2024-09-20T13:53:45.361770+0800 | compress | METRIC - time 0.40
2024-09-20T13:53:45.361970+0800 | compress | METRIC - error 0.10
2024-09-20T13:53:45.362099+0800 | compress | METRIC - GPU 0 | usage: 81.26% | total memory: 44 GB
2024-09-20T13:53:45.362160+0800 | compress | METRIC - Compressed layer size: 0.0039520263671875 MB
done
2024-09-20T13:53:46.042718+0800 | apply_compression | INFO -
===== Compressing layer 2/24 =====
2024-09-20T13:53:46.831382+0800 | apply_compression | INFO - Calibrating model.layers.1...
0%| | 0/512 [00:00<?, ?it/s]
0%| | 0/512 [00:00<?, ?it/s]
Traceback (most recent call last):
File "llm_compressor.py", line 85, in <module>
oneshot(
File "/ossfs/workspace/llm-compressor/src/llmcompressor/transformers/finetune/text_generation.py", line 76, in oneshot
main(model_args, data_args, training_args)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/transformers/finetune/text_generation.py", line 364, in main
stage_runner.one_shot()
File "/ossfs/workspace/llm-compressor/src/llmcompressor/transformers/finetune/runner.py", line 171, in one_shot
self.trainer.one_shot(calibration_data=calib_data, stage=stage)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/transformers/finetune/session_mixin.py", line 401, in one_shot
apply(
File "/ossfs/workspace/llm-compressor/src/llmcompressor/core/session_functions.py", line 184, in apply
return active_session().apply(
File "/ossfs/workspace/llm-compressor/src/llmcompressor/core/session.py", line 210, in apply
self.initialize(**kwargs)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/core/session.py", line 156, in initialize
mod_data = self._lifecycle.initialize(
File "/ossfs/workspace/llm-compressor/src/llmcompressor/core/lifecycle.py", line 126, in initialize
data = mod.initialize(state=self.state, **extras)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/modifiers/stage.py", line 124, in initialize
modifier.initialize(state, **kwargs)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/modifiers/modifier.py", line 118, in initialize
initialized = self.on_initialize(state=state, **kwargs)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/modifiers/quantization/gptq/base.py", line 187, in on_initialize
self.apply_compression(calibration_dataloader)
File "/root/.virtualenvs/llmcompressor/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/modifiers/quantization/gptq/base.py", line 290, in apply_compression
intermediates = layer_compressor.calibrate_layer(intermediates)
File "/ossfs/workspace/llm-compressor/src/llmcompressor/modifiers/utils/layer_compressor.py", line 135, in calibrate_layer
output = self.layer(*tensors_to_device(args, device), **kwargs)
File "/root/.virtualenvs/llmcompressor/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/root/.virtualenvs/llmcompressor/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
TypeError: forward() got multiple values for argument 'attention_mask'
Additional context
The first decoder layer (layer 1/24) compresses successfully; the error is raised as soon as calibration of model.layers.1 (layer 2/24) begins.
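
The traceback points at layer_compressor.calibrate_layer replaying the cached layer inputs with self.layer(*args, **kwargs). Below is a minimal, self-contained sketch (not LLM Compressor code; ToyDecoderLayer is hypothetical) of how this exact TypeError arises when attention_mask ends up both in the cached positional args and in the keyword args:

import torch
import torch.nn as nn

class ToyDecoderLayer(nn.Module):
    # Stand-in for Qwen2MoeDecoderLayer: attention_mask is the second
    # positional parameter of forward().
    def forward(self, hidden_states, attention_mask=None, **kwargs):
        return hidden_states

layer = ToyDecoderLayer()
hidden_states = torch.zeros(1, 4, 8)
attention_mask = torch.ones(1, 1, 4, 4)

# If the cached inputs hold attention_mask positionally...
args = (hidden_states, attention_mask)
# ...and the same tensor is also forwarded as a keyword argument...
kwargs = {"attention_mask": attention_mask}

try:
    layer(*args, **kwargs)
except TypeError as err:
    # On Python 3.8 this prints:
    # forward() got multiple values for argument 'attention_mask'
    print(err)

I have not dug into where the duplicate binding comes from in the Qwen2-MoE path; the sketch only illustrates the mechanism behind the error message.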