Open
Description
What happened?
Enabling performance metrics for GRPO training, which attempts to count the number of tokens seen, triggers an error in the Hugging Face Transformers `Trainer` code.
Steps to reproduce the bug
Run `oumi launch up -c configs/examples/grpo_tldr/gcp_job.yaml --cluster tiny-qwen-grpo`, but modify the run command to set `--training.include_performance_metrics true`.
This results in the following error:
Traceback (most recent call last):
File "/home/gcpuser/miniconda3/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/gcpuser/miniconda3/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/gcpuser/sky_workdir/src/oumi/__main__.py", line 30, in <module>
run()
File "/home/gcpuser/sky_workdir/src/oumi/cli/main.py", line 123, in run
return app()
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/typer/main.py", line 339, in __call__
raise e
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/typer/main.py", line 322, in __call__
return get_command(self)(*args, **kwargs)
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/click/core.py", line 1161, in __call__
return self.main(*args, **kwargs)
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/typer/core.py", line 740, in main
return _main(
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/typer/core.py", line 195, in _main
rv = self.invoke(ctx)
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/click/core.py", line 1697, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/click/core.py", line 1443, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/click/core.py", line 788, in invoke
return __callback(*args, **kwargs)
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/typer/main.py", line 697, in wrapper
return callback(**use_params)
File "/home/gcpuser/sky_workdir/src/oumi/cli/train.py", line 72, in train
oumi_train(parsed_config)
File "/home/gcpuser/sky_workdir/src/oumi/__init__.py", line 247, in train
return oumi.train.train(config, *kwargs)
File "/home/gcpuser/sky_workdir/src/oumi/train.py", line 405, in train
trainer.train(resume_from_checkpoint=checkpoint_location)
File "/home/gcpuser/sky_workdir/src/oumi/core/trainers/hf_trainer.py", line 40, in train
self._hf_trainer.train(resume_from_checkpoint=resume_from_checkpoint)
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/transformers/trainer.py", line 2171, in train
return inner_training_loop(
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/transformers/trainer.py", line 2227, in _inner_training_loop
self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps
File "/home/gcpuser/miniconda3/lib/python3.10/site-packages/transformers/trainer.py", line 1709, in num_tokens
tokens = batch["input_ids"].numel()
TypeError: list indices must be integers or slices, not str
System Info
Run from source on 3/12/25