diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index fae9c87542..67b4a0705a 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -53,7 +53,7 @@ jobs: - name: Install remaining dependencies run: | python -m pip install -e ".[dev]" - python -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@fb963f0f0a5b28b69763590bb59676072cf43a01 + python -m pip install lm-eval==0.4.5 - name: Run recipe and unit tests with coverage run: pytest tests --with-integration --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov diff --git a/.github/workflows/recipe_test.yaml b/.github/workflows/recipe_test.yaml index 2b335ec0c1..d5a2dbe790 100644 --- a/.github/workflows/recipe_test.yaml +++ b/.github/workflows/recipe_test.yaml @@ -42,7 +42,7 @@ jobs: run: | python -m pip install torch torchvision torchao python -m pip install -e ".[dev]" - python -m pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@fb963f0f0a5b28b69763590bb59676072cf43a01 + python -m pip install lm-eval==0.4.5 - name: Run recipe tests with coverage run: pytest tests -m integration_test --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov diff --git a/.github/workflows/regression_test.yaml b/.github/workflows/regression_test.yaml index 229371812c..80ee645f47 100644 --- a/.github/workflows/regression_test.yaml +++ b/.github/workflows/regression_test.yaml @@ -56,7 +56,7 @@ jobs: - name: Install remaining dependencies run: | python -m pip install -e ".[dev]" - python -m pip install lm-eval==0.4.* + python -m pip install lm-eval==0.4.5 - name: Run regression tests with coverage run: pytest tests -m slow_integration_test --silence-s3-logs --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..dd5bf558c3 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,9 @@ +cff-version: 1.2.0 +title: "torchtune: PyTorch's finetuning library" +message: "If you use this software, please cite it as below." +type: software +authors: + - given-names: "torchtune maintainers and contributors" +url: "https://github.com/pytorch/torchtune" +license: "BSD-3-Clause" +date-released: "2024-04-14" diff --git a/README.md b/README.md index a66d3ded4c..2b702dc529 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ ![Recipe Integration Test](https://github.com/pytorch/torchtune/actions/workflows/recipe_test.yaml/badge.svg) [![](https://dcbadge.vercel.app/api/server/4Xsdn8Rr9Q?style=flat)](https://discord.gg/4Xsdn8Rr9Q) -[**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Documentation**](https://pytorch.org/torchtune/main/index.html) | [**Community**](#community) | [**License**](#license) +[**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Documentation**](https://pytorch.org/torchtune/main/index.html) | [**Community**](#community) | [**License**](#license) | [**Citing torchtune**](#citing-torchtune) > [!IMPORTANT] > Update September 25, 2024: torchtune has support for **Llama 3.2 11B Vision**, **Llama 3.2 3B**, and **Llama 3.2 1B** models! Try them out by following our installation instructions [here](#Installation), then run any of the text configs [here](recipes/configs/llama3_2) or vision configs [here](recipes/configs/llama3_2_vision).
@@ -282,3 +282,19 @@ We also want to acknowledge some awesome libraries and tools from the ecosystem: ## License torchtune is released under the [BSD 3 license](./LICENSE). However you may have other legal obligations that govern your use of other content, such as the terms of service for third-party models. + + +## Citing torchtune + +If you find the torchtune library useful, please cite it in your work as below. + +```bibtex +@software{torchtune, + title = {torchtune: PyTorch's finetuning library}, + author = {torchtune maintainers and contributors}, + url = {https://github.com/pytorch/torchtune}, + license = {BSD-3-Clause}, + month = apr, + year = {2024} +} +``` diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst index a31082e174..cc9a493147 100644 --- a/docs/source/api_ref_modules.rst +++ b/docs/source/api_ref_modules.rst @@ -14,7 +14,6 @@ Modeling Components and Building Blocks MultiHeadAttention FeedForward KVCache - get_cosine_schedule_with_warmup RotaryPositionalEmbeddings RMSNorm Fp32LayerNorm diff --git a/docs/source/api_ref_training.rst b/docs/source/api_ref_training.rst index 9d402b1e34..980e1d40db 100644 --- a/docs/source/api_ref_training.rst +++ b/docs/source/api_ref_training.rst @@ -74,6 +74,19 @@ Utilities to reduce memory consumption during training. create_optim_in_bwd_wrapper register_optim_in_bwd_hooks +.. _lr_scheduler_label: + +Schedulers +---------- + +Utilities to control the learning rate during the training process. + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + get_cosine_schedule_with_warmup + .. _metric_logging_label: Metric Logging diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index be40d89134..04644093a9 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -128,7 +128,7 @@ For example: with ``batch_size=1`` and ``gradient_accumulation_steps=32`` we get .. note:: For other components in torchtune which use "steps", such as :ref:`metric logging `, or - :func:`learning rate schedulers `, a "step" is counted as a + :func:`learning rate schedulers `, a "step" is counted as a single update to model parameters, rather than a single model forward pass with the data. Suppose ``gradient_accumulation_steps = 4`` and ``log_every_n_steps = 10``. Metrics would be logged every 10 global steps, which translates to every 40 model forward passes.
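To make that definition of a "step" concrete alongside the scheduler's new import path (which the config changes below switch to), here is a minimal, hedged sketch of a training loop; the model, learning rate, and step counts are illustrative assumptions rather than values taken from this diff:

```python
import torch
from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup

# Stand-in model and optimizer; real recipes build these from the YAML configs.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

# A "step" is one optimizer update: with gradient_accumulation_steps = 4,
# four forward/backward passes advance the schedule by a single step.
gradient_accumulation_steps = 4
num_training_steps = 250   # e.g. 1000 forward passes / 4 accumulation steps
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,  # matches the warmup used in the configs below
    num_training_steps=num_training_steps,
)

for _ in range(num_training_steps):
    for _ in range(gradient_accumulation_steps):  # micro-batches
        loss = model(torch.randn(2, 16)).sum() / gradient_accumulation_steps
        loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    scheduler.step()  # advance the cosine schedule once per optimizer update
```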
diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml index 75daa2b454..263e3c12e1 100644 --- a/recipes/configs/code_llama2/7B_lora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml index ab6b4e2b55..4f6fd9be61 100644 --- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/gemma/2B_lora.yaml b/recipes/configs/gemma/2B_lora.yaml index 5364ec2bce..b82faa39e2 100644 --- a/recipes/configs/gemma/2B_lora.yaml +++ b/recipes/configs/gemma/2B_lora.yaml @@ -55,7 +55,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/2B_lora_single_device.yaml b/recipes/configs/gemma/2B_lora_single_device.yaml index 786b0c7f2f..d6e1664b71 100644 --- a/recipes/configs/gemma/2B_lora_single_device.yaml +++ b/recipes/configs/gemma/2B_lora_single_device.yaml @@ -54,7 +54,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/2B_qlora_single_device.yaml b/recipes/configs/gemma/2B_qlora_single_device.yaml index 39ebc088e7..9b24d6c0ee 100644 --- a/recipes/configs/gemma/2B_qlora_single_device.yaml +++ b/recipes/configs/gemma/2B_qlora_single_device.yaml @@ -54,7 +54,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/7B_lora.yaml b/recipes/configs/gemma/7B_lora.yaml index a4ee960c17..6db9b0ab82 100644 --- a/recipes/configs/gemma/7B_lora.yaml +++ b/recipes/configs/gemma/7B_lora.yaml @@ -57,7 +57,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/7B_lora_single_device.yaml b/recipes/configs/gemma/7B_lora_single_device.yaml index 2edeab2047..c82f0b76ba 100644 --- a/recipes/configs/gemma/7B_lora_single_device.yaml +++ b/recipes/configs/gemma/7B_lora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: lr: 5e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/7B_qlora_single_device.yaml 
b/recipes/configs/gemma/7B_qlora_single_device.yaml index 23d7465770..fcbccb786b 100644 --- a/recipes/configs/gemma/7B_qlora_single_device.yaml +++ b/recipes/configs/gemma/7B_qlora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 10 loss: diff --git a/recipes/configs/gemma/evaluation.yaml b/recipes/configs/gemma/evaluation.yaml new file mode 100644 index 0000000000..2ff8f78546 --- /dev/null +++ b/recipes/configs/gemma/evaluation.yaml @@ -0,0 +1,39 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command: +# tune run eleuther_eval --config gemma/evaluation + +# Model Arguments +model: + _component_: torchtune.models.gemma.gemma_2b + +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/gemma-2b + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + output_dir: ./ # Not needed + model_type: GEMMA + +# Tokenizer +tokenizer: + _component_: torchtune.models.gemma.gemma_tokenizer + path: /tmp/gemma-2b/tokenizer.model + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index 267725ab92..d657754139 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 2e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/13B_qlora_single_device.yaml b/recipes/configs/llama2/13B_qlora_single_device.yaml index 539d692382..56431fdff5 100644 --- a/recipes/configs/llama2/13B_qlora_single_device.yaml +++ b/recipes/configs/llama2/13B_qlora_single_device.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml index ff4f56493b..b4d0d9c9a9 100644 --- a/recipes/configs/llama2/70B_lora.yaml +++ b/recipes/configs/llama2/70B_lora.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml index b8ff55c01b..c1de2c2358 100644 --- a/recipes/configs/llama2/70B_qlora.yaml +++ b/recipes/configs/llama2/70B_qlora.yaml @@ -70,7 +70,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml 
b/recipes/configs/llama2/7B_full_low_memory.yaml index beb2248b23..06558009ed 100644 --- a/recipes/configs/llama2/7B_full_low_memory.yaml +++ b/recipes/configs/llama2/7B_full_low_memory.yaml @@ -55,7 +55,7 @@ optimizer: _component_: bitsandbytes.optim.PagedAdamW lr: 1e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 optimizer_in_bwd: True loss: diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index 68e1d302df..2c9a694d7b 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -61,7 +61,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index f6acfcb76e..26f824814f 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.05 lr: 5e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 458a023c36..2ad3988867 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.05 lr: 5e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml index 6608bdc48d..ebaee584c2 100644 --- a/recipes/configs/llama2/7B_lora_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_single_device.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml index 630d1f6357..052cdb9296 100644 --- a/recipes/configs/llama2/7B_qlora.yaml +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -61,7 +61,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml index 062e66d833..0893f48579 100644 --- a/recipes/configs/llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/llama2/7B_qlora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml index 84bed19a02..f3a921f289 100644 --- 
a/recipes/configs/llama3/70B_lora.yaml +++ b/recipes/configs/llama3/70B_lora.yaml @@ -79,7 +79,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_dora.yaml b/recipes/configs/llama3/8B_dora.yaml index 3911e856c2..1265c82c72 100644 --- a/recipes/configs/llama3/8B_dora.yaml +++ b/recipes/configs/llama3/8B_dora.yaml @@ -54,7 +54,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_dora_single_device.yaml b/recipes/configs/llama3/8B_dora_single_device.yaml index 1f91dadda8..0fc0a484dc 100644 --- a/recipes/configs/llama3/8B_dora_single_device.yaml +++ b/recipes/configs/llama3/8B_dora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml index 1d5479ccbc..cd3e3586ce 100644 --- a/recipes/configs/llama3/8B_full_single_device.yaml +++ b/recipes/configs/llama3/8B_full_single_device.yaml @@ -54,7 +54,7 @@ optimizer: _component_: bitsandbytes.optim.PagedAdamW8bit lr: 1e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml index 5c3510f466..d65138f348 100644 --- a/recipes/configs/llama3/8B_lora.yaml +++ b/recipes/configs/llama3/8B_lora.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml index 0d9cb71a16..e49afacbb1 100644 --- a/recipes/configs/llama3/8B_lora_single_device.yaml +++ b/recipes/configs/llama3/8B_lora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_qdora_single_device.yaml b/recipes/configs/llama3/8B_qdora_single_device.yaml index 29a2a2d84f..7180c5a72c 100644 --- a/recipes/configs/llama3/8B_qdora_single_device.yaml +++ b/recipes/configs/llama3/8B_qdora_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml index 0d831a8b77..1eef476d17 100644 --- 
a/recipes/configs/llama3/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3/8B_qlora_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml index 69583dd9d4..6398a840ec 100644 --- a/recipes/configs/llama3_1/405B_qlora.yaml +++ b/recipes/configs/llama3_1/405B_qlora.yaml @@ -58,7 +58,7 @@ optimizer: lr: 3e-4 fused: True lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml index c4fa8d589c..861279127a 100644 --- a/recipes/configs/llama3_1/70B_lora.yaml +++ b/recipes/configs/llama3_1/70B_lora.yaml @@ -78,7 +78,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml index c6e94e0aab..5f101b170f 100644 --- a/recipes/configs/llama3_1/8B_lora.yaml +++ b/recipes/configs/llama3_1/8B_lora.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml index c951abc3a5..3991f728ce 100644 --- a/recipes/configs/llama3_1/8B_lora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml @@ -61,7 +61,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml index 0b3e615bc9..a9b0662105 100644 --- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml @@ -60,7 +60,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml index 1fb0f483b3..228e4989d5 100644 --- a/recipes/configs/llama3_2/1B_lora.yaml +++ b/recipes/configs/llama3_2/1B_lora.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml index c69728ac0d..c9ebed6dc7 100644 --- a/recipes/configs/llama3_2/1B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 
lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml index ca60a687eb..da552b2a0f 100644 --- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml @@ -57,7 +57,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml index 9a628f2c29..d13a303814 100644 --- a/recipes/configs/llama3_2/3B_lora.yaml +++ b/recipes/configs/llama3_2/3B_lora.yaml @@ -60,7 +60,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml index 8fd65dd913..255c75e227 100644 --- a/recipes/configs/llama3_2/3B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml @@ -59,7 +59,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml index 4547459282..360443b9e1 100644 --- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml index c621467582..9cb029666f 100644 --- a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml @@ -74,7 +74,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml index 2c8f1f58fd..ee9180dbcf 100644 --- a/recipes/configs/llama3_2_vision/11B_full.yaml +++ b/recipes/configs/llama3_2_vision/11B_full.yaml @@ -28,6 +28,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml index d42fb971e6..3372c1a540 100644 --- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml +++ 
b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml @@ -30,6 +30,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml index e39ff367ba..357af64496 100644 --- a/recipes/configs/llama3_2_vision/11B_lora.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora.yaml @@ -34,6 +34,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: @@ -64,7 +65,7 @@ optimizer: weight_decay: 0.01 lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml index 827e04a815..f56828c301 100644 --- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml @@ -32,6 +32,7 @@ tokenizer: _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model image_size: 560 + max_seq_len: 8192 # Checkpointer checkpointer: @@ -63,7 +64,7 @@ optimizer: lr: 2e-5 optimizer_in_bwd: False lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/llama3_2_vision/evaluation.yaml b/recipes/configs/llama3_2_vision/evaluation.yaml index 81c0ed3c94..69123d8045 100644 --- a/recipes/configs/llama3_2_vision/evaluation.yaml +++ b/recipes/configs/llama3_2_vision/evaluation.yaml @@ -3,8 +3,8 @@ # This config assumes that you've run the following command before launching: # tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct # -# It also assumes that you've downloaded the EleutherAI Eval Harness: -# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@fb963f0f0a5b28b69763590bb59676072cf43a01 +# It also assumes that you've downloaded the EleutherAI Eval Harness (v0.4.5): +# pip install lm_eval==0.4.5 # # To launch, run the following command from root torchtune directory: # tune run eleuther_eval --config llama3_2_vision/evaluation diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml index fd2c637df7..08196660fc 100644 --- a/recipes/configs/mistral/7B_lora.yaml +++ b/recipes/configs/mistral/7B_lora.yaml @@ -63,7 +63,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/mistral/7B_lora_single_device.yaml b/recipes/configs/mistral/7B_lora_single_device.yaml index ccfb0c2cd4..2ebc9f798e 100644 --- a/recipes/configs/mistral/7B_lora_single_device.yaml +++ b/recipes/configs/mistral/7B_lora_single_device.yaml @@ -60,7 +60,7 @@ optimizer: lr: 2e-5 
lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/mistral/7B_qlora_single_device.yaml b/recipes/configs/mistral/7B_qlora_single_device.yaml index 0e2fa20d94..3bbfebe3ba 100644 --- a/recipes/configs/mistral/7B_qlora_single_device.yaml +++ b/recipes/configs/mistral/7B_qlora_single_device.yaml @@ -61,7 +61,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/mistral/evaluation.yaml b/recipes/configs/mistral/evaluation.yaml new file mode 100644 index 0000000000..61d69dcb40 --- /dev/null +++ b/recipes/configs/mistral/evaluation.yaml @@ -0,0 +1,41 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command: +# tune run eleuther_eval --config mistral/evaluation + +# Model Arguments +model: + _component_: torchtune.models.mistral.mistral_7b + +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Mistral-7B-v0.1/ + checkpoint_files: [ + pytorch_model-00001-of-00002.bin, + pytorch_model-00002-of-00002.bin + ] + output_dir: /tmp/Mistral-7B-v0.1/ + model_type: MISTRAL +resume_from_checkpoint: False + +# Tokenizer +tokenizer: + _component_: torchtune.models.mistral.mistral_tokenizer + path: /tmp/Mistral-7B-v0.1/tokenizer.model + max_seq_len: null + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/phi3/evaluation.yaml b/recipes/configs/phi3/evaluation.yaml new file mode 100644 index 0000000000..ca2f1c9759 --- /dev/null +++ b/recipes/configs/phi3/evaluation.yaml @@ -0,0 +1,42 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command: +# tune run eleuther_eval --config phi3/evaluation + +# Model Arguments +model: + _component_: torchtune.models.phi3.phi3_mini + +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Phi-3-mini-4k-instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Phi-3-mini-4k-instruct + model_type: PHI3_MINI +resume_from_checkpoint: False + +# Tokenizer +tokenizer: + _component_: torchtune.models.phi3.phi3_mini_tokenizer + path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model + max_seq_len: null + +# Environment +device: cuda +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/recipes/configs/phi3/mini_lora.yaml b/recipes/configs/phi3/mini_lora.yaml index 721a61790b..fff05885ef 100644 --- a/recipes/configs/phi3/mini_lora.yaml +++ b/recipes/configs/phi3/mini_lora.yaml @@ -64,7 +64,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + 
_component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/phi3/mini_lora_single_device.yaml b/recipes/configs/phi3/mini_lora_single_device.yaml index 7de8a30c94..b5c14b19ca 100644 --- a/recipes/configs/phi3/mini_lora_single_device.yaml +++ b/recipes/configs/phi3/mini_lora_single_device.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/phi3/mini_qlora_single_device.yaml b/recipes/configs/phi3/mini_qlora_single_device.yaml index 1d2d5c5cbc..10114bc67a 100644 --- a/recipes/configs/phi3/mini_qlora_single_device.yaml +++ b/recipes/configs/phi3/mini_qlora_single_device.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml index 9ccd400897..e0608eba5c 100644 --- a/recipes/configs/qwen2/0.5B_lora.yaml +++ b/recipes/configs/qwen2/0.5B_lora.yaml @@ -60,7 +60,7 @@ optimizer: lr: 2e-3 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml index 343eb8ea14..602c63853a 100644 --- a/recipes/configs/qwen2/0.5B_lora_single_device.yaml +++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml @@ -58,7 +58,7 @@ optimizer: lr: 2e-3 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml index 84fd73696b..a496dade08 100644 --- a/recipes/configs/qwen2/1.5B_lora.yaml +++ b/recipes/configs/qwen2/1.5B_lora.yaml @@ -56,7 +56,7 @@ optimizer: lr: 2e-5 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml index 3e8377b6a1..b41269de1a 100644 --- a/recipes/configs/qwen2/1.5B_lora_single_device.yaml +++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml @@ -56,7 +56,7 @@ optimizer: lr: 2e-3 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml index f6a4cc2ac6..d3b63fd1df 100644 --- a/recipes/configs/qwen2/7B_lora.yaml +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -62,7 +62,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: 
torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml index 8b8d470f6d..6f9fb35b15 100644 --- a/recipes/configs/qwen2/7B_lora_single_device.yaml +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -60,7 +60,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml index 1254b6a33b..9cc894a7e5 100644 --- a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml @@ -67,7 +67,7 @@ optimizer: weight_decay: 0.01 lr: 3e-4 lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 loss: diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index b07a3ad3ae..590e4f902a 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import importlib.metadata import sys import time @@ -13,6 +12,12 @@ import PIL import torch + +from lm_eval.evaluator import evaluate +from lm_eval.models.hf_vlms import HFMultimodalLM +from lm_eval.models.huggingface import HFLM +from lm_eval.tasks import get_task_dict, TaskManager +from lm_eval.utils import make_table from omegaconf import DictConfig from torchtune import config, training, utils @@ -31,40 +36,6 @@ from torchtune.recipe_interfaces import EvalRecipeInterface from torchtune.training import FullModelTorchTuneCheckpointer -try: - import lm_eval -except ImportError: - print( - "You must install the EleutherAI Eval Harness to run this recipe. " - "Please install with `pip install lm_eval>=0.4.2`" - ) - sys.exit(1) - -lm_eval_version = importlib.metadata.version("lm_eval") -if not lm_eval_version >= "0.4.2": - print( - "You must install the EleutherAI Eval Harness >= v0.4.2 to run this recipe. " - "Please install with `pip install lm_eval>=0.4.2`" - ) - sys.exit(1) - -from lm_eval.evaluator import evaluate - -# User doesn't have to have nightlies installed, they just won't be able -# to use the multimodal model -try: - from lm_eval.models.hf_vlms import HFMultimodalLM -except ImportError as e: - # Create a dummy class to avoid having to import the HF models - # TODO (@joecummings): Remove this once v0.4.5 patch is released - class HFMultimodalLM: - def __init__(self, *args, **kwargs): - pass - - -from lm_eval.models.huggingface import HFLM -from lm_eval.tasks import get_task_dict, TaskManager - class _VLMEvalWrapper(HFMultimodalLM): """An EvalWrapper for EleutherAI's eval harness based on gpt-fast's @@ -466,6 +437,16 @@ class EleutherEvalRecipe(EvalRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: + # Double check we have the right Eval Harness version + from importlib.metadata import version + + if version("lm-eval") != "0.4.5": + raise RuntimeError( + "This recipe requires EleutherAI Eval Harness v0.4.5. 
" + "Please install with `pip install lm-eval==0.4.5`" + ) + + # General variable initialization self.device = utils.get_device(device=cfg.device) self.dtype = training.get_dtype(dtype=cfg.dtype, device=self.device) self.logger = utils.get_logger(cfg.get("log_level", "info")) @@ -568,7 +549,7 @@ def evaluate(self) -> None: self.logger.info( f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB" ) - formatted_output = lm_eval.utils.make_table(output) + formatted_output = make_table(output) self.logger.info(f"\n\n{formatted_output}\n") diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index fc4cd2fae2..1c3a7bb65f 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import builtins import math import re import runpy @@ -64,17 +63,10 @@ def test_torchtune_checkpoint_eval_results( out = caplog.text - # v0.4.2 format - # | Tasks |Version|Filter|n-shot|Metric|Value | |Stderr| - # |--------------|------:|------|-----:|------|-----:|---|-----:| - # |truthfulqa_mc2| 2|none | 0|acc |0.4497|± |0.1067| - - # v0.4.3 format + # Format of output is: # | Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| # |--------------|------:|------|-----:|------|---|-----:|---|-----:| # |truthfulqa_mc2| 2|none | 0|acc |↑ |0.4497|± |0.1067| - - # The below RegEx command will pick up both formats search_results = re.search( r"acc(?:_norm)?\s*\|?\s*(?:\↑\s*\|?)?([\d.]+)", out.strip() ) @@ -83,18 +75,20 @@ def test_torchtune_checkpoint_eval_results( assert math.isclose(acc_result, expected_acc, abs_tol=0.05) @pytest.fixture - def hide_available_pkg(self, monkeypatch): - import_orig = builtins.__import__ + def hide_correct_version_number(self, monkeypatch): + import importlib.metadata + + import_orig = importlib.metadata.version def mocked_import(name, *args, **kwargs): - if name == "lm_eval": - raise ImportError() + if name == "lm-eval": + return "0.4.4" # Hardcode wrong version number return import_orig(name, *args, **kwargs) - monkeypatch.setattr(builtins, "__import__", mocked_import) + monkeypatch.setattr(importlib.metadata, "version", mocked_import) @pytest.mark.integration_test - @pytest.mark.usefixtures("hide_available_pkg") + @pytest.mark.usefixtures("hide_correct_version_number") def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir): ckpt = "llama2_tune" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -116,16 +110,17 @@ def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir): device=cpu \ """.split() + model_config = llama2_test_config() + cmd = cmd + model_config + monkeypatch.setattr(sys, "argv", cmd) - with pytest.raises(SystemExit, match="1"): + with pytest.raises( + RuntimeError, + match="This recipe requires EleutherAI Eval Harness v0.4.5. 
" + "Please install with `pip install lm-eval==0.4.5`", + ): runpy.run_path(TUNE_PATH, run_name="__main__") - printed_err = capsys.readouterr().out - assert ( - "You must install the EleutherAI Eval Harness to run this recipe" - in printed_err - ) - @pytest.mark.integration_test def test_eval_recipe_errors_with_quantization_hf_checkpointer( self, capsys, monkeypatch, tmpdir diff --git a/tests/torchtune/_cli/test_download.py b/tests/torchtune/_cli/test_download.py index 5dbd695226..8a6d6ba0ab 100644 --- a/tests/torchtune/_cli/test_download.py +++ b/tests/torchtune/_cli/test_download.py @@ -65,3 +65,44 @@ def test_download_calls_snapshot(self, capsys, monkeypatch, snapshot_download): # Make sure it was called twice assert snapshot_download.call_count == 3 + + # GatedRepoError without --hf-token (expect prompt for token) + def test_gated_repo_error_no_token(self, capsys, monkeypatch, snapshot_download): + model = "meta-llama/Llama-2-7b" + testargs = f"tune download {model}".split() + monkeypatch.setattr(sys, "argv", testargs) + + # Expect GatedRepoError without --hf-token provided + with pytest.raises(SystemExit, match="2"): + runpy.run_path(TUNE_PATH, run_name="__main__") + + out_err = capsys.readouterr() + # Check that error message prompts for --hf-token + assert ( + "It looks like you are trying to access a gated repository." in out_err.err + ) + assert ( + "Please ensure you have access to the repository and have provided the proper Hugging Face API token" + in out_err.err + ) + + # GatedRepoError with --hf-token (should not ask for token) + def test_gated_repo_error_with_token(self, capsys, monkeypatch, snapshot_download): + model = "meta-llama/Llama-2-7b" + testargs = f"tune download {model} --hf-token valid_token".split() + monkeypatch.setattr(sys, "argv", testargs) + + # Expect GatedRepoError with --hf-token provided + with pytest.raises(SystemExit, match="2"): + runpy.run_path(TUNE_PATH, run_name="__main__") + + out_err = capsys.readouterr() + # Check that error message does not prompt for --hf-token again + assert ( + "It looks like you are trying to access a gated repository." in out_err.err + ) + assert "Please ensure you have access to the repository." 
in out_err.err + assert ( + "Please ensure you have access to the repository and have provided the proper Hugging Face API token" + not in out_err.err + ) diff --git a/tests/torchtune/models/clip/test_clip_image_transform.py b/tests/torchtune/models/clip/test_clip_image_transform.py index dd54fbaddd..a29ef83cef 100644 --- a/tests/torchtune/models/clip/test_clip_image_transform.py +++ b/tests/torchtune/models/clip/test_clip_image_transform.py @@ -37,17 +37,6 @@ class TestCLIPImageTransform: "expected_tile_max": [1.0, 1.0], "expected_tile_min": [0.0, 0.0], "expected_aspect_ratio": [1, 2], - "pad_max_tiles": False, - }, - { - "image_size": (100, 400, 3), - "expected_shape": torch.Size([4, 3, 224, 224]), - "resize_to_max_canvas": False, - "expected_tile_means": [0.2230, 0.1763, 0.0, 0.0], - "expected_tile_max": [1.0, 1.0, 0.0, 0.0], - "expected_tile_min": [0.0, 0.0, 0.0, 0.0], - "expected_aspect_ratio": [1, 2], - "pad_max_tiles": True, }, { "image_size": (1000, 300, 3), @@ -57,7 +46,6 @@ class TestCLIPImageTransform: "expected_tile_max": [0.9705, 0.9694, 0.9521, 0.9314], "expected_tile_min": [0.0353, 0.0435, 0.0528, 0.0], "expected_aspect_ratio": [4, 1], - "pad_max_tiles": False, }, { "image_size": (200, 200, 3), @@ -67,7 +55,6 @@ class TestCLIPImageTransform: "expected_tile_max": [0.9922, 0.9926, 0.9970, 0.9908], "expected_tile_min": [0.0056, 0.0069, 0.0059, 0.0033], "expected_aspect_ratio": [2, 2], - "pad_max_tiles": False, "pad_tiles": 1, }, { @@ -78,17 +65,6 @@ class TestCLIPImageTransform: "expected_tile_max": [1.0, 1.0, 1.0], "expected_tile_min": [0.0, 0.0, 0.0], "expected_aspect_ratio": [3, 1], - "pad_max_tiles": False, - }, - { - "image_size": (600, 200, 3), - "expected_shape": torch.Size([4, 3, 224, 224]), - "resize_to_max_canvas": False, - "expected_tile_means": [0.4473, 0.4469, 0.3032, 0.0], - "expected_tile_max": [1.0, 1.0, 1.0, 0.0], - "expected_tile_min": [0.0, 0.0, 0.0, 0.0], - "expected_aspect_ratio": [3, 1], - "pad_max_tiles": True, }, ], ) @@ -103,7 +79,6 @@ def test_clip_image_transform(self, params): resample="bilinear", dtype=torch.float32, resize_to_max_canvas=params["resize_to_max_canvas"], - pad_max_tiles=params["pad_max_tiles"], ) image_transform_inference = CLIPImageTransformInference( @@ -115,7 +90,6 @@ def test_clip_image_transform(self, params): resample="bilinear", resize_to_max_canvas=params["resize_to_max_canvas"], antialias=True, - pad_max_tiles=params["pad_max_tiles"], ) # Generate a deterministic image using np.arange for reproducibility @@ -169,13 +143,7 @@ def test_clip_image_transform(self, params): ), f"Expected aspect ratio {params['expected_aspect_ratio']} but got {tuple(output_ar.numpy())}" # number of tiles matches the product of the aspect ratio - if params["pad_max_tiles"]: - # max_num_tiles=4. 
- assert ( - 4 == output_image.shape[0] - ), f"Expected 4 tiles but got {output_image.shape[0]}" - else: - expected_num_tiles = output_ar[0] * output_ar[1] - assert ( - expected_num_tiles == output_image.shape[0] - ), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}" + expected_num_tiles = output_ar[0] * output_ar[1] + assert ( + expected_num_tiles == output_image.shape[0] + ), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}" diff --git a/tests/torchtune/modules/test_cosine_with_warmup.py b/tests/torchtune/modules/test_cosine_with_warmup.py index 274170e971..1cc4631708 100644 --- a/tests/torchtune/modules/test_cosine_with_warmup.py +++ b/tests/torchtune/modules/test_cosine_with_warmup.py @@ -13,7 +13,7 @@ from tests.test_utils import assert_expected -from torchtune.modules import get_cosine_schedule_with_warmup +from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup class TestCosineLR: diff --git a/torchtune/_cli/download.py b/torchtune/_cli/download.py index b35b81cca0..82b4935c01 100644 --- a/torchtune/_cli/download.py +++ b/torchtune/_cli/download.py @@ -131,12 +131,18 @@ def _download_cmd(self, args: argparse.Namespace) -> None: token=args.hf_token, ) except GatedRepoError: - self._parser.error( - "It looks like you are trying to access a gated repository. Please ensure you " - "have access to the repository and have provided the proper Hugging Face API token " - "using the option `--hf-token` or by running `huggingface-cli login`." - "You can find your token by visiting https://huggingface.co/settings/tokens" - ) + if args.hf_token: + self._parser.error( + "It looks like you are trying to access a gated repository. Please ensure you " + "have access to the repository." + ) + else: + self._parser.error( + "It looks like you are trying to access a gated repository. Please ensure you " + "have access to the repository and have provided the proper Hugging Face API token " + "using the option `--hf-token` or by running `huggingface-cli login`." + "You can find your token by visiting https://huggingface.co/settings/tokens" + ) except RepositoryNotFoundError: self._parser.error( f"Repository '{args.repo_id}' not found on the Hugging Face Hub." diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index ca1ce6150e..3f6697f593 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -328,6 +328,18 @@ class Recipe: name="qwen2/evaluation", file_path="qwen2/evaluation.yaml", ), + Config( + name="gemma/evaluation", + file_path="gemma/evaluation.yaml", + ), + Config( + name="phi3/evaluation", + file_path="phi3/evaluation.yaml", + ), + Config( + name="mistral/evaluation", + file_path="mistral/evaluation.yaml", + ), ], supports_distributed=False, ), diff --git a/torchtune/data/_collate.py b/torchtune/data/_collate.py index 562a81ff8f..055ab77350 100644 --- a/torchtune/data/_collate.py +++ b/torchtune/data/_collate.py @@ -222,6 +222,7 @@ def padded_collate_tiled_images_and_mask( padding_idx: int = 0, ignore_idx: int = CROSS_ENTROPY_IGNORE_IDX, pad_direction: str = "right", + pad_max_tiles: Optional[int] = None, pad_max_images: Optional[int] = None, ) -> Dict[str, torch.Tensor]: """Pad a batch of text sequences, tiled image tensors, aspect ratios, @@ -259,6 +260,8 @@ def padded_collate_tiled_images_and_mask( :func:`torch.nn.utils.rnn.pad_sequence`, otherwise if ``pad_direction="left"``, we use :func:`torchtune.data.left_pad_sequence`. For training, we typically want to pad from the right. 
For inference, we typically want to pad from the left. Defaults to "right". + pad_max_tiles (Optional[int]): Maximum number of tiles to pad to. If None, will pad to the largest number of tiles + in the batch. Defaults to None. pad_max_images (Optional[int]): Maximum number of images to pad to. If None, will pad to the largest number of images in the batch. Defaults to None. @@ -272,6 +275,7 @@ def padded_collate_tiled_images_and_mask( Raises: ValueError: if ``pad_direction`` is not one of "left" or "right". + ValueError: if pad_max_tiles is set to a value less than the largest number of tiles in an image. Example: >>> image_id = 1 @@ -355,6 +359,13 @@ def padded_collate_tiled_images_and_mask( for sample in batch for image in sample["encoder_input"]["images"] ) + if pad_max_tiles is not None: + if pad_max_tiles < max_num_tiles: + raise ValueError( + f"More tiles in image {max_num_tiles}, than pad_max_tiles {pad_max_tiles}" + ) + max_num_tiles = pad_max_tiles + # Second loop: pad images and masks to max number of tiles, max text seq len in batch batch_images = [] batch_masks = [] diff --git a/torchtune/data/_prompt_templates.py b/torchtune/data/_prompt_templates.py index 67167b3ed9..a7fa070a2e 100644 --- a/torchtune/data/_prompt_templates.py +++ b/torchtune/data/_prompt_templates.py @@ -107,16 +107,17 @@ def __call__( """ formatted_dialogue = [] for message in messages: + content = message.content if message.role in self.template: prepend_tag = self.template[message.role][0] append_tag = self.template[message.role][1] - content = ( - [{"type": "text", "content": prepend_tag}] - + message.content - + [{"type": "text", "content": append_tag}] - ) - else: content = message.content + + if isinstance(prepend_tag, str) and len(prepend_tag) > 0: + content = [{"type": "text", "content": prepend_tag}] + content + + if isinstance(append_tag, str) and len(append_tag) > 0: + content = content + [{"type": "text", "content": append_tag}] formatted_dialogue.append( Message( role=message.role, @@ -183,13 +184,20 @@ def __call__( and index == len(messages) - 1 and len(message.text_content) == 0 ): - content = [{"type": "text", "content": prepend_tag}] + message.content + content = message.content + if isinstance(prepend_tag, str) and len(prepend_tag) > 0: + content = [ + {"type": "text", "content": prepend_tag} + ] + message.content else: - content = ( - [{"type": "text", "content": prepend_tag}] - + message.content - + [{"type": "text", "content": append_tag}] - ) + content = message.content + + if isinstance(prepend_tag, str) and len(prepend_tag) > 0: + content = [{"type": "text", "content": prepend_tag}] + content + + if isinstance(append_tag, str) and len(append_tag) > 0: + content = content + [{"type": "text", "content": append_tag}] + formatted_dialogue.append( Message( role=message.role, diff --git a/torchtune/models/clip/_transform.py b/torchtune/models/clip/_transform.py index 533d27c865..a9b60624ff 100644 --- a/torchtune/models/clip/_transform.py +++ b/torchtune/models/clip/_transform.py @@ -15,7 +15,6 @@ find_supported_resolutions, get_canvas_best_fit, ) -from torchtune.modules.transforms.vision_utils.pad_dim_to_size import pad_dim_to_size from torchtune.modules.transforms.vision_utils.resize_with_pad import resize_with_pad from torchtune.modules.transforms.vision_utils.tile_crop import tile_crop @@ -63,7 +62,6 @@ class CLIPImageTransform: This will be used to generate possible_resolutions, e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224. Default 4. 
- pad_max_tiles (bool): If True, the image will be padded to have tiles == max_num_tiles. Default False. dtype (torch.dtype): Data type of the output image. Default torch.bfloat16. resample (str): Resampling method used when resizing images. Supports any enum of ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic". @@ -101,7 +99,6 @@ def __init__( possible_resolutions: Optional[List[Tuple[int, int]]] = None, tile_size: int = 224, max_num_tiles: Optional[int] = 4, - pad_max_tiles: bool = False, dtype: torch.dtype = torch.bfloat16, resample: str = "bilinear", resize_to_max_canvas: bool = False, @@ -142,7 +139,6 @@ def __init__( # tile_crop self.tile_size = tile_size self.tile_crop = tile_crop - self.pad_tile_size = max_num_tiles if pad_max_tiles else None def __call__( self, sample: Mapping[str, Any], inference: bool = False @@ -190,8 +186,6 @@ def __call__( # Divide the image into equally sized tiles image = self.tile_crop(image=image, tile_size=self.tile_size) - if self.pad_tile_size: - image = pad_dim_to_size(image, size=self.pad_tile_size, dim=0) aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size diff --git a/torchtune/models/llama3_2_vision/_transform.py b/torchtune/models/llama3_2_vision/_transform.py index 4272f5a1e2..4dc4f781e9 100644 --- a/torchtune/models/llama3_2_vision/_transform.py +++ b/torchtune/models/llama3_2_vision/_transform.py @@ -86,7 +86,6 @@ def __init__( tile_size=tile_size, possible_resolutions=None, max_num_tiles=max_num_tiles, - pad_max_tiles=True, resample="bilinear", resize_to_max_canvas=False, ) diff --git a/torchtune/modules/__init__.py b/torchtune/modules/__init__.py index 23d53d9fa1..32af70f8e5 100644 --- a/torchtune/modules/__init__.py +++ b/torchtune/modules/__init__.py @@ -34,7 +34,6 @@ "TanhGate", "FeedForward", "FrozenNF4Linear", - "get_cosine_schedule_with_warmup", "KVCache", "RotaryPositionalEmbeddings", "RMSNorm", @@ -51,4 +50,5 @@ "local_kv_cache", "delete_kv_caches", "disable_kv_cache", + "get_cosine_schedule_with_warmup", ] diff --git a/torchtune/modules/attention.py b/torchtune/modules/attention.py index e3f7451ed1..879f0679cf 100644 --- a/torchtune/modules/attention.py +++ b/torchtune/modules/attention.py @@ -304,7 +304,7 @@ def forward( k, v, mask=mask, - dropout_p=self.attn_dropout, + dropout_p=self.attn_dropout if self.training else 0.0, is_causal=self.kv_cache is None and mask is None and self.is_causal, ) diff --git a/torchtune/modules/low_precision/_utils.py b/torchtune/modules/low_precision/_utils.py deleted file mode 100644 index 30f02911e6..0000000000 --- a/torchtune/modules/low_precision/_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from datetime import datetime -from importlib.metadata import PackageNotFoundError, version -from typing import Optional, Tuple - -import torch - -import torchao - - -def _is_fbcode(): - return not hasattr(torch.version, "git_version") - - -def _nightly_version_ge(ao_version_str: str, date: str) -> bool: - """ - Compare a torchao nightly version to a date of the form - %Y-%m-%d. 
- - Returns True if the nightly version is greater than or equal to - the date, False otherwise - """ - ao_datetime = datetime.strptime(ao_version_str.split("+")[0], "%Y.%m.%d") - return ao_datetime >= datetime.strptime(date, "%Y-%m-%d") - - -def _get_torchao_version() -> Tuple[Optional[str], Optional[bool]]: - """ - Get torchao version. Returns a tuple of two elements, the first element - is the version string, the second element is whether it's a nightly version. - For fbcode usage, return None, None. - - Checks: - 1) is_fbcode, then - 3) torchao.__version__ (only defined for torchao >= 0.3.0), then - 4) importlib's version(torchao) - - - If none of these work, raise an error. - - """ - if _is_fbcode(): - return None, None - try: - ao_version = torchao.__version__ - except AttributeError: - try: - ao_version = version("torchao") - except Exception as e: - raise PackageNotFoundError("Could not find torchao version") from e - is_nightly = "dev" in ao_version - return ao_version, is_nightly diff --git a/torchtune/modules/lr_schedulers.py b/torchtune/modules/lr_schedulers.py index ee5f2cd91d..30d73e5dd5 100644 --- a/torchtune/modules/lr_schedulers.py +++ b/torchtune/modules/lr_schedulers.py @@ -8,8 +8,13 @@ import torch from torch.optim.lr_scheduler import LambdaLR +from torchtune.utils._logging import deprecated +@deprecated( + msg="Please use get_cosine_schedule_with_warmup from torchtune.training.lr_schedulers instead. \ + " +) def get_cosine_schedule_with_warmup( optimizer: torch.optim.Optimizer, num_warmup_steps: int, diff --git a/torchtune/modules/model_fusion/_fusion.py b/torchtune/modules/model_fusion/_fusion.py index 46507deb17..cf488d3a02 100644 --- a/torchtune/modules/model_fusion/_fusion.py +++ b/torchtune/modules/model_fusion/_fusion.py @@ -391,18 +391,26 @@ def setup_caches( ) def caches_are_setup(self) -> bool: - """Check if the key value caches are setup.""" + """ + Check if the key value caches are setup. This means `setup_caches` has been called, and + the relevant attention modules in the model have created `KVCache`s. + """ return self.decoder.caches_are_setup() def caches_are_enabled(self) -> bool: """ - Checks if the key value caches are enabled. KV-caches must also have been setup - for them to be enabled. + Checks if the key value caches are enabled. Once KV-caches have been setup, the relevant + attention modules will be "enabled" and all forward passes will update the caches. This behaviour + can be disabled without altering the state of the KV-caches by "disabling" the KV-caches + using ``torchtune.modules.disable_kv_cache``, upon which ``caches_are_enabled`` would return False. """ return self.decoder.caches_are_enabled() def reset_caches(self): - """Reset the key value caches.""" + """ + Resets KV-cache buffers on relevant attention modules to zero, and reset cache positions to zero, + without deleting or reallocating cache tensors. + """ self.decoder.reset_caches() def forward( diff --git a/torchtune/modules/transformer.py b/torchtune/modules/transformer.py index eb6cd6e570..3b2d356c29 100644 --- a/torchtune/modules/transformer.py +++ b/torchtune/modules/transformer.py @@ -187,15 +187,26 @@ def setup_caches( self.attn.setup_cache(batch_size, dtype, encoder_max_seq_len) def caches_are_setup(self) -> bool: - """Check if the key value caches are setup.""" + """ + Check if the key value caches are setup. This means `setup_caches` has been called, and + the relevant attention modules in the model have created `KVCache`s. 
+ """ return self.attn.kv_cache is not None def caches_are_enabled(self) -> bool: - """Check if key value caches are enabled.""" + """ + Checks if the key value caches are enabled. Once KV-caches have been setup, the relevant + attention modules will be "enabled" and all forward passes will update the caches. This behaviour + can be disabled without altering the state of the KV-caches by "disabling" the KV-caches + using ``torchtune.modules.disable_kv_cache``, upon which ``caches_are_enabled`` would return False. + """ return self.attn.cache_enabled def reset_cache(self): - """Reset the key value caches.""" + """ + Resets KV-cache buffers on relevant attention modules to zero, and reset cache positions to zero, + without deleting or reallocating cache tensors. + """ self.attn.reset_cache() def _skip_mask(self, mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]: diff --git a/torchtune/training/__init__.py b/torchtune/training/__init__.py index 9e33f40067..bd111d05df 100644 --- a/torchtune/training/__init__.py +++ b/torchtune/training/__init__.py @@ -51,6 +51,7 @@ TOTAL_EPOCHS_KEY, update_state_dict_for_classifier, ) +from torchtune.training.lr_schedulers import get_cosine_schedule_with_warmup from torchtune.training.memory import ( cleanup_before_training, create_optim_in_bwd_wrapper, @@ -91,6 +92,7 @@ "STEPS_KEY", "TOTAL_EPOCHS_KEY", "get_quantizer_mode", + "get_cosine_schedule_with_warmup", "cleanup_before_training", "create_optim_in_bwd_wrapper", "get_memory_stats", diff --git a/torchtune/training/_profiler.py b/torchtune/training/_profiler.py index f8004a356e..d296006b5d 100644 --- a/torchtune/training/_profiler.py +++ b/torchtune/training/_profiler.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. +import datetime import os import time from functools import partial @@ -97,8 +98,13 @@ def trace_handler( # Use tensorboard trace handler rather than directly exporting chrome traces since # tensorboard doesn't seem to be able to parse traces with prof.export_chrome_trace + + now = datetime.datetime.now() + exporter = tensorboard_trace_handler( - curr_trace_dir, worker_name=f"rank{rank}", use_gzip=True + curr_trace_dir, + worker_name=f"r0-{now.year}-{now.month}-{now.day}-{now.hour}-{now.minute}", + use_gzip=True, ) exporter(prof) diff --git a/torchtune/training/lr_schedulers.py b/torchtune/training/lr_schedulers.py new file mode 100644 index 0000000000..ee5f2cd91d --- /dev/null +++ b/torchtune/training/lr_schedulers.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +from torch.optim.lr_scheduler import LambdaLR + + +def get_cosine_schedule_with_warmup( + optimizer: torch.optim.Optimizer, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, +) -> LambdaLR: + """ + Create a learning rate schedule that linearly increases the learning rate from + 0.0 to lr over ``num_warmup_steps``, then decreases to 0.0 on a cosine schedule over + the remaining ``num_training_steps-num_warmup_steps`` (assuming ``num_cycles`` = 0.5). + + This is based on the Hugging Face implementation + https://github.com/huggingface/transformers/blob/v4.23.1/src/transformers/optimization.py#L104. + + Args: + optimizer (torch.optim.Optimizer): The optimizer for which to + schedule the learning rate. 
+ num_warmup_steps (int): The number of steps for the warmup phase. + num_training_steps (int): The total number of training steps. + num_cycles (float): The number of waves in the cosine schedule. Defaults to 0.5 + (decrease from the max value to 0 following a half-cosine). + last_epoch (int): The index of the last epoch when resuming training. Defaults to -1 + + Returns: + torch.optim.lr_scheduler.LambdaLR with the appropriate schedule. + """ + + def lr_lambda(current_step: int) -> float: + # linear warmup phase + if current_step < num_warmup_steps: + return current_step / max(1, num_warmup_steps) + + # cosine + progress = (current_step - num_warmup_steps) / max( + 1, num_training_steps - num_warmup_steps + ) + + cosine_lr_multiple = 0.5 * ( + 1.0 + math.cos(math.pi * num_cycles * 2.0 * progress) + ) + return max(0.0, cosine_lr_multiple) + + return LambdaLR(optimizer, lr_lambda, last_epoch) diff --git a/torchtune/training/quantization.py b/torchtune/training/quantization.py index debe49ab15..465e987981 100644 --- a/torchtune/training/quantization.py +++ b/torchtune/training/quantization.py @@ -6,7 +6,13 @@ from typing import Callable, Optional -from torchao.dtypes import TensorCoreTiledLayoutType +from torchtune.utils._import_guard import _USE_NEW_TENSOR_CORE_TILED_LAYOUT_API + +if _USE_NEW_TENSOR_CORE_TILED_LAYOUT_API: + from torchao.dtypes import TensorCoreTiledLayout +else: + from torchao.dtypes import TensorCoreTiledLayoutType as TensorCoreTiledLayout + from torchao.quantization import ( int4_weight_only, int8_dynamic_activation_int4_weight, @@ -88,7 +94,7 @@ def __init__(self, groupsize: int = 128, inner_k_tiles: int = 8): self.inner_k_tiles = inner_k_tiles def quantize(self, model): - layout_type = TensorCoreTiledLayoutType(self.inner_k_tiles) + layout_type = TensorCoreTiledLayout(self.inner_k_tiles) quantize_fn = int4_weight_only(self.groupsize, layout_type) quantize_(model, quantize_fn) return model diff --git a/torchtune/utils/_import_guard.py b/torchtune/utils/_import_guard.py index c0779271fb..93e7941fbc 100644 --- a/torchtune/utils/_import_guard.py +++ b/torchtune/utils/_import_guard.py @@ -5,7 +5,8 @@ # LICENSE file in the root directory of this source tree. import torch -from torchtune.utils._version import torch_version_ge +import torchao +from torchtune.utils._version import _is_fbcode, _nightly_version_ge, torch_version_ge # We can only use flex attention / BlockMask if torch version >= 2.5.0 and GPU is Turing / SM75 and above _SUPPORTS_FLEX_ATTENTION = ( @@ -13,3 +14,16 @@ and torch.cuda.is_available() and torch.cuda.get_device_capability() >= (7, 5) ) + +torchao_version = torchao.__version__ + +_USE_NEW_TENSOR_CORE_TILED_LAYOUT_API = _is_fbcode() or ( + not _is_fbcode() + and ( + ("dev" not in torchao_version and torchao_version >= "0.6.0") + or ( + "dev" in torchao_version + and _nightly_version_ge(torchao_version, "2024-10-10") + ) + ) +) diff --git a/torchtune/utils/_version.py b/torchtune/utils/_version.py index 830a8ba079..9dcbd8e450 100644 --- a/torchtune/utils/_version.py +++ b/torchtune/utils/_version.py @@ -3,6 +3,9 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+ +from datetime import datetime + import torch @@ -23,3 +26,21 @@ def torch_version_ge(version: str) -> bool: True """ return version in torch.__version__ or torch.__version__ >= version + + +def _is_fbcode(): + return not hasattr(torch.version, "git_version") + + +def _nightly_version_ge(ao_version_str: str, date: str) -> bool: + """ + Compare a torchao nightly version to a date of the form + %Y-%m-%d. + + Returns True if the nightly version is greater than or equal to + the date, False otherwise + """ + ao_datetime = datetime.strptime( + ao_version_str.split("+")[0].split("dev")[1], "%Y%m%d" + ) + return ao_datetime >= datetime.strptime(date, "%Y-%m-%d")
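
Note on the relocated LR scheduler: with this change ``get_cosine_schedule_with_warmup`` is importable from ``torchtune.training`` (the old ``torchtune.modules`` path still resolves, but now emits a deprecation warning via the ``@deprecated`` decorator added above). A minimal usage sketch follows; the model, learning rate, and step counts are hypothetical and only illustrate the call signature.

```python
import torch
from torchtune.training import get_cosine_schedule_with_warmup

# Hypothetical model and optimizer, just to exercise the schedule.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Linear warmup over the first 100 optimizer steps, then cosine decay
# towards 0 over the remaining 900 steps (num_cycles defaults to 0.5).
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000
)

for _ in range(1000):
    loss = model(torch.randn(4, 16)).sum()
    loss.backward()
    optimizer.step()
    scheduler.step()  # one scheduler step per optimizer update
    optimizer.zero_grad()
```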
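Note on the torchao version gate: ``_nightly_version_ge`` assumes nightly version strings embed a build date after a ``dev`` marker, and ``_USE_NEW_TENSOR_CORE_TILED_LAYOUT_API`` in ``_import_guard.py`` then chooses between the new ``TensorCoreTiledLayout`` and the older ``TensorCoreTiledLayoutType`` import. The sketch below mirrors that parsing with made-up version strings (illustrative only, not pinned torchao releases).

```python
from datetime import datetime


def nightly_version_ge(ao_version_str: str, date: str) -> bool:
    # e.g. "0.7.0.dev20241012+cu121" -> drop the local "+cu121" tag,
    # keep the digits after "dev", and parse them as YYYYMMDD.
    ao_datetime = datetime.strptime(
        ao_version_str.split("+")[0].split("dev")[1], "%Y%m%d"
    )
    return ao_datetime >= datetime.strptime(date, "%Y-%m-%d")


print(nightly_version_ge("0.7.0.dev20241012+cu121", "2024-10-10"))  # True
print(nightly_version_ge("0.6.0.dev20240901", "2024-10-10"))        # False

# Stable releases never reach this helper: for them the guard falls back to a
# plain string comparison ("dev" not in version and version >= "0.6.0"), which
# picks TensorCoreTiledLayout for torchao >= 0.6.0 and TensorCoreTiledLayoutType otherwise.
```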