Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
f9af280
trigger test
bhimrazy Oct 30, 2025
62b015d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 30, 2025
c3fabe1
update versions and vars and sync with azure one to lightning gpu tests
bhimrazy Oct 30, 2025
b0b3db2
test with new read token
bhimrazy Oct 30, 2025
45a2d7b
update HF_TOKEN in cpu-tests.yml for testing
bhimrazy Oct 30, 2025
bd5e565
revert token
bhimrazy Oct 30, 2025
b442793
pin json arg parse
bhimrazy Oct 30, 2025
3ee6c89
add missing conftest
bhimrazy Oct 30, 2025
b3ee2a9
mark flaky for tests causing 429
bhimrazy Oct 30, 2025
a780cf2
cleanup
bhimrazy Oct 30, 2025
0e47bbf
revert gpu test configs
bhimrazy Oct 30, 2025
0073448
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 30, 2025
ffd73cf
pin to torch version 2.8.0 for lightning gpu test to fix nvfuser miss…
bhimrazy Oct 30, 2025
72cc9e0
increase rerun delay for flaky tokenizer test to improve stability
bhimrazy Oct 30, 2025
786a198
limit py versions temporarily
bhimrazy Oct 30, 2025
848adfa
temporarily limit Python versions in CPU tests to 3.11 and 3.12
bhimrazy Oct 30, 2025
4ef4833
revert
bhimrazy Oct 30, 2025
07f2134
Increase reruns for flaky tokenizer test and ensure clean model-speci…
bhimrazy Oct 30, 2025
42675c0
Uninstall transformers alongside lightning-thunder and install test r…
bhimrazy Oct 30, 2025
14e0a4d
Clarify comment on installing thunder's test requirements to specify …
bhimrazy Oct 30, 2025
4adb326
Refactor CPU tests workflow to use 'uv' for dependency management and…
bhimrazy Oct 30, 2025
7519abf
Pin transformers version to 4.52.4 to match thunder's test requirements
bhimrazy Oct 30, 2025
506aff5
run updated ci
bhimrazy Oct 30, 2025
a426abc
Fix branch name in CPU tests workflow for push events
bhimrazy Oct 30, 2025
ad7a741
revert trigger
bhimrazy Oct 30, 2025
0422020
try to trigger the uv based test
bhimrazy Nov 2, 2025
990cc67
revert back to original ci
bhimrazy Nov 2, 2025
fa55e59
Add TORCH_URL environment variable for cpu tests and save some space
bhimrazy Nov 2, 2025
2d312e4
hide some checks temporarily to run this action
bhimrazy Nov 2, 2025
e303834
update the trigger condition
bhimrazy Nov 2, 2025
931da0e
revert conditional changes
bhimrazy Nov 2, 2025
8608bfb
Empty commit
bhimrazy Nov 2, 2025
5454955
Add flaky marker to test_simple for improved reliability
bhimrazy Nov 2, 2025
bb742ca
test with enhanced process kill tree
bhimrazy Nov 3, 2025
10ff069
Revert "last change"
bhimrazy Nov 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/cpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ env:
TRANSFORMERS_CACHE: .cache-HF/transformers
DATASETS_CACHE: .cache-HF/datasets
HF_DATASETS_CACHE: .cache-HF/datasets
TORCH_URL: "https://download.pytorch.org/whl/cpu/"

jobs:
testing-imports:
Expand All @@ -51,7 +52,7 @@ jobs:

- name: Install minimal dependencies
run: |
pip install . -U
pip install . -U --extra-index-url="${TORCH_URL}"
pip list

- name: Testing package imports
Expand Down Expand Up @@ -119,7 +120,7 @@ jobs:
python -m lightning_utilities.cli requirements set-oldest --req_files=pyproject.toml
- name: Install dependencies
run: |
pip install '.[extra,compiler,test]' -U --upgrade-strategy eager
pip install '.[extra,compiler,test]' -U --upgrade-strategy eager --extra-index-url="${TORCH_URL}"
pip list

- name: Run tests
Expand Down
13 changes: 8 additions & 5 deletions .lightning/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ trigger:
pull_request:
branches: ["main"]

image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.7.1-dev"
image: "pytorchlightning/lightning-thunder:ubuntu24.04-cuda12.6.3-cudnn-fe1.10.0-py3.10-pt_2.8.0-dev"
machine: "L4_X_2"
interruptible: "true"
timeout: "45" # minutes
Expand All @@ -19,7 +19,7 @@ env:
NCCL_DEBUG: "INFO"
CUBLAS_WORKSPACE_CONFIG: ":4096:8"
NCCL_IGNORE_DISABLED_P2P: "1"
TORCH_VERSION: "2.7.1"
TORCH_VERSION: "2.8.0"
RUN_ONLY_CUDA_TESTS: "1" # run CUDA tests only

run: |
Expand All @@ -30,7 +30,7 @@ run: |
pip list
set -ex

pip install -q '.[extra,test]' "torch==${TORCH_VERSION}" cffi -U
pip install -q '.[extra,test]' "torch==${TORCH_VERSION}" cffi -U --upgrade-strategy eager

if [ "${dependency}" == "compiler" ]; then
pip uninstall -y torchvision torchaudio
Expand All @@ -41,17 +41,20 @@ run: |

pip list
python -c "import torch ; gpus = torch.cuda.device_count() ; assert gpus >= 2, f'GPU: {gpus}'"
python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '$TORCH_VERSION', f'PyTorch: installed {ver} but expected $TORCH_VERSION'"
python -c "from torch import __version__ as ver ; assert str(ver).split('+')[0] == '${TORCH_VERSION}', f'PyTorch: installed {ver} but expected ${TORCH_VERSION}'"

pytest -v --durations=100

wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
PL_RUN_STANDALONE_TESTS=1 bash run_standalone_tests.sh "tests"

if [ "${dependency}" == "compiler" ]; then
pip uninstall -y lightning-thunder
pip uninstall -y lightning-thunder transformers
# install thunder from source so that thunder.tests will be available
pip install -U "lightning-thunder[test] @ git+https://github.com/Lightning-AI/lightning-thunder.git" "torch==${TORCH_VERSION}"
# Pin transformers to match thunder's test_networks.py requirements
# See: https://github.com/Lightning-AI/lightning-thunder/blob/main/requirements/test.txt
pip install transformers==4.52.4 # todo: find more robust way
# without env var, it filters out all tests
RUN_ONLY_CUDA_TESTS=0 pytest tests/ext_thunder/test_thunder_networks.py -v
fi
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ dependencies = [
# download models:
"huggingface-hub>=0.30,<0.35",
"jsonargparse[signatures]>=4.31,<=4.32.1; python_version<'3.10'", # 4.33 does not seem to be compatible with Python 3.9
"jsonargparse[signatures]>=4.37; python_version>'3.9'", # required to work with python3.12+
"jsonargparse[signatures]>=4.37,<=4.41; python_version>'3.9'", # required to work with python3.12+
"lightning>=2.5",
"psutil==7",
"safetensors>=0.4.3",
Expand Down
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,14 @@ def destroy_process_group():
torch.distributed.destroy_process_group()


# Pytest fixture for tests that need reproducible random numbers: it sets the
# NVIDIA_TF32_OVERRIDE environment variable to "0" (per the fixture name, to
# turn off TF32) and seeds torch's RNG with 42 before the test body runs.
@pytest.fixture
def turn_off_tf32_and_set_seed(monkeypatch):
monkeypatch.setenv("NVIDIA_TF32_OVERRIDE", "0")  # set through monkeypatch rather than os.environ directly
torch.manual_seed(42)
yield  # the requesting test runs here
# After the test, re-seed torch so the fixed seed 42 is not left in place.
torch.seed()


class MockTokenizer:
"""A dummy tokenizer that encodes each character as its ASCII code."""

Expand Down
1 change: 1 addition & 0 deletions tests/test_serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def _wait_and_check_response(waiting: int = 30):


# todo: try to resolve this issue
@pytest.mark.flaky(reruns=2, reruns_delay=30)
@pytest.mark.xfail(condition=platform.system() == "Darwin", reason="it passes locally but having some issues on CI")
def test_simple(tmp_path):
seed_everything(123)
Expand Down
14 changes: 9 additions & 5 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@


# @pytest.mark.flaky(reruns=3, rerun_except=["AssertionError", "assert", "TypeError"])
@pytest.mark.flaky(reruns=3, reruns_delay=120)
@pytest.mark.parametrize("config", config_module.configs, ids=[c["hf_config"]["name"] for c in config_module.configs])
def test_tokenizer_against_hf(config, tmp_path):
config = config_module.Config(**config)
Expand All @@ -34,14 +35,17 @@ def test_tokenizer_against_hf(config, tmp_path):
if "tokenizer.json" not in hf_files and "tokenizer.model" not in hf_files:
raise ConnectionError("Unable to download any tokenizer files from HF")

# we need to rename the dir to match the model name in testing as well
# since we use it to determine the model in tokenizer.py
tmp_path = tmp_path.rename(tmp_path.parent / config.hf_config["name"])
# Create a clean, model-specific subdirectory for this test run.
# This avoids errors if previous runs or retries left files behind, ensuring the directory is always ready for fresh downloads and comparisons.
model_dir = tmp_path / config.hf_config["name"]
if model_dir.exists():
shutil.rmtree(model_dir)
os.makedirs(model_dir, exist_ok=True)

for filename, hf_file in hf_files.items():
shutil.copy(hf_file, str(tmp_path / filename))
shutil.copy(hf_file, model_dir / filename)

ours = Tokenizer(tmp_path)
ours = Tokenizer(model_dir)

assert ours.vocab_size == theirs.vocab_size
if config.name == "Mixtral-8x22B-v0.1":
Expand Down
Loading