Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 87 additions & 1 deletion .github/scripts/filter-matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,41 @@ def filter_matrix_item(
return True


def create_distributed_config(
    item: Dict[str, Any],
    *,
    runner: str = "linux.g4dn.12xlarge.nvidia.gpu",
    num_gpus: int = 2,
) -> Dict[str, Any]:
    """Derive a distributed-test matrix entry from a regular matrix entry.

    The input entry is left untouched. The returned entry carries every key
    of ``item`` with three keys overridden or added:

    - ``validation_runner`` is set to ``runner``
    - ``num_gpus`` is set to ``num_gpus``
    - ``config`` is set to the marker string ``"distributed"``

    Args:
        item: A matrix entry; ``python_version``, ``desired_cuda`` and
            ``validation_runner`` are read (if present) for debug output only.
        runner: Runner label written to ``validation_runner``.
        num_gpus: Value written to the ``num_gpus`` field.

    Returns:
        A new dict. The copy is shallow: nested values, if any, are shared
        with ``item``.
    """
    # Kept function-local, as in the original, so this helper does not depend
    # on a module-level import being present.
    import sys

    # Diagnostics go to stderr; stdout is reserved for the JSON matrix.
    print("[DEBUG] Creating distributed config from:", file=sys.stderr)
    print(f"[DEBUG] Python: {item.get('python_version')}", file=sys.stderr)
    print(f"[DEBUG] CUDA: {item.get('desired_cuda')}", file=sys.stderr)
    print(
        f"[DEBUG] Original runner: {item.get('validation_runner')}",
        file=sys.stderr,
    )

    # Build the new entry in one step so the caller's dict is never mutated.
    dist_item: Dict[str, Any] = {
        **item,
        "validation_runner": runner,
        "num_gpus": num_gpus,
        "config": "distributed",
    }

    print(f"[DEBUG] New runner: {dist_item['validation_runner']}", file=sys.stderr)
    print(f"[DEBUG] GPUs: {dist_item['num_gpus']}", file=sys.stderr)

    return dist_item


def main(args: list[str]) -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
Expand Down Expand Up @@ -99,16 +134,67 @@ def main(args: list[str]) -> None:

includes = matrix_dict["include"]
filtered_includes = []
distributed_includes = [] # NEW: separate list for distributed configs

print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)

for item in includes:
py_ver = item.get("python_version", "unknown")
cuda_ver = item.get("desired_cuda", "unknown")

print(f"[DEBUG] Checking config: py={py_ver}, cuda={cuda_ver}", file=sys.stderr)

if filter_matrix_item(
item,
options.jetpack == "true",
options.limit_pr_builds == "true",
):
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
filtered_includes.append(item)

filtered_matrix_dict = {"include": filtered_includes}
if item["python_version"] == "3.10" and item["desired_cuda"] == "cu130":
print(
f"[DEBUG] Creating distributed config for py3.10+cu130",
file=sys.stderr,
)
distributed_includes.append(create_distributed_config(item))
else:
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)

# Debug: Show summary
print(f"[DEBUG] Final counts:", file=sys.stderr)
print(f"[DEBUG] Regular configs: {len(filtered_includes)}", file=sys.stderr)
print(
f"[DEBUG] Distributed configs: {len(distributed_includes)}", file=sys.stderr
)

# Debug: Show which configs will be built
print(
f"[DEBUG] Configs that will be BUILT (in filtered_includes):", file=sys.stderr
)
for item in filtered_includes:
print(
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}",
file=sys.stderr,
)

print(
f"[DEBUG] Configs for DISTRIBUTED TESTS (in distributed_includes):",
file=sys.stderr,
)
for item in distributed_includes:
print(
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}, gpus={item.get('num_gpus')}",
file=sys.stderr,
)

# NEW: Output both regular and distributed configs
filtered_matrix_dict = {
"include": filtered_includes,
"distributed_include": distributed_includes, # NEW field
}

# Output to stdout (consumed by GitHub Actions)
print(json.dumps(filtered_matrix_dict))


Expand Down
149 changes: 129 additions & 20 deletions .github/workflows/build-test-linux-x86_64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
env-var-script: ${{ matrix.env-var-script }}
post-script: ${{ matrix.post-script }}
Expand Down Expand Up @@ -97,7 +101,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -125,7 +133,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -156,7 +168,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -184,7 +200,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -216,7 +236,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -244,7 +268,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -272,7 +300,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -302,7 +334,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -334,7 +370,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -363,7 +403,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -392,7 +436,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -420,7 +468,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -448,7 +500,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand All @@ -463,7 +519,7 @@ jobs:

L2-dynamo-distributed-tests:
name: L2 dynamo distributed tests
needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests, L1-torchscript-tests]
needs: [filter-matrix, build]
strategy:
fail-fast: false
matrix:
Expand All @@ -480,18 +536,71 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the distributed_include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail

# Debug: Show what config we're using
echo "=========================================="
echo "DISTRIBUTED TEST CONFIGURATION"
echo "=========================================="
echo "Python version: ${PYTHON_VERSION}"
echo "CUDA version: ${CU_VERSION}"
echo "Num GPUs: ${NUM_GPUS}"
echo "=========================================="

# Verify GPUs are available
echo "Checking GPU availability:"
nvidia-smi
echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
echo "=========================================="

export USE_HOST_DEPS=1
export CI_BUILD=1
export USE_TRTLLM_PLUGINS=1
dnf install -y mpich mpich-devel openmpi openmpi-devel

# Install MPI (required for TensorRT-LLM plugins)
echo "Installing MPI..."
dnf install -y openmpi openmpi-devel

# Add OpenMPI to PATH (RHEL/AlmaLinux specific location)
export PATH="/usr/lib64/openmpi/bin:${PATH}"
# This script runs under `set -u`, so a bare $LD_LIBRARY_PATH aborts the step
# when the variable is unset. ${VAR:+:${VAR}} appends the previous value only
# when it is non-empty, which also avoids a trailing ':' (an empty entry that
# the dynamic loader treats as the current directory).
export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Verify mpirun is accessible
which mpirun
mpirun --version

# Run distributed tests
pushd .
cd tests/py
cd dynamo
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
cd tests/py/dynamo

echo "Running distributed tests with mpirun..."
echo "[CONFIG] Number of GPUs to use: ${NUM_GPUS}"
echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"

# Set master port for distributed communication (must be same across all ranks)
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=29500

# Use a wrapper script to ensure only rank 0 writes the JUnit XML
# Each rank runs pytest, but only rank 0 saves results to avoid file conflicts
RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
mpirun --allow-run-as-root -n ${NUM_GPUS} \
bash -c '
echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
else
python -m pytest -ra distributed/test_nccl_ops.py
fi
'

popd

concurrency:
Expand Down
11 changes: 11 additions & 0 deletions .github/workflows/build_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,17 @@ jobs:
options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
timeout-minutes: ${{ inputs.timeout }}
steps:
- name: Debug matrix configuration
shell: bash
run: |
echo "=========================================="
echo "BUILD MATRIX DEBUG"
echo "=========================================="
echo "Python version: ${{ matrix.python_version }}"
echo "CUDA version: ${{ matrix.desired_cuda }}"
echo "GPU arch type: ${{ matrix.gpu_arch_type }}"
echo "Runner: ${{ matrix.validation_runner }}"
echo "=========================================="
- name: Clean workspace
shell: bash -l {0}
run: |
Expand Down
Loading
Loading