Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/_tutorials/ds4sci_evoformerattention.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the
The extension checks both requirements and fails if any is not met. To disable the check, for example for cross-compiling in a system without GPUs, you can set the environment variable ```DS_IGNORE_CUDA_DETECTION=TRUE```
and the environment value ```DS_EVOFORMER_GPU_ARCH={70|75|80}```, which controls the target GPU (80 being the last supported and meaning NVIDIA Ampere and later).

`TORCH_CUDA_ARCH_LIST` controls the emitted CUDA fatbin targets (`-gencode`), while `DS_EVOFORMER_GPU_ARCH` controls the Evoformer kernel family (`-DGPU_ARCH`). For Evoformer builds, compute capabilities below the selected family floor are filtered out (for example, with `DS_EVOFORMER_GPU_ARCH=80`, `7.x` entries in `TORCH_CUDA_ARCH_LIST` are pruned). Make sure `TORCH_CUDA_ARCH_LIST` still includes your runtime architecture (or a compatible `+PTX` target); otherwise the kernels can fail at runtime with an `invalid device function` error.

### 3.2 Unit test and benchmark

The unit test and benchmark are available in the `tests` folder in DeepSpeed repo. You can use the following command to run the unit test and benchmark.
Expand Down
143 changes: 131 additions & 12 deletions op_builder/evoformer_attn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@

# DeepSpeed Team

from .builder import CUDAOpBuilder, installed_cuda_version
import os
import re
from pathlib import Path
from typing import Optional, Tuple

from .builder import CUDAOpBuilder, installed_cuda_version


class EvoformerAttnBuilder(CUDAOpBuilder):
Expand All @@ -22,6 +25,7 @@ def __init__(self, name=None):
# No specializations of the kernel beyond Ampere are implemented
# See gemm_kernel_utils.h (also in cutlass example for fused attention) and cutlass/arch/arch.h
self.gpu_arch = os.environ.get("DS_EVOFORMER_GPU_ARCH")
self._resolved_gpu_arch = None

def absolute_name(self):
    """Fully qualified module path of the compiled op (deepspeed.ops.<NAME>_op)."""
    return "deepspeed.ops." + self.NAME + "_op"
Expand All @@ -36,21 +40,136 @@ def sources(self):
src_dir = "csrc/deepspeed4science/evoformer_attn"
return [f"{src_dir}/attention.cpp", f"{src_dir}/attention_back.cu", f"{src_dir}/attention_cu.cu"]

@staticmethod
def _parse_gpu_arch(raw_arch: str) -> Optional[int]:
token = raw_arch.strip().lower()
if not token:
return None

token = re.sub(r"^sm_?", "", token)
if "." in token:
major, minor = token.split(".", maxsplit=1)
if not (major.isdigit() and minor.isdigit()):
return None
return int(major) * 10 + int(minor)

if not token.isdigit():
return None

# Accept single digit forms like "8" and normalize to "80".
if len(token) == 1:
return int(token) * 10
return int(token)

@staticmethod
def _parse_cc_token(token: str) -> Tuple[Optional[list], Optional[int]]:
value = token.strip()
if not value:
return None, None

major, dot, minor = value.partition(".")
if dot != "." or not major.isdigit():
return None, None

minor_value = minor.split("+", maxsplit=1)[0]
if not minor_value.isdigit():
return None, None

return [major, minor], int(major) * 10 + int(minor_value)

@staticmethod
def _effective_floor_cc(raw_cc: Optional[int]) -> Optional[int]:
if raw_cc is None:
return None
if raw_cc >= 80:
return 80
if raw_cc >= 75:
return 75
if raw_cc >= 70:
return 70
return None

def _detect_local_gpu_cc(self) -> Optional[int]:
try:
import torch
except ImportError:
self.warning("Please install torch if trying to pre-compile kernels")
return None

if not torch.cuda.is_available(): #ignore-cuda
return None

props = torch.cuda.get_device_properties(0) #ignore-cuda
return int(props.major) * 10 + int(props.minor)

def _resolve_gpu_arch(self) -> Tuple[Optional[int], Optional[int]]:
    """Resolve (raw_cc, family_floor) for the Evoformer build and cache it.

    Resolution order: the DS_EVOFORMER_GPU_ARCH env value (when parseable),
    then the local CUDA device capability. A capability below 7.0 is
    rejected with a warning and yields (None, None).
    """
    if self._resolved_gpu_arch is not None:
        return self._resolved_gpu_arch

    raw_cc = None
    if self.gpu_arch:
        raw_cc = self._parse_gpu_arch(self.gpu_arch)
        if raw_cc is None:
            self.warning(
                f"Invalid DS_EVOFORMER_GPU_ARCH='{self.gpu_arch}'. Falling back to local CUDA device capability.")

    if raw_cc is None:
        raw_cc = self._detect_local_gpu_cc()

    family_floor = self._effective_floor_cc(raw_cc)
    if family_floor is None and raw_cc is not None:
        self.warning(f"DS4Sci_EvoformerAttention requires compute capability >= 7.0, got '{raw_cc}'.")
        raw_cc = None

    self._resolved_gpu_arch = (raw_cc, family_floor)
    return self._resolved_gpu_arch

def nvcc_args(self):
    """Extend the base nvcc flags with the Evoformer kernel family macro.

    Appends exactly one ``-DGPU_ARCH=<floor>`` flag, where ``<floor>`` is the
    normalized kernel family (70, 75, or 80) from ``_resolve_gpu_arch()``.
    The old torch-probing branch that appended the raw, un-normalized
    ``DS_EVOFORMER_GPU_ARCH`` value is removed: it duplicated the resolution
    logic and could emit a second, conflicting ``-DGPU_ARCH`` define.

    Returns:
        list: the nvcc argument list from the base builder plus the macro.

    Raises:
        RuntimeError: when no supported GPU architecture can be resolved
            from the environment or the local device.
    """
    args = super().nvcc_args()
    resolved_arch, floor = self._resolve_gpu_arch()
    if floor is None:
        raise RuntimeError(
            "Unable to resolve DS_EVOFORMER_GPU_ARCH for DS4Sci_EvoformerAttention. "
            "Set DS_EVOFORMER_GPU_ARCH to a supported value such as 70, 75, 80, 7.0, 7.5, 8.0, or sm80.")
    if resolved_arch != floor:
        self.warning(
            f"Normalizing DS_EVOFORMER_GPU_ARCH={resolved_arch} to Evoformer kernel family GPU_ARCH={floor}.")
    args.append(f"-DGPU_ARCH={floor}")
    return args

def filter_ccs(self, ccs):
    """Drop compute capabilities the Evoformer kernels cannot target.

    Prunes malformed tokens, anything below Volta (7.0), and anything below
    the resolved kernel family floor. Returns the retained entries as
    [major, minor] pairs (a "+PTX" suffix stays attached to the minor part).
    A warning lists everything that was pruned.
    """
    _, floor = self._resolve_gpu_arch()

    kept = []
    pruned = []
    for raw_token in ccs:
        pair, numeric = self._parse_cc_token(raw_token)
        if pair is None or numeric is None:
            # Malformed token; report it only if it was not pure whitespace.
            if raw_token.strip():
                pruned.append(raw_token.strip())
            continue

        # Volta (7.0) is the hard minimum; the resolved floor may raise it.
        if numeric < 70 or (floor is not None and numeric < floor):
            pruned.append(raw_token.strip())
            continue

        kept.append(pair)

    if pruned:
        if floor is not None:
            self.warning(f"Filtered compute capabilities {pruned} below Evoformer floor {floor}")
        else:
            self.warning(f"Filtered compute capabilities {pruned}")

    return kept

def is_compatible(self, verbose=False):
try:
import torch
Expand Down
108 changes: 108 additions & 0 deletions tests/unit/ops/deepspeed4science/test_evoformer_attn_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from unittest import mock

from deepspeed.ops.op_builder.evoformer_attn import EvoformerAttnBuilder


def _cc_set(ccs):
return {f"{major}.{minor}" for major, minor in ccs}


def test_normalize_gpu_arch_accepts_80_8dot0_sm80(monkeypatch):
    """All spellings of Ampere ("80", "8.0", "sm80") normalize to CC 80."""
    for spelling in ["80", "8.0", "sm80"]:
        monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", spelling)
        builder = EvoformerAttnBuilder()
        assert builder._parse_gpu_arch(builder.gpu_arch) == 80


def test_normalize_gpu_arch_invalid_value_warns_and_falls_back(monkeypatch):
    """An unparseable env value warns and defers to the local device CC."""
    monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", "invalid")
    builder = EvoformerAttnBuilder()
    captured = []
    builder.warning = captured.append

    with mock.patch.object(builder, "_detect_local_gpu_cc", return_value=90):
        resolved = builder._resolve_gpu_arch()

    assert resolved == (90, 80)
    assert any("Invalid DS_EVOFORMER_GPU_ARCH" in message for message in captured)


def test_nvcc_args_uses_normalized_gpu_arch_macro(monkeypatch):
    """nvcc_args must emit the normalized integer macro, never the raw env text."""
    monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", "8.0")
    builder = EvoformerAttnBuilder()

    base_nvcc_args = "deepspeed.ops.op_builder.evoformer_attn.CUDAOpBuilder.nvcc_args"
    with mock.patch(base_nvcc_args, return_value=["-O3"]):
        compiled_args = builder.nvcc_args()

    assert "-DGPU_ARCH=80" in compiled_args
    assert "-DGPU_ARCH=8.0" not in compiled_args


def test_effective_floor_maps_90_to_80():
    """Hopper (9.0) runs the Ampere kernel family, so its floor is 80."""
    floor = EvoformerAttnBuilder._effective_floor_cc(90)
    assert floor == 80


def test_filter_ccs_prunes_below_70_even_without_floor(monkeypatch):
    """Pre-Volta CCs are always dropped, even when no floor was resolved."""
    monkeypatch.delenv("DS_EVOFORMER_GPU_ARCH", raising=False)
    builder = EvoformerAttnBuilder()

    with mock.patch.object(builder, "_resolve_gpu_arch", return_value=(None, None)):
        kept = builder.filter_ccs(["6.1", "7.0", "8.0"])

    assert kept == [["7", "0"], ["8", "0"]]


def test_filter_ccs_gpu_arch80_prunes_70(monkeypatch):
    """With an 80 floor, 7.x targets are pruned while 8.x/9.x survive."""
    monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", "80")
    builder = EvoformerAttnBuilder()

    with mock.patch.object(builder, "_resolve_gpu_arch", return_value=(80, 80)):
        kept = builder.filter_ccs(["8.0", "7.0", "9.0"])

    assert kept == [["8", "0"], ["9", "0"]]


def test_filter_ccs_preserves_ptx_suffix(monkeypatch):
    """A '+PTX' suffix survives filtering, attached to the minor component."""
    monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", "80")
    builder = EvoformerAttnBuilder()

    with mock.patch.object(builder, "_resolve_gpu_arch", return_value=(80, 80)):
        kept = builder.filter_ccs(["9.0+PTX", "8.0", "7.5"])

    assert kept == [["9", "0+PTX"], ["8", "0"]]


def test_filter_ccs_order_independent_result_set(monkeypatch):
    """The retained set of CCs does not depend on input ordering."""
    monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", "80")
    builder = EvoformerAttnBuilder()

    with mock.patch.object(builder, "_resolve_gpu_arch", return_value=(80, 80)):
        forward_order = builder.filter_ccs(["8.0", "7.0", "9.0"])
        shuffled_order = builder.filter_ccs(["7.0", "9.0", "8.0"])

    assert _cc_set(forward_order) == _cc_set(shuffled_order) == {"8.0", "9.0"}


def test_filter_ccs_empty_after_filter(monkeypatch):
    """When every target sits below the floor, the retained list is empty."""
    monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", "80")
    builder = EvoformerAttnBuilder()

    with mock.patch.object(builder, "_resolve_gpu_arch", return_value=(80, 80)):
        kept = builder.filter_ccs(["6.0", "7.0", "7.5"])

    assert kept == []


def test_filter_ccs_handles_whitespace_and_minor_variants(monkeypatch):
    """Padded tokens and non-zero minors are parsed; below-floor ones are pruned."""
    monkeypatch.setenv("DS_EVOFORMER_GPU_ARCH", "80")
    builder = EvoformerAttnBuilder()

    with mock.patch.object(builder, "_resolve_gpu_arch", return_value=(80, 80)):
        kept = builder.filter_ccs([" 8.6 ", " 7.5 ", " 9.0+PTX "])

    assert kept == [["8", "6"], ["9", "0+PTX"]]
Loading