diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h
index 6bfd09bd2a36..6c21b7c8e82d 100644
--- a/csrc/includes/cpu_adagrad.h
+++ b/csrc/includes/cpu_adagrad.h
@@ -1,5 +1,8 @@
 #pragma once

+#define NOMINMAX // Windows idiosyncrasy
+                 // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
+
 #include
 #include
 #include
diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h
index 9a4e80593f21..09677c6842de 100644
--- a/csrc/includes/cpu_adam.h
+++ b/csrc/includes/cpu_adam.h
@@ -1,5 +1,8 @@
 #pragma once

+#define NOMINMAX // Windows idiosyncrasy
+                 // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
+
 #include
 #include
 #include
diff --git a/csrc/sparse_attention/utils.cpp b/csrc/sparse_attention/utils.cpp
index a802025e92ed..8e4346be8a29 100644
--- a/csrc/sparse_attention/utils.cpp
+++ b/csrc/sparse_attention/utils.cpp
@@ -94,14 +94,14 @@ ret_t sdd_segment(torch::Tensor layout, int start_width)
     // block index
     torch::Tensor idx = torch::zeros_like(layout);
     int current = 0;
-    size_t H = layout.size(0);
-    size_t M = layout.size(1);
-    size_t N = layout.size(2);
+    int64_t H = layout.size(0);
+    int64_t M = layout.size(1);
+    int64_t N = layout.size(2);
     auto _layout = layout.accessor();
     auto _idx = idx.accessor();
-    for (size_t h = 0; h < H; h++)
-        for (size_t m = 0; m < M; m++)
-            for (size_t n = 0; n < N; n++) {
+    for (int64_t h = 0; h < H; h++)
+        for (int64_t m = 0; m < M; m++)
+            for (int64_t n = 0; n < N; n++) {
                 if (_layout[h][m][n] == 0) continue;
                 _idx[h][m][n] = current++;
             }
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 34040d33df37..54a179bdd1e5 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -553,3 +553,23 @@ def libraries_args(self):
             return ['cublas', 'curand']
         else:
             return []
+
+
+class TorchCPUOpBuilder(CUDAOpBuilder):
+    def cxx_args(self):
+        import torch
+        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+        CPU_ARCH = self.cpu_arch()
+        SIMD_WIDTH = self.simd_width()
+
+        args = super().cxx_args()
+        args += [
+            f'-L{CUDA_LIB64}',
+            '-lcudart',
+            '-lcublas',
+            '-g',
+            CPU_ARCH,
+            '-fopenmp',
+            SIMD_WIDTH,
+        ]
+        return args
diff --git a/op_builder/cpu_adagrad.py b/op_builder/cpu_adagrad.py
index 68fc78583960..7f2de7b42ecc 100644
--- a/op_builder/cpu_adagrad.py
+++ b/op_builder/cpu_adagrad.py
@@ -4,20 +4,16 @@
 import os
 import sys
 import subprocess
-from .builder import CUDAOpBuilder
+from .builder import TorchCPUOpBuilder


-class CPUAdagradBuilder(CUDAOpBuilder):
+class CPUAdagradBuilder(TorchCPUOpBuilder):
     BUILD_VAR = "DS_BUILD_CPU_ADAGRAD"
     NAME = "cpu_adagrad"

     def __init__(self):
         super().__init__(name=self.NAME)

-    def is_compatible(self, verbose=True):
-        # Disable on Windows.
-        return sys.platform != "win32"
-
     def absolute_name(self):
         return f'deepspeed.ops.adagrad.{self.NAME}_op'

@@ -28,22 +24,3 @@ def include_paths(self):
         import torch
         CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
         return ['csrc/includes', CUDA_INCLUDE]
-
-    def cxx_args(self):
-        import torch
-        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
-        CPU_ARCH = self.cpu_arch()
-        SIMD_WIDTH = self.simd_width()
-
-        return [
-            '-O3',
-            '-std=c++14',
-            f'-L{CUDA_LIB64}',
-            '-lcudart',
-            '-lcublas',
-            '-g',
-            '-Wno-reorder',
-            CPU_ARCH,
-            '-fopenmp',
-            SIMD_WIDTH,
-        ]
diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py
index 8a58756d1fcd..c016124310b7 100644
--- a/op_builder/cpu_adam.py
+++ b/op_builder/cpu_adam.py
@@ -4,20 +4,16 @@
 import os
 import sys
 import subprocess
-from .builder import CUDAOpBuilder
+from .builder import TorchCPUOpBuilder


-class CPUAdamBuilder(CUDAOpBuilder):
+class CPUAdamBuilder(TorchCPUOpBuilder):
     BUILD_VAR = "DS_BUILD_CPU_ADAM"
     NAME = "cpu_adam"

     def __init__(self):
         super().__init__(name=self.NAME)

-    def is_compatible(self, verbose=True):
-        # Disable on Windows.
-        return sys.platform != "win32"
-
     def absolute_name(self):
         return f'deepspeed.ops.adam.{self.NAME}_op'

@@ -28,22 +24,3 @@ def include_paths(self):
         import torch
         CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
         return ['csrc/includes', CUDA_INCLUDE]
-
-    def cxx_args(self):
-        import torch
-        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
-        CPU_ARCH = self.cpu_arch()
-        SIMD_WIDTH = self.simd_width()
-
-        return [
-            '-O3',
-            '-std=c++14',
-            f'-L{CUDA_LIB64}',
-            '-lcudart',
-            '-lcublas',
-            '-g',
-            '-Wno-reorder',
-            CPU_ARCH,
-            '-fopenmp',
-            SIMD_WIDTH,
-        ]