support CPU Adam and Adagrad on Windows with SDK 10.0.22000 (#1634)
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
3 people authored Jan 5, 2022
1 parent b6f0ac9 commit 74493b2
Showing 6 changed files with 36 additions and 56 deletions.
3 changes: 3 additions & 0 deletions csrc/includes/cpu_adagrad.h
@@ -1,5 +1,8 @@
 #pragma once

+#define NOMINMAX // Windows idiosyncrasy
+// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
+
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>
 #include <stdio.h>
3 changes: 3 additions & 0 deletions csrc/includes/cpu_adam.h
@@ -1,5 +1,8 @@
 #pragma once

+#define NOMINMAX // Windows idiosyncrasy
+// https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c
+
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>
 #include <stdio.h>
12 changes: 6 additions & 6 deletions csrc/sparse_attention/utils.cpp
@@ -94,14 +94,14 @@ ret_t sdd_segment(torch::Tensor layout, int start_width)
     // block index
     torch::Tensor idx = torch::zeros_like(layout);
     int current = 0;
-    size_t H = layout.size(0);
-    size_t M = layout.size(1);
-    size_t N = layout.size(2);
+    int64_t H = layout.size(0);
+    int64_t M = layout.size(1);
+    int64_t N = layout.size(2);
     auto _layout = layout.accessor<int, 3>();
     auto _idx = idx.accessor<int, 3>();
-    for (size_t h = 0; h < H; h++)
-        for (size_t m = 0; m < M; m++)
-            for (size_t n = 0; n < N; n++) {
+    for (int64_t h = 0; h < H; h++)
+        for (int64_t m = 0; m < M; m++)
+            for (int64_t n = 0; n < N; n++) {
                 if (_layout[h][m][n] == 0) continue;
                 _idx[h][m][n] = current++;
             }
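The hunk above only changes the loop-variable types from size_t to int64_t, matching the int64_t that torch::Tensor::size() returns and avoiding signed/unsigned conversion warnings under MSVC; the logic is unchanged. For readers skimming the diff, a hedged Python rendering of what the loop computes (the toy layout tensor is an assumption, not DeepSpeed code):

# Hedged sketch: assign a running block index to every nonzero entry
# of an (H, M, N) sparsity layout, mirroring the C++ loop above.
import torch

layout = (torch.rand(2, 4, 4) > 0.5).int()  # toy layout, an assumption
idx = torch.zeros_like(layout)
current = 0
for h in range(layout.size(0)):
    for m in range(layout.size(1)):
        for n in range(layout.size(2)):
            if layout[h, m, n] == 0:
                continue
            idx[h, m, n] = current
            current += 1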
20 changes: 20 additions & 0 deletions op_builder/builder.py
@@ -553,3 +553,23 @@ def libraries_args(self):
             return ['cublas', 'curand']
         else:
             return []
+
+
+class TorchCPUOpBuilder(CUDAOpBuilder):
+    def cxx_args(self):
+        import torch
+        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+        CPU_ARCH = self.cpu_arch()
+        SIMD_WIDTH = self.simd_width()
+
+        args = super().cxx_args()
+        args += [
+            f'-L{CUDA_LIB64}',
+            '-lcudart',
+            '-lcublas',
+            '-g',
+            CPU_ARCH,
+            '-fopenmp',
+            SIMD_WIDTH,
+        ]
+        return args
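The new base class exists so that cpu_adam and cpu_adagrad stop duplicating their compiler flags: TorchCPUOpBuilder extends whatever super().cxx_args() supplies instead of restating it, as the two files below show. A minimal self-contained sketch of that composition pattern, using stand-in classes with illustrative flags (assumptions, not DeepSpeed code):

# Sketch of the cxx_args composition pattern with stand-in classes.
class BaseBuilderSketch:
    def cxx_args(self):
        # stand-in for CUDAOpBuilder.cxx_args(); flags are illustrative
        return ['-O3', '-std=c++14', '-Wno-reorder']


class CPUOpBuilderSketch(BaseBuilderSketch):
    def cxx_args(self):
        args = super().cxx_args()  # inherit the shared flags once
        args += ['-fopenmp']       # append only CPU-op specifics
        return args


print(CPUOpBuilderSketch().cxx_args())
# ['-O3', '-std=c++14', '-Wno-reorder', '-fopenmp']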
27 changes: 2 additions & 25 deletions op_builder/cpu_adagrad.py
@@ -4,20 +4,16 @@
 import os
 import sys
 import subprocess
-from .builder import CUDAOpBuilder
+from .builder import TorchCPUOpBuilder


-class CPUAdagradBuilder(CUDAOpBuilder):
+class CPUAdagradBuilder(TorchCPUOpBuilder):
     BUILD_VAR = "DS_BUILD_CPU_ADAGRAD"
     NAME = "cpu_adagrad"

     def __init__(self):
         super().__init__(name=self.NAME)

-    def is_compatible(self, verbose=True):
-        # Disable on Windows.
-        return sys.platform != "win32"
-
     def absolute_name(self):
         return f'deepspeed.ops.adagrad.{self.NAME}_op'

@@ -28,22 +24,3 @@ def include_paths(self):
         import torch
         CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
         return ['csrc/includes', CUDA_INCLUDE]
-
-    def cxx_args(self):
-        import torch
-        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
-        CPU_ARCH = self.cpu_arch()
-        SIMD_WIDTH = self.simd_width()
-
-        return [
-            '-O3',
-            '-std=c++14',
-            f'-L{CUDA_LIB64}',
-            '-lcudart',
-            '-lcublas',
-            '-g',
-            '-Wno-reorder',
-            CPU_ARCH,
-            '-fopenmp',
-            SIMD_WIDTH,
-        ]
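With the hard-coded Windows rejection and the duplicated cxx_args deleted, the builder behaves the same on every platform. A hedged usage sketch, assuming the builder API visible in this diff (is_compatible()) plus the load() method DeepSpeed op builders expose:

# Hedged sketch: JIT-building the CPU Adagrad op through its builder.
from deepspeed.ops.op_builder import CPUAdagradBuilder

builder = CPUAdagradBuilder()
if builder.is_compatible():          # no longer hard-coded to fail on win32
    cpu_adagrad_op = builder.load()  # compiles with the inherited cxx_args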
27 changes: 2 additions & 25 deletions op_builder/cpu_adam.py
@@ -4,20 +4,16 @@
 import os
 import sys
 import subprocess
-from .builder import CUDAOpBuilder
+from .builder import TorchCPUOpBuilder


-class CPUAdamBuilder(CUDAOpBuilder):
+class CPUAdamBuilder(TorchCPUOpBuilder):
     BUILD_VAR = "DS_BUILD_CPU_ADAM"
     NAME = "cpu_adam"

     def __init__(self):
         super().__init__(name=self.NAME)

-    def is_compatible(self, verbose=True):
-        # Disable on Windows.
-        return sys.platform != "win32"
-
     def absolute_name(self):
         return f'deepspeed.ops.adam.{self.NAME}_op'

@@ -28,22 +24,3 @@ def include_paths(self):
         import torch
         CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
        return ['csrc/includes', CUDA_INCLUDE]
-
-    def cxx_args(self):
-        import torch
-        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
-        CPU_ARCH = self.cpu_arch()
-        SIMD_WIDTH = self.simd_width()
-
-        return [
-            '-O3',
-            '-std=c++14',
-            f'-L{CUDA_LIB64}',
-            '-lcudart',
-            '-lcublas',
-            '-g',
-            '-Wno-reorder',
-            CPU_ARCH,
-            '-fopenmp',
-            SIMD_WIDTH,
-        ]
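The net effect of the commit is what the title states: the CPU Adam op now builds on Windows with SDK 10.0.22000. A hedged end-to-end sketch, assuming the public DeepSpeedCPUAdam optimizer that wraps this op and a CUDA-enabled PyTorch install (the builder links against cudart/cublas):

# Hedged sketch: exercising the cpu_adam op via DeepSpeedCPUAdam,
# which JIT-builds the op on first use.
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

model = torch.nn.Linear(8, 2)
optimizer = DeepSpeedCPUAdam(model.parameters(), lr=1e-3)

loss = model(torch.randn(4, 8)).sum()
loss.backward()
optimizer.step()  # Adam update runs in the vectorized CPU kernel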
