Skip to content

Commit

Permalink
enable async io op on powerpc architectures (#1224)
Browse files Browse the repository at this point in the history
* enable async io op on powerpc architectures

* drop any empty strings returned by cxx_args

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
  • Loading branch information
adammoody and tjruwase authored Jul 15, 2021
1 parent 3fa2420 commit 89b0fb4
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 61 deletions.
2 changes: 2 additions & 0 deletions csrc/aio/py_lib/deepspeed_py_copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ Licensed under the MIT license.
Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
*/

#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif

#include <deepspeed_aio_common.h>
#include <stdlib.h>
Expand Down
13 changes: 5 additions & 8 deletions op_builder/async_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,22 @@ def include_paths(self):
return ['csrc/aio/py_lib', 'csrc/aio/common']

def cxx_args(self):
args = [
CPU_ARCH = self.cpu_arch()
SIMD_WIDTH = self.simd_width()
return [
'-g',
'-Wall',
'-O0',
'-std=c++14',
'-shared',
'-fPIC',
'-Wno-reorder',
'-march=native',
CPU_ARCH,
'-fopenmp',
SIMD_WIDTH,
'-laio',
]

simd_width = self.simd_width()
if len(simd_width) > 0:
args.append(simd_width)

return args

def extra_ldflags(self):
    '''
    Return the extra linker flags for this op.

    The only flag is ``-laio``, which links the extension against libaio.
    '''
    ldflags = ['-laio']
    return ldflags

Expand Down
64 changes: 44 additions & 20 deletions op_builder/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,34 @@ def libraries_installed(self, libraries):
valid = valid or result.wait() == 0
return valid

def strip_empty_entries(self, args):
    '''
    Drop any empty strings from the list of compile and link flags.

    Returns a new list holding, in their original order, only the entries
    of ``args`` whose length is non-zero; ``args`` itself is not modified.
    '''
    kept = []
    for entry in args:
        if len(entry) > 0:
            kept.append(entry)
    return kept

def cpu_arch(self):
    '''
    Return the compiler flag used to target the host CPU.

    The output of 'lscpu' is inspected: when it mentions 'ppc64le' the
    result is '-mcpu=native', otherwise '-march=native'. If 'lscpu' is not
    available a warning is issued and '-march=native' is returned.
    '''
    default_flag = '-march=native'

    if self.command_exists('lscpu'):
        raw_output = subprocess.check_output('lscpu', shell=True)
        lscpu_text = raw_output.decode('utf-8').strip().lower()
        # gcc does not provide -march on PowerPC, use -mcpu instead
        return '-mcpu=native' if 'ppc64le' in lscpu_text else default_flag

    self.warning(
        f"{self.name} attempted to query 'lscpu' to detect the CPU architecture. "
        "However, 'lscpu' does not appear to exist on "
        "your system, will fall back to use -march=native.")
    return default_flag

def simd_width(self):
if not self.command_exists('lscpu'):
self.warning(
f"{self.name} is attempted to query 'lscpu' to detect the existence "
f"{self.name} attempted to query 'lscpu' to detect the existence "
"of AVX instructions. However, 'lscpu' does not appear to exist on "
"your system, will fall back to non-vectorized execution.")
return ''
return '-D__SCALAR__'

result = subprocess.check_output('lscpu', shell=True)
result = result.decode('utf-8').strip().lower()
Expand All @@ -175,7 +196,7 @@ def simd_width(self):
return '-D__AVX512__'
elif 'avx2' in result:
return '-D__AVX256__'
return ''
return '-D__SCALAR__'

def python_requirements(self):
'''
Expand Down Expand Up @@ -220,11 +241,12 @@ def deepspeed_src_path(self, code_path):

def builder(self):
from torch.utils.cpp_extension import CppExtension
return CppExtension(name=self.absolute_name(),
sources=self.sources(),
include_dirs=self.include_paths(),
extra_compile_args={'cxx': self.cxx_args()},
extra_link_args=self.extra_ldflags())
return CppExtension(
name=self.absolute_name(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
extra_link_args=self.strip_empty_entries(self.extra_ldflags()))

def load(self, verbose=True):
from ...git_version_info import installed_ops, torch_info
Expand Down Expand Up @@ -264,15 +286,17 @@ def jit_load(self, verbose=True):
os.makedirs(ext_path, exist_ok=True)

start_build = time.time()
sources = [self.deepspeed_src_path(path) for path in self.sources()]
extra_include_paths = [
self.deepspeed_src_path(path) for path in self.include_paths()
]
op_module = load(
name=self.name,
sources=[self.deepspeed_src_path(path) for path in self.sources()],
extra_include_paths=[
self.deepspeed_src_path(path) for path in self.include_paths()
],
extra_cflags=self.cxx_args(),
extra_cuda_cflags=self.nvcc_args(),
extra_ldflags=self.extra_ldflags(),
sources=self.strip_empty_entries(sources),
extra_include_paths=self.strip_empty_entries(extra_include_paths),
extra_cflags=self.strip_empty_entries(self.cxx_args()),
extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
verbose=verbose)
build_duration = time.time() - start_build
if verbose:
Expand Down Expand Up @@ -356,12 +380,12 @@ def builder(self):
from torch.utils.cpp_extension import CUDAExtension
assert_no_cuda_mismatch()
return CUDAExtension(name=self.absolute_name(),
sources=self.sources(),
include_dirs=self.include_paths(),
libraries=self.libraries_args(),
sources=self.strip_empty_entries(self.sources()),
include_dirs=self.strip_empty_entries(self.include_paths()),
libraries=self.strip_empty_entries(self.libraries_args()),
extra_compile_args={
'cxx': self.cxx_args(),
'nvcc': self.nvcc_args()
'cxx': self.strip_empty_entries(self.cxx_args()),
'nvcc': self.strip_empty_entries(self.nvcc_args())
})

def cxx_args(self):
Expand Down
34 changes: 1 addition & 33 deletions op_builder/cpu_adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,38 +29,6 @@ def include_paths(self):
CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
return ['csrc/includes', CUDA_INCLUDE]

def cpu_arch(self):
    '''
    Return the compiler flag used to target the host CPU.

    The output of 'lscpu' is inspected: when it mentions 'ppc64le' the
    result is '-mcpu=native', otherwise '-march=native'. If 'lscpu' is not
    available a warning is issued and '-march=native' is returned.
    '''
    if not self.command_exists('lscpu'):
        self.warning(
            "CPUAdam attempted to query 'lscpu' to detect the CPU architecture. "
            "However, 'lscpu' does not appear to exist on "
            "your system, will fall back to use -march=native.")
        # Honor the fallback announced in the warning instead of handing
        # an empty string to the compiler argument list.
        return '-march=native'

    result = subprocess.check_output('lscpu', shell=True)
    result = result.decode('utf-8').strip().lower()
    if 'ppc64le' in result:
        # gcc does not provide -march on PowerPC, use -mcpu instead
        return '-mcpu=native'
    return '-march=native'

def simd_width(self):
    '''
    Return the preprocessor define that selects the SIMD code path.

    The output of 'lscpu' is inspected: when it mentions 'genuineintel'
    together with 'avx512' the result is '-D__AVX512__', with 'avx2' it is
    '-D__AVX256__'. In every other case, including when 'lscpu' is not
    available (a warning is issued then), the result is '-D__SCALAR__'.
    '''
    if not self.command_exists('lscpu'):
        self.warning(
            "CPUAdam attempted to query 'lscpu' to detect the existence "
            "of AVX instructions. However, 'lscpu' does not appear to exist on "
            "your system, will fall back to non-vectorized execution.")
        # Use the same scalar define as the no-AVX path below, so the
        # announced non-vectorized fallback is actually selected.
        return '-D__SCALAR__'

    result = subprocess.check_output('lscpu', shell=True)
    result = result.decode('utf-8').strip().lower()
    if 'genuineintel' in result:
        if 'avx512' in result:
            return '-D__AVX512__'
        elif 'avx2' in result:
            return '-D__AVX256__'
    return '-D__SCALAR__'

def cxx_args(self):
import torch
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
Expand All @@ -77,5 +45,5 @@ def cxx_args(self):
'-Wno-reorder',
CPU_ARCH,
'-fopenmp',
SIMD_WIDTH
SIMD_WIDTH,
]

0 comments on commit 89b0fb4

Please sign in to comment.