rem is_compiled_with_npu (#52385)
* rem is_compiled_with_npu

* rem npu related code

* make lint happy

* rem test

* remove some tests

* Update grad_scaler.py

* fix an error
Kim Yann authored Apr 6, 2023
1 parent 2acc2b1 commit 7976e2a
Showing 54 changed files with 105 additions and 993 deletions.
3 changes: 0 additions & 3 deletions .flake8
@@ -8,9 +8,6 @@ exclude =
     ./python/paddle/fluid/tra**,
     # Exclude third-party libraries
     ./python/paddle/utils/gast/**,
-    # Exclude files that will be removed in the future, see more at
-    # https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731
-    ./python/paddle/fluid/tests/unittests/npu/**,
 ignore =
     # Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black
     E203,
3 changes: 1 addition & 2 deletions .pre-commit-config.yaml
@@ -4,8 +4,7 @@ exclude: |
     patches/.+|
     paddle/fluid/framework/fleet/heter_ps/cudf/.+|
     paddle/fluid/distributed/ps/thirdparty/round_robin.h|
-    python/paddle/utils/gast/.+|
-    python/paddle/fluid/tests/unittests/npu/.+
+    python/paddle/utils/gast/.+
 )$
 repos:
   # Common hooks
27 changes: 0 additions & 27 deletions paddle/fluid/pybind/pybind.cc
@@ -265,14 +265,6 @@ bool IsCompiledWithROCM() {
 #endif
 }
 
-bool IsCompiledWithAscend() {
-#ifndef PADDLE_WITH_ASCEND
-  return false;
-#else
-  return true;
-#endif
-}
-
 bool IsCompiledWithXPU() {
 #ifndef PADDLE_WITH_XPU
   return false;
@@ -281,8 +273,6 @@ bool IsCompiledWithXPU() {
 #endif
 }
 
-bool IsCompiledWithNPU() { return false; }
-
 bool IsCompiledWithCustomDevice(std::string device_type) {
 #ifndef PADDLE_WITH_CUSTOM_DEVICE
   return false;
@@ -1592,14 +1582,6 @@ All parameter, weight, gradient are variables in Paddle.
         return context;
 #endif
       })
-      .def_static(
-          "create",
-          [](paddle::platform::NPUPlace &place)
-              -> paddle::platform::DeviceContext * {
-            PADDLE_THROW(platform::errors::PermissionDenied(
-                "Cannot use NPUPlace in CPU/GPU/XPU version, "
-                "Please recompile or reinstall Paddle with NPU support."));
-          })
       .def_static("create",
                   [](paddle::platform::CustomPlace &place)
                       -> paddle::platform::DeviceContext * {
@@ -1769,13 +1751,6 @@ All parameter, weight, gradient are variables in Paddle.
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
-      .def("run",
-           [](OperatorBase &self,
-              const Scope &scope,
-              const platform::NPUPlace &place) {
-             pybind11::gil_scoped_release release;
-             self.Run(scope, place);
-           })
       .def("run",
            [](OperatorBase &self,
               const Scope &scope,
@@ -1985,9 +1960,7 @@ All parameter, weight, gradient are variables in Paddle.
       });
   m.def("is_compiled_with_avx", IsCompiledWithAVX);
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
-  m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
-  m.def("is_compiled_with_npu", IsCompiledWithNPU);
   m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
   m.def("is_compiled_with_ipu", IsCompiledWithIPU);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
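
For context, a minimal sketch (not part of this diff) of how a caller can still probe build capabilities once is_compiled_with_ascend and is_compiled_with_npu are gone; it assumes a build that exposes the remaining bindings above, and treats 'npu' purely as a device-type name that may or may not be registered through the custom-device plugin mechanism:

    import paddle

    # Compile-time capability flags that remain bound in pybind.cc after this change.
    print("CUDA build:", paddle.is_compiled_with_cuda())
    print("XPU build:", paddle.device.is_compiled_with_xpu())

    # There is no dedicated NPU flag any more; the assumed replacement is the
    # custom-device query, keyed by the plugin's device-type name.
    print("custom 'npu' plugin:", paddle.device.is_compiled_with_custom_device("npu"))
    print("registered custom device types:", paddle.device.get_all_custom_device_type())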
2 changes: 0 additions & 2 deletions pyproject.toml
@@ -14,7 +14,6 @@ extend_skip_glob = [
     "python/paddle/fluid/[!t]**",
     "python/paddle/fluid/tra**",
     "python/paddle/utils/gast/**",
-    "python/paddle/fluid/tests/unittests/npu/**",
 ]
 
 [tool.ruff]
@@ -23,7 +22,6 @@ exclude = [
     "./python/paddle/fluid/[!t]**",
     "./python/paddle/fluid/tra**",
     "./python/paddle/utils/gast/**",
-    "./python/paddle/fluid/tests/unittests/npu/**",
 ]
 target-version = "py37"
 select = [
3 changes: 0 additions & 3 deletions python/paddle/__init__.py
@@ -334,7 +334,6 @@
 from .framework import CPUPlace  # noqa: F401
 from .framework import IPUPlace  # noqa: F401
 from .framework import CUDAPlace  # noqa: F401
-from .framework import NPUPlace  # noqa: F401
 from .framework import CUDAPinnedPlace  # noqa: F401
 from .framework import CustomPlace  # noqa: F401
 
@@ -363,7 +362,6 @@
 from .device import set_device  # noqa: F401
 from .device import get_device  # noqa: F401
 from .device import is_compiled_with_xpu  # noqa: F401
-from .device import is_compiled_with_npu  # noqa: F401
 from .device import is_compiled_with_ipu  # noqa: F401
 from .device import is_compiled_with_cinn  # noqa: F401
 from .device import is_compiled_with_cuda  # noqa: F401
@@ -512,7 +510,6 @@
     'histogram',
     'multiplex',
     'CUDAPlace',
-    'NPUPlace',
     'empty',
     'shape',
     'real',
5 changes: 0 additions & 5 deletions python/paddle/amp/auto_cast.py
@@ -344,18 +344,13 @@ def amp_guard(
     if enable and not (
         tracer._expected_place.is_gpu_place()
         or tracer._expected_place.is_xpu_place()
-        or tracer._expected_place.is_npu_place()
         or tracer._expected_place.is_custom_place()
     ):
         warnings.warn(
             'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.'
             % tracer._expected_place
         )
         enable = False
-    # For npu:
-    if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'):
-        warnings.warn('NPUPlace only support float16 amp.')
-        enable = False
     # For xpu:
     if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
         warnings.warn('XPUPlace only support float16 amp.')
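
For reference, a hedged usage sketch of the public paddle.amp.auto_cast API that sits on top of amp_guard in this file; it assumes a CUDA/XPU/custom-device build, since on plain CPUPlace the guard warns and disables itself exactly as in the check above:

    import paddle

    model = paddle.nn.Linear(4, 4)
    x = paddle.rand([2, 4])

    # float16 autocast; on an unsupported place this becomes a no-op with a warning.
    with paddle.amp.auto_cast(enable=True, dtype='float16'):
        out = model(x)  # linear/matmul run in float16 where the place supports it
    loss = out.mean()
    loss.backward()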
101 changes: 31 additions & 70 deletions python/paddle/amp/grad_scaler.py
@@ -105,11 +105,10 @@ def __init__(
         if enable and not (
             tracer._expected_place.is_gpu_place()
             or tracer._expected_place.is_xpu_place()
-            or tracer._expected_place.is_npu_place()
             or tracer._expected_place.is_custom_place()
         ):
             warnings.warn(
-                'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.'
+                'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.'
                 % tracer._expected_place
             )
             enable = False
@@ -326,74 +325,36 @@ def _unscale(self, optimizer):
             if param.dtype == core.VarDesc.VarType.FP32
         ]
         self._found_inf = self._temp_found_inf_value_false
-        if core.is_compiled_with_npu():
-            float_status = _legacy_C_ops.alloc_float_status()
-            _legacy_C_ops.clear_float_status(float_status, float_status)
-
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    float_status,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    float_status,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    float_status,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
-        else:
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
+        if len(param_grads_fp16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp16,
+                self._scale,
+                param_grads_fp16,
+                self._temp_found_inf_fp16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp16
+            )
+        if len(param_grads_bf16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_bf16,
+                self._scale,
+                param_grads_bf16,
+                self._temp_found_inf_bf16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_bf16
+            )
+        if len(param_grads_fp32):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp32,
+                self._scale,
+                param_grads_fp32,
+                self._temp_found_inf_fp32,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp32
+            )
 
         optimizer_state["state"] = OptimizerState.UNSCALED
 
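
For reference, a hedged sketch of dynamic-graph loss scaling with paddle.amp.GradScaler, whose _unscale step is simplified above; it assumes a CUDA/XPU/custom-device build, since on CPUPlace the scaler warns and disables itself in __init__:

    import paddle

    model = paddle.nn.Linear(4, 4)
    opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    x = paddle.rand([2, 4])
    with paddle.amp.auto_cast():
        loss = model(x).mean()

    scaled = scaler.scale(loss)  # scale the loss so float16 gradients stay representable
    scaled.backward()
    scaler.step(opt)             # runs _unscale(): one check_finite_and_unscale per dtype group
    scaler.update()              # adjusts the loss scaling from the OR-ed found_inf flags
    opt.clear_grad()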
64 changes: 4 additions & 60 deletions python/paddle/device/__init__.py
@@ -36,7 +36,6 @@
     'is_compiled_with_cinn',
     'is_compiled_with_cuda',
     'is_compiled_with_rocm',
-    'is_compiled_with_npu',
     'is_compiled_with_custom_device',
     'get_all_device_type',
     'get_all_custom_device_type',
@@ -53,24 +52,6 @@
 _cudnn_version = None
 
 
-# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future
-# for consistent.
-def is_compiled_with_npu():
-    """
-    Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.
-
-    Return:
-        bool, ``True`` if NPU is supported, otherwise ``False``.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            support_npu = paddle.device.is_compiled_with_npu()
-    """
-    return core.is_compiled_with_npu()
-
-
 def is_compiled_with_custom_device(device_type):
     """
     Whether paddle was built with Paddle_CUSTOM_DEVICE .
@@ -210,15 +191,6 @@ def _convert_to_place(device):
         selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
         device_id = int(selected_xpus[0])
         place = core.XPUPlace(device_id)
-    elif lower_device == 'npu':
-        if not core.is_compiled_with_npu():
-            raise ValueError(
-                "The device should not be 'npu', "
-                "since PaddlePaddle is not compiled with NPU"
-            )
-        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
-        device_id = int(selected_npus[0])
-        place = core.NPUPlace(device_id)
     elif lower_device == 'ipu':
         if not core.is_compiled_with_ipu():
             raise ValueError(
@@ -229,7 +201,6 @@ def _convert_to_place(device):
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
-        avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
                 raise ValueError(
@@ -250,31 +221,7 @@ def _convert_to_place(device):
             device_id = device_info_list[1]
             device_id = int(device_id)
             place = core.XPUPlace(device_id)
-        if avaliable_npu_device:
-            if not core.is_compiled_with_npu():
-                device_info_list = device.split(':', 1)
-                device_type = device_info_list[0]
-                if device_type in core.get_all_custom_device_type():
-                    device_id = device_info_list[1]
-                    device_id = int(device_id)
-                    place = core.CustomPlace(device_type, device_id)
-                    return place
-                else:
-                    raise ValueError(
-                        "The device should not be {}, since PaddlePaddle is "
-                        "not compiled with NPU or compiled with custom device".format(
-                            avaliable_npu_device
-                        )
-                    )
-            device_info_list = device.split(':', 1)
-            device_id = device_info_list[1]
-            device_id = int(device_id)
-            place = core.NPUPlace(device_id)
-        if (
-            not avaliable_gpu_device
-            and not avaliable_xpu_device
-            and not avaliable_npu_device
-        ):
+        if not avaliable_gpu_device and not avaliable_xpu_device:
             device_info_list = device.split(':', 1)
             device_type = device_info_list[0]
             if device_type in core.get_all_custom_device_type():
@@ -346,9 +293,6 @@ def get_device():
     elif isinstance(place, core.XPUPlace):
         device_id = place.get_device_id()
         device = 'xpu:' + str(device_id)
-    elif isinstance(place, core.NPUPlace):
-        device_id = place.get_device_id()
-        device = 'npu:' + str(device_id)
     elif isinstance(place, core.IPUPlace):
         num_devices = core.get_ipu_device_count()
         device = f"ipus:{{0-{num_devices - 1}}}"
@@ -469,7 +413,7 @@ class Event:
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
             It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
         enable_timing (bool, optional): indicates if the event should measure time, default is False
         blocking (bool, optional): if True, ``wait`` will be blocking, default is False
         interprocess (bool): if True, the event can be shared between processes, default is False
@@ -614,7 +558,7 @@ class Stream:
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
            It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
         priority(int, optional): priority of the CUDA stream. Can be either
             1 (high priority) or 2 (low priority). By default, streams have
             priority 2.
@@ -936,7 +880,7 @@ def synchronize(device=None):
     Parameters:
         device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None.
            It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
-            where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
+            where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
     Examples:
         .. code-block:: python
 
            # required: custom_device
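
For context, a hedged sketch of device selection now that the built-in 'npu' branches in _convert_to_place and get_device are gone; it assumes an Ascend build would register 'npu' as a custom (plugin) device type, which is the only remaining path for such device strings:

    import paddle

    print(paddle.device.get_all_custom_device_type())  # e.g. [] on a stock CPU/GPU build

    paddle.device.set_device('cpu')    # always available
    print(paddle.device.get_device())  # -> 'cpu'

    # 'npu:0' is no longer special-cased; it resolves only if an 'npu' plugin is
    # registered, in which case _convert_to_place builds core.CustomPlace('npu', 0).
    if paddle.device.is_compiled_with_custom_device('npu'):
        paddle.device.set_device('npu:0')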
5 changes: 0 additions & 5 deletions python/paddle/distributed/collective.py
@@ -288,11 +288,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
             core.NCCLParallelContext(strategy, place).init_with_ring_id(
                 ring_id
             )
-        elif core.is_compiled_with_npu():
-            place = core.NPUPlace(genv.device_id)
-            core.HCCLParallelContext(strategy, place).init_with_ring_id(
-                ring_id
-            )
         elif core.is_compiled_with_xpu():
             place = core.XPUPlace(genv.device_id)
             core.BKCLParallelContext(strategy, place).init_with_ring_id(
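
For context, a hedged sketch of creating a communication group after the HCCL branch is removed; on this path new_group falls back to NCCL on CUDA builds, BKCL on XPU builds, or a custom-device context. It is not runnable standalone and assumes a two-GPU launch, e.g. via python -m paddle.distributed.launch:

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()              # set up the distributed environment
    group = dist.new_group(ranks=[0, 1])  # picks the parallel context as in the branch above
    x = paddle.to_tensor([float(dist.get_rank())])
    dist.all_reduce(x, group=group)       # sums the rank values across the group
    print(x.numpy())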
(Diffs for the remaining changed files are not shown here.)