[SYCL] fix error when set main gpu to non-zero #5901

Merged · 2 commits · Mar 7, 2024
Changes from 1 commit
154 changes: 94 additions & 60 deletions ggml-sycl.cpp
@@ -3559,12 +3559,31 @@ class sycl_gpu_mgr {
int work_group_size = 0;
std::string gpus_list = "";

/*
Use all GPUs with the same top max compute units
*/
sycl_gpu_mgr() {
detect_sycl_gpu_list_with_max_cu();
get_allow_gpus();
create_context_with_gpus();
}

/*
Use only the assigned GPU
*/
sycl_gpu_mgr(int main_gpu_id) {
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
dpct::device_info prop;
dpct::get_device_info(prop, device);
gpus.push_back(main_gpu_id);
devices.push_back(device);
work_group_size = prop.get_max_work_group_size();
max_compute_units = prop.get_max_compute_units();

get_allow_gpus();
create_context_with_gpus();
}

void create_context_with_gpus() {
sycl::context ctx = sycl::context(devices);
assert(gpus.size() > 0);
@@ -3580,7 +3599,7 @@ class sycl_gpu_mgr {
gpus_list += std::to_string(gpus[i]);
gpus_list += ",";
}
if (gpus_list.length() > 2) {
if (gpus_list.length() > 1) {
gpus_list.pop_back();
}
}
@@ -3629,8 +3648,8 @@ class sycl_gpu_mgr {
if (gpus[i] == id)
return i;
}
assert(false);
return -1;
printf("miss to get device index by id=%d\n", id);
GGML_ASSERT(false);
}

int get_next_index(int id) {
@@ -3639,8 +3658,7 @@ class sycl_gpu_mgr {
if (gpus[i] == id)
return i;
}
assert(false);
return -1;
GGML_ASSERT(false);
}
};
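For orientation, a rough sketch of how the two constructors are meant to be used (these call sites are illustrative, not part of the diff):

    // Default path: pool all GPUs that share the top max-compute-units value.
    g_sycl_gpu_mgr = new sycl_gpu_mgr();

    // Single-device path: restrict the pool to one SYCL device id,
    // e.g. the id passed via --main-gpu (3 here is hypothetical).
    delete g_sycl_gpu_mgr;
    g_sycl_gpu_mgr = new sycl_gpu_mgr(3);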

@@ -3649,6 +3667,7 @@ static int g_device_count = -1;
static int g_all_sycl_device_count = -1;
static int g_main_device = -1;
static int g_main_device_id = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};

@@ -13191,7 +13210,7 @@ void ggml_backend_sycl_print_sycl_devices() {
}

void print_gpu_device_list() {
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
fprintf(stderr, "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n",
g_sycl_gpu_mgr->get_gpu_count(),
g_sycl_gpu_mgr->gpus_list.c_str(),
g_sycl_gpu_mgr->max_compute_units);
@@ -13230,6 +13249,15 @@ void ggml_init_sycl() try {
#else
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
#endif

/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
#else
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/

if (CHECK_TRY_ERROR(g_all_sycl_device_count =
dpct::dev_mgr::instance().device_count()) != 0) {
initialized = true;
@@ -13238,68 +13266,61 @@ void ggml_init_sycl() try {
}
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
ggml_backend_sycl_print_sycl_devices();

if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
print_gpu_device_list();
initialized = true;
g_sycl_loaded = true;
}

g_device_count = g_sycl_gpu_mgr->get_gpu_count();
g_work_group_size = g_sycl_gpu_mgr->work_group_size;

print_gpu_device_list();

int64_t total_vram = 0;
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
g_work_group_size = g_sycl_gpu_mgr->work_group_size;

/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
#else
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
#endif
*/
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
g_device_caps[id].vmm = 0;
g_device_caps[id].device_id = -1;
g_device_caps[id].cc = 0;
g_tensor_split[id] = 0;
g_default_tensor_split[id] = 0;
}
int64_t total_vram = 0;

for (int i = 0; i < g_device_count; ++i) {
int device_id = g_sycl_gpu_mgr->gpus[i];
g_device_caps[i].vmm = 0;

dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(device_id))));
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
g_device_caps[id].vmm = 0;
g_device_caps[id].device_id = -1;
g_device_caps[id].cc = 0;
g_tensor_split[id] = 0;
g_default_tensor_split[id] = 0;
}

g_default_tensor_split[i] = total_vram;
total_vram += prop.get_global_mem_size();
for (int i = 0; i < g_device_count; ++i) {
int device_id = g_sycl_gpu_mgr->gpus[i];
g_device_caps[i].vmm = 0;

g_device_caps[i].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
}
dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
prop, dpct::dev_mgr::instance().get_device(device_id))));

for (int i = 0; i < g_device_count; ++i) {
g_default_tensor_split[i] /= total_vram;
}
g_default_tensor_split[i] = total_vram;
total_vram += prop.get_global_mem_size();

for (int i = 0; i < g_device_count; ++i) {
SYCL_CHECK(ggml_sycl_set_device(i));
g_device_caps[i].cc =
100 * prop.get_major_version() + 10 * prop.get_minor_version();
}

// create sycl streams
for (int is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(
g_syclStreams[i][is] =
dpct::get_current_device().create_queue(
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
}
for (int i = 0; i < g_device_count; ++i) {
g_default_tensor_split[i] /= total_vram;
}

for (int i = 0; i < g_device_count; ++i) {
SYCL_CHECK(ggml_sycl_set_device(i));

const dpct::queue_ptr stream = g_syclStreams[i][0];
// create sycl handle
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
// create sycl streams
for (int is = 0; is < MAX_STREAMS; ++is) {
SYCL_CHECK(CHECK_TRY_ERROR(
g_syclStreams[i][is] =
dpct::get_current_device().create_queue(
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
}

initialized = true;
g_sycl_loaded = true;
const dpct::queue_ptr stream = g_syclStreams[i][0];
// create sycl handle
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
}
}
catch (sycl::exception const &exc) {
@@ -16699,22 +16720,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .is_host = */ nullptr,
};

ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
if (device_index>=g_device_count or device_index<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device_index, g_device_count-1);
GGML_ASSERT(device_index<g_device_count);
}
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];

static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
if (!g_ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < g_device_count; i++) {
ggml_backend_sycl_buffer_types[i] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
};
}
ggml_backend_sycl_buffer_type_initialized = true;
g_ggml_backend_sycl_buffer_type_initialized = true;
}

return &ggml_backend_sycl_buffer_types[device];
return &ggml_backend_sycl_buffer_types[device_index];
}

// sycl split buffer type
@@ -17463,6 +17486,17 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
return g_sycl_gpu_mgr->get_index(device_id);
}

GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
if (g_sycl_gpu_mgr) {
delete g_sycl_gpu_mgr;
}
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
ggml_init_sycl();
g_ggml_backend_sycl_buffer_type_initialized = false;
}
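A minimal caller sketch (device id 3 stands in for a hypothetical dGPU id, as reported in #5513):

    ggml_backend_sycl_set_single_device(3);           // rebuild the pool: gpus = {3}
    int idx = ggml_backend_sycl_get_device_index(3);  // returns 0, the only entry in the new pool
    ggml_backend_buffer_type_t buft = ggml_backend_sycl_buffer_type(idx);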

extern "C" int ggml_backend_sycl_reg_devices();

int ggml_backend_sycl_reg_devices() {
1 change: 1 addition & 0 deletions ggml-sycl.h
@@ -28,6 +28,7 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu);

#ifdef __cplusplus
}
16 changes: 12 additions & 4 deletions llama.cpp
@@ -3750,6 +3750,14 @@ static bool llm_load_tensors(
model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;

#ifdef GGML_USE_SYCL
if (split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_sycl_set_single_device(main_gpu);
//SYCL uses device indices (0, 1, 2), instead of device ids.
main_gpu = ggml_backend_sycl_get_device_index(main_gpu);
}
#endif
Comment on lines +3753 to +3759
Member:

Backends should not require calling backend-specific functions for normal usage. Is this ggml_backend_sycl_set_single_device function really necessary?

Collaborator:

I guess the author wants to distinguish iGPU and dGPU and offload to the dGPU when no device is specified, so backends need to query the SYCL backend for the device list and pick the most powerful one. Do you have any suggestions?

Member:

Could the dGPU always be mapped to the device index zero? That way, it would be used by default in single GPU mode.

Collaborator:

> Could the dGPU always be mapped to the device index zero? That way, it would be used by default in single GPU mode.

Yes, this should be the default. But we encountered #5513, where the user reported the dGPU mapped to index 3.

Member:

Ideally, the device indices that the SYCL backend uses for its buffers and backends would be automatically ordered by the most powerful GPUs available in the system, such that the lowest indices are the most powerful GPUs. If this is not possible or desirable, it would still be ok to add a function that returns the list of available GPUs ordered by the most powerful first. Then, in llama.cpp we can use that list as a lookup table to convert the device indices used in llama.cpp to the device indices passed to the SYCL backend. This translation between llama.cpp device indices and SYCL backend device indices could be implemented in the llama_default_buffer_type_offload function for the buffers, and during the backend instance creation in llama_new_context_with_model.

Collaborator Author:

The SYCL backend creates the GPU list with the most powerful GPUs at initialization. When llama.cpp passes the split-mode and main-gpu parameters, the SYCL backend updates the GPU list (see the sketch after this list):

  1. If split-mode is none, the GPU list is replaced by a new list containing only the main-gpu device.
  2. If split-mode is layer or row, the GPU list is not changed.
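A compact sketch of that rule (pseudocode mirroring the llama.cpp hunk above):

    if (split_mode == LLAMA_SPLIT_MODE_NONE) {
        ggml_backend_sycl_set_single_device(main_gpu);  // pool := { main_gpu }
    }
    // LLAMA_SPLIT_MODE_LAYER / LLAMA_SPLIT_MODE_ROW: keep the pool detected at init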

Collaborator Author:

A better method would be to ask the user to provide a GPU list to llama.cpp as a parameter. ggml would just create the GPU pool from that parameter, avoiding the mistakes of automatic detection.
It would also allow more features, like mixing GPUs, or mixing dGPU & iGPU.

If the GPU list contains only one GPU, the split mode is effectively none.

So the parameters would change:
from main-gpu + split-mode
to gpu-list + split-mode

Member:

I still do not understand why it is necessary to add the function ggml_backend_sycl_set_single_device. I could understand using a function such as ggml_backend_sycl_get_device_index to translate device indices from llama.cpp to the device indices used by SYCL, but that should be done always, regardless of the split mode, in llama_default_buffer_type_offload and llama_new_context_with_model as I mentioned earlier.

I am also concerned that since this function seems to change the global state of the SYCL backend, it will prevent using multiple devices simultaneously by loading a different model on each device with a different llama_model instance, and doing inference on each device in a different thread simultaneously.

In the future, we will also support using different backends simultaneously, for example so that a system with a NVIDIA device and an Intel device, we will be able to use the CUDA and SYCL backends at the same time with split mode layer. Adding these backend-specific functions will complicate the implementation of this functionality.

Member:

When using multiple devices with split mode layer or row, it is possible to exclude some devices by using the -ts parameter to set the split of a device to zero. For example, with -ts 1,0,1 only devices 0 and 2 will be used.
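For example, on a hypothetical 3-GPU system (model path is illustrative):

    ./main -m model.gguf -ngl 33 -sm layer -ts 1,0,1   # use devices 0 and 2 only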

Collaborator Author:

The SYCL backend has two methods to get the GPU info:

  1. Automatically detect the most powerful GPUs. This is the default behavior in most cases, including the unit tests.
  2. Follow the main-gpu parameter from llama.cpp. ggml_backend_sycl_set_single_device is used in this case only.

It only impacts the GPU list in the SYCL backend as global state.
This happens before llama_default_buffer_type_offload, so it won't affect the subsequent model load process.


const int64_t n_layer = hparams.n_layer;
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);

@@ -12260,13 +12268,13 @@ struct llama_context * llama_new_context_with_model(
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_LAYER requires a backend for each GPU
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);

for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
int device_id = id_list[i];
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
llama_free(ctx);
return nullptr;
}
Expand Down