 // - Creates n_parallel (--parallel) contexts per model
 // - Runs inference in parallel on each context

+#include <array>
 #include <thread>
 #include <vector>
 #include <atomic>
@@ -38,13 +39,14 @@ int main(int argc, char ** argv) {
     cparams.n_seq_max = 1;

     int dev_count = ggml_backend_dev_count();
-    int gpu_dev_count = 0;
+    std::vector<std::array<ggml_backend_dev_t, 2>> gpus; // one {device, nullptr} entry per GPU: a null-terminated list usable as mparams.devices
     for (int i = 0; i < dev_count; ++i) {
         auto * dev = ggml_backend_dev_get(i);
         if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_dev_count++;
+            gpus.push_back({dev, nullptr});
         }
     }
+    const int gpu_dev_count = (int) gpus.size();
     const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
     // const int num_models = std::max(1, gpu_dev_count);
     const int num_contexts = std::max(1, params.n_parallel);
@@ -58,12 +60,12 @@ int main(int argc, char ** argv) {

         if (m < gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
-            mparams.main_gpu = m;
+            mparams.devices = gpus[m].data(); // pin this model to a single GPU via its null-terminated device list
         } else if (m == gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
             mparams.main_gpu = -1; // CPU model
         } else {
-            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
+            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
         }

         llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
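For orientation, a minimal sketch (not part of the diff) of how the loaded models could then be driven as the header comment describes: num_contexts (--parallel) contexts per model, each running on its own thread, which is what the added <thread>/<atomic> includes are for. The models vector, the workers/n_failures names, and the decode placeholder are illustrative assumptions; llama_init_from_model() and llama_free() are existing llama.cpp API calls, but verify the current headers for exact signatures.

    // assumes `models` holds the llama_model* loaded above and `cparams`/`num_contexts` are in scope
    std::atomic<int> n_failures{0};
    std::vector<std::thread> workers;
    for (llama_model * model : models) {
        for (int c = 0; c < num_contexts; ++c) {
            workers.emplace_back([&, model]() {
                llama_context * ctx = llama_init_from_model(model, cparams);
                if (ctx == nullptr) {
                    n_failures++; // count contexts that failed to initialize
                    return;
                }
                // ... run the llama_decode() workload for this context here ...
                llama_free(ctx);
            });
        }
    }
    for (auto & w : workers) {
        w.join();
    }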