diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp
index 853495b00d..e5158fb506 100644
--- a/tests/test-thread-safety.cpp
+++ b/tests/test-thread-safety.cpp
@@ -3,6 +3,7 @@
 // - Creates n_parallel (--parallel) contexts per model
 // - Runs inference in parallel on each context
 
+#include <array>
 #include <thread>
 #include <vector>
 #include <atomic>
@@ -38,13 +39,14 @@ int main(int argc, char ** argv) {
     cparams.n_seq_max = 1;
 
     int dev_count = ggml_backend_dev_count();
-    int gpu_dev_count = 0;
+    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
     for (int i = 0; i < dev_count; ++i) {
         auto * dev = ggml_backend_dev_get(i);
         if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_dev_count++;
+            gpus.push_back({dev, nullptr});
         }
     }
+    const int gpu_dev_count = (int)gpus.size();
     const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
     //const int num_models = std::max(1, gpu_dev_count);
     const int num_contexts = std::max(1, params.n_parallel);
@@ -58,12 +60,12 @@ int main(int argc, char ** argv) {
 
         if (m < gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
-            mparams.main_gpu = m;
+            mparams.devices = gpus[m].data();
         } else if (m == gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
             mparams.main_gpu = -1; // CPU model
         } else {
-            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
+            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
         }
 
         llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);