From d8eaa26e4d9228df3aa46a930db60c8eaab67c1b Mon Sep 17 00:00:00 2001
From: Acly
Date: Wed, 22 Oct 2025 12:01:22 +0200
Subject: [PATCH] tests : fix test-thread-safety when compiling with multiple backends (#16699)

* run one test per backend/device (even if it's the same device)
---
 tests/test-thread-safety.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp
index 853495b00d..e5158fb506 100644
--- a/tests/test-thread-safety.cpp
+++ b/tests/test-thread-safety.cpp
@@ -3,6 +3,7 @@
 // - Creates n_parallel (--parallel) contexts per model
 // - Runs inference in parallel on each context
 
+#include <array>
 #include <thread>
 #include <vector>
 #include <atomic>
@@ -38,13 +39,14 @@ int main(int argc, char ** argv) {
     cparams.n_seq_max = 1;
 
     int dev_count = ggml_backend_dev_count();
-    int gpu_dev_count = 0;
+    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
     for (int i = 0; i < dev_count; ++i) {
         auto * dev = ggml_backend_dev_get(i);
         if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-            gpu_dev_count++;
+            gpus.push_back({dev, nullptr});
         }
     }
+    const int gpu_dev_count = (int)gpus.size();
     const int num_models = gpu_dev_count + 1 + 1; // GPUs + 1 CPU model + 1 layer split
     //const int num_models = std::max(1, gpu_dev_count);
     const int num_contexts = std::max(1, params.n_parallel);
@@ -58,12 +60,12 @@ int main(int argc, char ** argv) {
 
         if (m < gpu_dev_count) {
             mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
-            mparams.main_gpu = m;
+            mparams.devices = gpus[m].data();
         } else if (m == gpu_dev_count) {
            mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
             mparams.main_gpu = -1; // CPU model
         } else {
-            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;;
+            mparams.split_mode = LLAMA_SPLIT_MODE_LAYER;
         }
 
         llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
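
Note (not part of the patch): a minimal sketch of the device-selection pattern the fix relies on, assuming the public llama.cpp/ggml headers; the helper name collect_gpu_devices is illustrative only. Each GPU is kept as a two-element {device, nullptr} array so that .data() yields the NULL-terminated list that llama_model_params::devices expects, which lets the test pin one model per backend device instead of relying on main_gpu indices.

// Sketch only: pinning a model to a single backend device via
// llama_model_params::devices. collect_gpu_devices is a hypothetical helper.
#include <array>
#include <vector>
#include "llama.h" // pulls in the ggml backend device API

static std::vector<std::array<ggml_backend_dev_t, 2>> collect_gpu_devices() {
    std::vector<std::array<ggml_backend_dev_t, 2>> gpus;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            // the trailing nullptr terminates the per-device list
            gpus.push_back({dev, nullptr});
        }
    }
    return gpus;
}

// Usage sketch: load one model per GPU device m.
//   auto gpus = collect_gpu_devices();
//   llama_model_params mparams = llama_model_default_params();
//   mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
//   mparams.devices    = gpus[m].data(); // NULL-terminated, single device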