mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-29 08:41:22 +00:00
ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type (#15797)
* ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type
ggml-backend : add device id to device props
llama : only use iGPU devices if there are no GPU devices
llama : do not use multiple devices from different backends with the same device id
This commit is contained in:
@@ -132,6 +132,8 @@ extern "C" {
|
|||||||
GGML_BACKEND_DEVICE_TYPE_CPU,
|
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||||
// GPU device using dedicated memory
|
// GPU device using dedicated memory
|
||||||
GGML_BACKEND_DEVICE_TYPE_GPU,
|
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||||
|
// integrated GPU device using host memory
|
||||||
|
GGML_BACKEND_DEVICE_TYPE_IGPU,
|
||||||
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
|
||||||
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
GGML_BACKEND_DEVICE_TYPE_ACCEL
|
||||||
};
|
};
|
||||||
@@ -150,11 +152,21 @@ extern "C" {
|
|||||||
|
|
||||||
// all the device properties
|
// all the device properties
|
||||||
struct ggml_backend_dev_props {
|
struct ggml_backend_dev_props {
|
||||||
|
// device name
|
||||||
const char * name;
|
const char * name;
|
||||||
|
// device description
|
||||||
const char * description;
|
const char * description;
|
||||||
|
// device free memory in bytes
|
||||||
size_t memory_free;
|
size_t memory_free;
|
||||||
|
// device total memory in bytes
|
||||||
size_t memory_total;
|
size_t memory_total;
|
||||||
|
// device type
|
||||||
enum ggml_backend_dev_type type;
|
enum ggml_backend_dev_type type;
|
||||||
|
// device id
|
||||||
|
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
|
||||||
|
// if the id is unknown, this should be NULL
|
||||||
|
const char * device_id;
|
||||||
|
// device capabilities
|
||||||
struct ggml_backend_dev_caps caps;
|
struct ggml_backend_dev_caps caps;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GGML_BACKEND_API_VERSION 1
|
#define GGML_BACKEND_API_VERSION 2
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend buffer type
|
// Backend buffer type
|
||||||
|
|||||||
@@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
|
|||||||
|
|
||||||
ggml_backend_t ggml_backend_init_best(void) {
|
ggml_backend_t ggml_backend_init_best(void) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||||
if (!dev) {
|
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
|
||||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||||
}
|
|
||||||
if (!dev) {
|
if (!dev) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3210,6 +3210,7 @@ struct ggml_backend_cuda_device_context {
|
|||||||
int device;
|
int device;
|
||||||
std::string name;
|
std::string name;
|
||||||
std::string description;
|
std::string description;
|
||||||
|
std::string pci_bus_id;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||||
@@ -3234,9 +3235,12 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||||
|
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||||
|
|
||||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||||
|
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
|
||||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||||
|
|
||||||
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
|
||||||
@@ -3804,6 +3808,10 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
|||||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||||
dev_ctx->description = prop.name;
|
dev_ctx->description = prop.name;
|
||||||
|
|
||||||
|
char pci_bus_id[16] = {};
|
||||||
|
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||||
|
dev_ctx->pci_bus_id = pci_bus_id;
|
||||||
|
|
||||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||||
/* .reg = */ &reg,
|
/* .reg = */ &reg,
|
||||||
|
|||||||
@@ -12113,6 +12113,7 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
|
|||||||
|
|
||||||
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||||
UNUSED(dev);
|
UNUSED(dev);
|
||||||
|
// TODO: return GGML_BACKEND_DEVICE_TYPE_IGPU for integrated GPUs
|
||||||
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
return GGML_BACKEND_DEVICE_TYPE_GPU;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -12120,6 +12121,7 @@ static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml
|
|||||||
props->name = ggml_backend_vk_device_get_name(dev);
|
props->name = ggml_backend_vk_device_get_name(dev);
|
||||||
props->description = ggml_backend_vk_device_get_description(dev);
|
props->description = ggml_backend_vk_device_get_description(dev);
|
||||||
props->type = ggml_backend_vk_device_get_type(dev);
|
props->type = ggml_backend_vk_device_get_type(dev);
|
||||||
|
// TODO: set props->device_id to PCI bus id
|
||||||
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||||
props->caps = {
|
props->caps = {
|
||||||
/* .async = */ false,
|
/* .async = */ false,
|
||||||
|
|||||||
@@ -59,6 +59,7 @@ bool llama_supports_mlock(void) {
|
|||||||
|
|
||||||
bool llama_supports_gpu_offload(void) {
|
bool llama_supports_gpu_offload(void) {
|
||||||
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
|
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
|
||||||
|
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU) != nullptr ||
|
||||||
llama_supports_rpc();
|
llama_supports_rpc();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,8 +185,13 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||||||
model->devices.push_back(*dev);
|
model->devices.push_back(*dev);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// default device selection
|
||||||
|
|
||||||
|
// build list of available devices
|
||||||
|
std::vector<ggml_backend_dev_t> gpus;
|
||||||
|
std::vector<ggml_backend_dev_t> igpus;
|
||||||
std::vector<ggml_backend_dev_t> rpc_servers;
|
std::vector<ggml_backend_dev_t> rpc_servers;
|
||||||
// use all available devices
|
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||||
switch (ggml_backend_dev_type(dev)) {
|
switch (ggml_backend_dev_type(dev)) {
|
||||||
@@ -194,19 +200,51 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||||||
// skip CPU backends since they are handled separately
|
// skip CPU backends since they are handled separately
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
case GGML_BACKEND_DEVICE_TYPE_GPU: {
|
||||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||||
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
|
||||||
rpc_servers.push_back(dev);
|
rpc_servers.push_back(dev);
|
||||||
} else {
|
} else {
|
||||||
model->devices.push_back(dev);
|
// check if there is already a GPU with the same device id
|
||||||
|
ggml_backend_dev_props props;
|
||||||
|
ggml_backend_dev_get_props(dev, &props);
|
||||||
|
auto it = std::find_if(gpus.begin(), gpus.end(), [&props](ggml_backend_dev_t d) {
|
||||||
|
ggml_backend_dev_props d_props;
|
||||||
|
ggml_backend_dev_get_props(d, &d_props);
|
||||||
|
if (props.device_id && d_props.device_id) {
|
||||||
|
return strcmp(props.device_id, d_props.device_id) == 0;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
|
||||||
|
if (it != gpus.end()) {
|
||||||
|
LLAMA_LOG_INFO("%s: skipping device %s (%s) with id %s - already using device %s (%s) with the same id\n",
|
||||||
|
__func__,
|
||||||
|
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
||||||
|
props.device_id ? props.device_id : "unknown id",
|
||||||
|
ggml_backend_dev_name(*it), ggml_backend_dev_description(*it));
|
||||||
|
} else {
|
||||||
|
gpus.push_back(dev);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case GGML_BACKEND_DEVICE_TYPE_IGPU:
|
||||||
|
igpus.push_back(dev);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// add RPC servers at the front of the list
|
|
||||||
if (!rpc_servers.empty()) {
|
// add RPC servers at the front of the list to minimize network transfers
|
||||||
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
|
model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end());
|
||||||
|
|
||||||
|
// add GPUs
|
||||||
|
model->devices.insert(model->devices.end(), gpus.begin(), gpus.end());
|
||||||
|
|
||||||
|
// add integrated GPUs only if no other devices were found
|
||||||
|
if (model->devices.empty()) {
|
||||||
|
model->devices.insert(model->devices.end(), igpus.begin(), igpus.end());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -227,9 +265,12 @@ static struct llama_model * llama_model_load_from_file_impl(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (auto * dev : model->devices) {
|
for (auto * dev : model->devices) {
|
||||||
size_t free, total; // NOLINT
|
ggml_backend_dev_props props;
|
||||||
ggml_backend_dev_memory(dev, &free, &total);
|
ggml_backend_dev_get_props(dev, &props);
|
||||||
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
|
LLAMA_LOG_INFO("%s: using device %s (%s) (%s) - %zu MiB free\n", __func__,
|
||||||
|
ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
|
||||||
|
props.device_id ? props.device_id : "unknown id",
|
||||||
|
props.memory_free/1024/1024);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int status = llama_model_load(path_model, splits, *model, params);
|
const int status = llama_model_load(path_model, splits, *model, params);
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ static std::string get_gpu_info() {
|
|||||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||||
auto * dev = ggml_backend_dev_get(i);
|
auto * dev = ggml_backend_dev_get(i);
|
||||||
auto dev_type = ggml_backend_dev_type(dev);
|
auto dev_type = ggml_backend_dev_type(dev);
|
||||||
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
|
||||||
gpu_list.push_back(ggml_backend_dev_description(dev));
|
gpu_list.push_back(ggml_backend_dev_description(dev));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -945,6 +945,7 @@ struct cmd_params_instance {
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// FIXME: use llama.cpp device selection logic
|
||||||
// add local GPU devices if any
|
// add local GPU devices if any
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||||
@@ -957,6 +958,10 @@ struct cmd_params_instance {
|
|||||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||||
devices.push_back(dev);
|
devices.push_back(dev);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case GGML_BACKEND_DEVICE_TYPE_IGPU:
|
||||||
|
// iGPUs are not used when there are RPC servers
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
devices.push_back(nullptr);
|
devices.push_back(nullptr);
|
||||||
|
|||||||
Reference in New Issue
Block a user