Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-27 08:21:30 +00:00)
CANN: Fix precision issue on 310I DUO multi-devices (#15784)
@@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr
 ## Environment variable setup
 
-### GGML_CANN_ASYNC_MODE
-
-Enables asynchronous operator submission. Disabled by default.
 
 ### GGML_CANN_MEM_POOL
 
-Specifies the memory pool management strategy:
+Specifies the memory pool management strategy, Default is vmm.
 
 - vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
 
 - prio: Employs a priority queue-based memory pool management.
 
 - leg: Uses a fixed-size buffer pool.
 
 ### GGML_CANN_DISABLE_BUF_POOL_CLEAN
@@ -312,9 +309,8 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
 
 ### GGML_CANN_WEIGHT_NZ
 
-Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+Converting the matmul weight format from ND to NZ to improve performance. Enabled by default.
 
-### GGML_CANN_DISABLE_ACL_GRAPH
+### GGML_CANN_ACL_GRAPH
 
-When this variable is set, ACL graph execution is disabled and operators are executed in an op-by-op (eager) mode.
-This mode is mainly intended for debugging or for cases where the overhead of graph construction and execution is not desirable.
+Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
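The documentation hunks above flip several defaults: the memory pool strategy now defaults to vmm, NZ weight conversion is enabled by default, and the GGML_CANN_DISABLE_ACL_GRAPH switch is replaced by a default-on GGML_CANN_ACL_GRAPH. As a minimal sketch of how a pool strategy could be selected from GGML_CANN_MEM_POOL with the documented vmm default, consider the following; `cann_pool_kind` and `select_pool_kind` are illustrative names, not the backend's actual types.

```cpp
// Illustrative only: map GGML_CANN_MEM_POOL onto a pool strategy with the
// documented "vmm" default; cann_pool_kind/select_pool_kind are made-up names.
#include <cstdlib>
#include <string>

enum class cann_pool_kind { vmm, prio, leg };

static cann_pool_kind select_pool_kind() {
    const char * v = std::getenv("GGML_CANN_MEM_POOL");
    const std::string s = v ? v : "vmm";     // unset -> default to vmm
    if (s == "prio") return cann_pool_kind::prio;
    if (s == "leg")  return cann_pool_kind::leg;
    return cann_pool_kind::vmm;              // "vmm" or anything unrecognized
}
```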
@@ -1956,7 +1956,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     aclTensor* acl_weight_tensor;
 
     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (weight_to_nz && is_matmul_weight(weight)) {
         int64_t acl_stride[2] = {1, transpose_ne[1]};
 
@@ -420,7 +420,7 @@ struct ggml_backend_cann_context {
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
#ifdef USE_ACL_GRAPH
-        acl_graph_mode = !(parse_bool(get_env("GGML_CANN_DISABLE_ACL_GRAPH").value_or("")));
+        acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
         GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
                       __func__, device,
                       acl_graph_mode ? "GRAPH" : "EAGER",
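The code hunks switch GGML_CANN_WEIGHT_NZ and the renamed GGML_CANN_ACL_GRAPH to the same default-on pattern: `value_or("on")` supplies an enabled value when the variable is unset, and `parse_bool` only yields false for an explicit negative setting. The sketch below reproduces that pattern with stand-in helpers; the real `get_env`/`parse_bool` live in the CANN backend and may accept a different set of spellings.

```cpp
// Stand-in helpers demonstrating the default-on flag pattern; the backend's
// own get_env()/parse_bool() may recognize different spellings.
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <optional>
#include <string>

static std::optional<std::string> get_env(const char * name) {
    const char * v = std::getenv(name);
    return v ? std::optional<std::string>(v) : std::nullopt;
}

static bool parse_bool(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return (char) std::tolower(c); });
    return s == "on" || s == "1" || s == "true" || s == "yes";  // assumed spellings
}

int main() {
    // Unset -> value_or("on") -> enabled; an explicit "off"/"0" disables.
    const bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
    const bool acl_graph    = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
    return (weight_to_nz && acl_graph) ? 0 : 1;
}
```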
@@ -1196,7 +1196,7 @@ static void ggml_backend_cann_buffer_set_tensor(
     // Why aclrtSynchronizeDevice?
 
     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
@@ -1279,6 +1279,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
                                   ACL_MEMCPY_DEVICE_TO_DEVICE));
             return true;
         } else {
+#ifdef ASCEND_310P
+            // TODO: Support 310p P2P copy
+            return false;
+#endif
             // Different device but can access by peer.
             int32_t canAccessPeer = 0;
             ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
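On ASCEND_310P builds (the 310I DUO), the device-to-device branch now returns false before attempting a peer copy, so the caller takes the generic host-staged fallback instead; other devices keep the aclrtDeviceCanAccessPeer check. Below is a simplified sketch of that guarded shape; `try_p2p_copy` is an illustrative wrapper rather than the backend function, and the real code additionally enables peer access in both directions before issuing cross-device copies.

```cpp
// Simplified guard shape; assumes the CANN toolkit (acl/acl.h) is available.
#include <acl/acl.h>
#include <cstddef>
#include <cstdint>

static bool try_p2p_copy(int32_t src_dev, int32_t dst_dev,
                         void * dst, const void * src, size_t size) {
#ifdef ASCEND_310P
    // P2P copy is not supported on 310P yet: report failure so the caller
    // falls back to staging the copy through host memory.
    (void) src_dev; (void) dst_dev; (void) dst; (void) src; (void) size;
    return false;
#else
    int32_t can_access_peer = 0;
    if (aclrtDeviceCanAccessPeer(&can_access_peer, src_dev, dst_dev) != ACL_SUCCESS ||
        !can_access_peer) {
        return false;  // no peer access: let the caller take the fallback path
    }
    return aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_DEVICE) == ACL_SUCCESS;
#endif
}
```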
@@ -1439,7 +1443,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     int64_t ne0 = tensor->ne[0];
 
     // Only check env once.
-    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
 
     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.
@@ -2000,6 +2004,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
     GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
                 ggml_backend_is_cann(backend_dst));
 
+    GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
+
     if (!ggml_backend_buffer_is_cann(src->buffer) ||
         !ggml_backend_buffer_is_cann(dst->buffer)) {
         return false;
@@ -2020,6 +2026,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
         return true;
     }
     if (backend_src != backend_dst) {
+#ifdef ASCEND_310P
+        // TODO: Support 310p P2P copy
+        return false;
+#endif
         ggml_backend_cann_buffer_context* buf_ctx_src =
             (ggml_backend_cann_buffer_context*)buf_src->context;
         ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -2036,7 +2046,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
         }
 
         // need open both directions for memcpyasync between devices.
-        ggml_cann_set_device(cann_ctx_dst->device);
         ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
         ggml_cann_set_device(cann_ctx_src->device);
         ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
@@ -2047,8 +2056,15 @@ static bool ggml_backend_cann_cpy_tensor_async(
                                        ACL_MEMCPY_DEVICE_TO_DEVICE,
                                        cann_ctx_src->stream()));
 
-        //TODO: workaround for Event didn`t work here.
-        aclrtSynchronizeStream(cann_ctx_src->stream());
+        // record event on src stream after the copy
+        if (!cann_ctx_src->copy_event) {
+            ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
+        }
+        ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
+
+        // wait on dst stream for the copy to complete
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
     } else {
         // src and dst are on the same backend
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
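The final hunk replaces the host-blocking aclrtSynchronizeStream workaround with device-side ordering: an event is recorded on the source stream right after the async copy, and the destination stream waits on that event before any later work can touch the destination tensor. A self-contained sketch of the record/wait pattern follows; it assumes the CANN toolkit headers and two already-created streams on two devices, and it creates a fresh event per call, whereas the real code caches `copy_event` in the backend context and reuses it.

```cpp
// Sketch of cross-stream synchronization after an async device-to-device copy.
#include <acl/acl.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

static void check(aclError err, const char * what) {
    if (err != ACL_SUCCESS) {
        std::fprintf(stderr, "%s failed: %d\n", what, err);
        std::exit(1);
    }
}

void copy_and_sync(void * dst, const void * src, size_t size,
                   int32_t src_dev, int32_t dst_dev,
                   aclrtStream src_stream, aclrtStream dst_stream) {
    // enqueue the copy on the source device's stream
    check(aclrtSetDevice(src_dev), "aclrtSetDevice(src)");
    check(aclrtMemcpyAsync(dst, size, src, size,
                           ACL_MEMCPY_DEVICE_TO_DEVICE, src_stream),
          "aclrtMemcpyAsync");

    // record an event on the source stream right after the copy
    aclrtEvent copy_event = nullptr;
    check(aclrtCreateEventWithFlag(&copy_event, ACL_EVENT_SYNC),
          "aclrtCreateEventWithFlag");
    check(aclrtRecordEvent(copy_event, src_stream), "aclrtRecordEvent");

    // make the destination stream wait for the copy instead of blocking the
    // host with aclrtSynchronizeStream(), as the old workaround did
    // (the real backend keeps this event in its context and reuses it)
    check(aclrtSetDevice(dst_dev), "aclrtSetDevice(dst)");
    check(aclrtStreamWaitEvent(dst_stream, copy_event), "aclrtStreamWaitEvent");
}
```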