sycl: use async memory allocation to fix crashes during graph recording (#16644)
* sycl: use async memory allocation to fix graph recording failures
GGML_SYCL_DISABLE_GRAPHS=0 causes crashes because:
- Host waits are currently unsupported in graph recording mode.
- SYCL malloc / free calls are unsupported in graph recording mode.
The following changes are made to fix SYCL graph functionality:
- When graphs are enabled, use the SYCL async memory extension for temp
buffers, which is supported with SYCL graphs.
- For compiler versions that do not support this extension, skip
graphs for the affected op.
- Switch from USM shared to device memory, as the async extension
currently only supports device allocations.
* Address reviewer feedback
* Use global async variable to decide path in sycl_ext_[malloc_device|free]
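For illustration only (not part of the patch): the core pattern of the change is to route temporary device allocations through the oneAPI async memory allocation extension when it is available, so the allocation and free become queue operations that can be recorded into a SYCL graph, and to fall back to ordinary USM allocations otherwise. A minimal standalone sketch of that idea follows; the helper names, buffer size, and per-device aspect check are hypothetical simplifications of the patch's sycl_ext_malloc_device / sycl_ext_free helpers and its global g_ggml_sycl_use_async_mem_op flag.

// Sketch, assuming a DPC++ toolchain that provides the
// sycl_ext_oneapi_async_memory_alloc extension; helper names are illustrative.
#include <sycl/sycl.hpp>
#if defined(SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC)
#    include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
namespace syclex = sycl::ext::oneapi::experimental;
#endif

// Allocate a temporary device buffer, preferring the async extension so the
// allocation is a queue operation (and therefore recordable into a SYCL graph).
static void * alloc_temp_device(sycl::queue & q, size_t size, bool use_async) {
#if defined(SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC)
    if (use_async) {
        return syclex::async_malloc(q, sycl::usm::alloc::device, size);
    }
#else
    (void) use_async;  // extension not compiled in: always take the sync path
#endif
    return sycl::malloc_device(size, q);  // plain USM device allocation
}

static void free_temp_device(sycl::queue & q, void * ptr, bool use_async) {
#if defined(SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC)
    if (use_async) {
        syclex::async_free(q, ptr);  // freed in queue order, no host wait needed
        return;
    }
#else
    (void) use_async;
#endif
    sycl::free(ptr, q);
}

int main() {
    sycl::queue q;
    bool use_async = false;
#if defined(SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC)
    // Only take the async path when the device actually reports the aspect.
    use_async = q.get_device().has(sycl::aspect::ext_oneapi_async_memory_alloc);
#endif
    void * buf = alloc_temp_device(q, 4096, use_async);
    // ... enqueue memcpys / kernels that use buf here ...
    free_temp_device(q, buf, use_async);
    q.wait();
}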
@@ -30,6 +30,9 @@
 #include <regex>
 
 #include <sycl/sycl.hpp>
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+#    include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
+#endif
 #include <sycl/half_type.hpp>
 
 #include "ggml-sycl.h"
@@ -54,6 +57,7 @@ int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
 int g_ggml_sycl_disable_dnn = 0;
 int g_ggml_sycl_prioritize_dmmv = 0;
+int g_ggml_sycl_use_async_mem_op = 0;
 
 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};
@@ -237,7 +241,20 @@ static void ggml_check_sycl() try {
     fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */
 
+        // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be
+        // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in
+        // other places.
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+        g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph;
+        if (g_ggml_sycl_use_async_mem_op) {
+            for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) {
+                if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) {
+                    g_ggml_sycl_use_async_mem_op = 0;
+                    break;
+                }
+            }
+        }
+#endif
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                             dpct::dev_mgr::instance().device_count()) != 0) {
             initialized = true;
@@ -3031,19 +3048,51 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
+// Helper functions to unify device memory allocation for both async and sync paths
+static inline void * sycl_ext_malloc_device(dpct::queue_ptr stream, size_t size) {
+    bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        return syclex::async_malloc(*stream, sycl::usm::alloc::device, size);
+    }
+#else
+    // If async allocation extension is not available, use_async should always be false.
+    GGML_ASSERT(!use_async);
+#endif
+    return sycl::malloc(size, *stream, sycl::usm::alloc::device);
+}
+
+static inline void sycl_ext_free(dpct::queue_ptr stream, void * ptr) {
+    bool use_async = g_ggml_sycl_use_async_mem_op;
+#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
+    if (use_async) {
+        syclex::async_free(*stream, ptr);
+        return;
+    }
+#else
+    // If async allocation extension is not available, use_async should always be false.
+    GGML_ASSERT(!use_async);
+#endif
+    sycl::free(ptr, *stream);
+}
+
 static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
                             dpct::queue_ptr stream) {
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(
-        CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
-            .wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
+
     GGML_ASSERT((size % sizeof(block_q4_0) == 0));
     GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
     int offset_blks = offset / sizeof(block_q4_0);
     auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
     auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
 
-    stream->parallel_for(
+    auto reorder_event = stream->parallel_for(
         size / sizeof(block_q4_0),
         [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             const block_q4_0* x = (const block_q4_0*)tmp_buf;
@@ -3054,9 +3103,11 @@ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nr
                 *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
             }
             *(d_ptr + ib) = x[ib].d;
-        }).wait_and_throw();
+        });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
 
-    sycl::free(tmp_buf, *stream);
+    sycl_ext_free(stream, tmp_buf);
 }
 
 static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3065,14 +3116,19 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
 
     const int nblocks = size / sizeof(block_q4_K);
 
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
 
     auto * qs_ptr = data_device;
     auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
     auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
 
-    stream->parallel_for(nblocks, [=](auto i) {
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
         const block_q4_K * x = (const block_q4_K *) tmp_buf;
         const int ib = i;
 
@@ -3085,9 +3141,11 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
         }
 
         dm_ptr[ib] = x[ib].dm;
-    }).wait_and_throw();
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
 
-    sycl::free(tmp_buf, *stream);
+    sycl_ext_free(stream, tmp_buf);
 }
 
 static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
@@ -3096,42 +3154,46 @@ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, d
 
     const int nblocks = size / sizeof(block_q6_K);
 
-    auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
-    SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
+    uint8_t * tmp_buf = static_cast<uint8_t *>(sycl_ext_malloc_device(stream, size));
+
+    sycl::event copy_event;
+    SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, size)));
+    if (!g_ggml_sycl_use_async_mem_op) {
+        copy_event.wait();
+    }
 
     auto * ql_ptr = data_device;
     auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
     auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
     sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
 
-    stream
-        ->parallel_for(nblocks,
-                       [=](auto i) {
-                           const block_q6_K * x = (const block_q6_K *) tmp_buf;
-                           const int ib = i;
+    auto reorder_event = stream->parallel_for(nblocks, [=](auto i) {
+        const block_q6_K * x = (const block_q6_K *) tmp_buf;
+        const int ib = i;
 
         const uint8_t * ql = x[ib].ql;
         const uint8_t * qh = x[ib].qh;
         uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
         uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
         uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
 
         for (int j = 0; j < QK_K / 2; ++j) {
             base_ql_ptr[j] = ql[j];
         }
         for (int j = 0; j < QK_K / 4; ++j) {
             base_qh_ptr[j] = qh[j];
         }
 
         for (int j = 0; j < QK_K / 16; ++j) {
             base_scales_ptr[j] = x[ib].scales[j];
         }
 
         dm_ptr[ib] = x[ib].d;
-    })
-        .wait_and_throw();
+    });
+    if (!g_ggml_sycl_use_async_mem_op) {
+        reorder_event.wait_and_throw();
+    }
 
-    sycl::free(tmp_buf, *stream);
+    sycl_ext_free(stream, tmp_buf);
 }
 
 static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
@@ -4056,6 +4118,18 @@ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
                 GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
                               ggml_op_name(node_op));
                 return false;
+            case GGML_OP_MUL_MAT:
+                // We cannot use graphs with ggml_sycl_mul_mat() when SYCL async memory allocation extensions are not available,
+                // as SYCL malloc / free and host wait calls are not supported when recording to a graph which are all present
+                // in reordering.
+                if (!g_ggml_sycl_use_async_mem_op) {
+                    GGML_LOG_INFO(
+                        "%s: disabling SYCL graphs due to unsupported node type when using a compiler without the "
+                        "oneAPI async memory allocation extension "
+                        "%s\n",
+                        __func__, ggml_op_name(node_op));
+                    return false;
+                }
         }
     }
     return true;
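As a rough illustration of why the GGML_OP_MUL_MAT case above is needed: while a queue is being recorded into a SYCL graph, only queue-ordered operations can be captured, so the reorder helpers' old sycl::malloc_shared / host .wait() / sycl::free sequence cannot run during recording, whereas async_malloc / async_free can. The sketch below is not from the patch; it assumes the oneAPI experimental graph and async-alloc extensions (header paths and entry points may vary across compiler versions), and the kernel and sizes are placeholders.

// Sketch: record allocation + work + free into a SYCL graph (assumed APIs from
// the sycl_ext_oneapi_graph and async_alloc extensions; not part of the patch).
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

int main() {
    sycl::queue q{sycl::property::queue::in_order{}};

    syclex::command_graph<syclex::graph_state::modifiable> graph{q.get_context(), q.get_device()};
    graph.begin_recording(q);

    // Everything between begin_recording() and end_recording() must be a queue
    // operation. A host-side wait() or a sycl::malloc / sycl::free here is what
    // used to break the reorder path; the async variants are recordable.
    void * tmp = syclex::async_malloc(q, sycl::usm::alloc::device, 1024 * sizeof(int));
    q.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> i) {
        static_cast<int *>(tmp)[i[0]] = static_cast<int>(i[0]);  // placeholder work
    });
    syclex::async_free(q, tmp);

    graph.end_recording();

    auto exec = graph.finalize();   // build the executable graph once
    q.ext_oneapi_graph(exec);       // replay the recorded commands
    q.wait();                       // host waits are fine outside recording
}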