mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-29 08:41:22 +00:00
cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
This commit is contained in:
26
llama_util.h
26
llama_util.h
@@ -405,4 +405,30 @@ struct llama_buffer {
|
||||
delete[] addr;
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
struct llama_ctx_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
void resize(size_t size) {
|
||||
if (addr) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~llama_ctx_buffer() {
|
||||
if (addr) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
}
|
||||
};
|
||||
#else
|
||||
// When GGML_USE_CUBLAS is not defined, context buffers are plain llama_buffer objects.
using llama_ctx_buffer = llama_buffer;
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user