mirror of https://github.com/ggml-org/llama.cpp.git
ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)
* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
@@ -31,6 +31,7 @@ GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens
 
 GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
 GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
 
 GGML_API void ggml_cuda_set_main_device(int main_device);
 GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
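
For context, a minimal sketch of how the new ggml_cuda_copy_to_device entry point might be used: the host writes fresh values into a tensor's data (for example the per-position shift values involved in the rope-shift fix), then pushes them explicitly to the GPU before the graph is evaluated. The function and tensor names below are hypothetical illustrations, not code from this commit.

// Hypothetical usage sketch (not from this commit): fill a host-backed
// I32 tensor, then mirror it to the device with the new API.
#include <stdint.h>
#include "ggml.h"
#include "ggml-cuda.h"

static void set_rope_shift(struct ggml_tensor * K_shift, int n_ctx, int32_t shift) {
    // write the per-position shift values on the host
    int32_t * data = (int32_t *) K_shift->data;
    for (int i = 0; i < n_ctx; ++i) {
        data[i] = shift;
    }
    // new in this diff: copy the tensor's data from host to device so
    // the CUDA rope kernel reads the updated values
    ggml_cuda_copy_to_device(K_shift);
}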