mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	llama : ggml-backend integration (#4766)
* llama : ggml-backend integration
* ggml-backend : add names to buffers
* fix unmap after loading
* batched-bench : add tensor_split param
* llama : check for null tensor_split
* ggml-backend : increase GGML_MAX_BACKENDS
* improve graph splitting, partial fix for --no-kv-offload
* cuda : add ggml-backend split buffer support
* cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available)
* ggml : fix null backend dereference (#4807)
* ggml : fix null backend dereference
* ggml : also check ggml_backend_is_cpu
* test-backend-ops : check buffer allocation failures
* llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row)
* ggml : fix mul_mat_id work size
* llama : rewrite session kv load/set without graphs
* minor
* llama : only initialize used backends, free backends on context free
* llama : abort ctx if cuda backend init fails
* llama : rewrite lora with ggml-backend and compute on CPU ggml-ci
* llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer
* opencl : add ggml-backend buffer type
* cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf)
* llama : on Metal, by default offload the full model ggml-ci
* metal : page align the data ptr (#4854)
* Apply suggestions from code review Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* cuda : fix split buffer free
* address review comments
* llama-bench : add split-mode parameter
* fix whitespace
* opencl : fix double initialization
* server : add --split-mode parameter
* use async copy and compute to improve multi-gpu performance ggml-ci
* use async memcpys to copy the graph outputs to the CPU
* fix opencl
* use a host buffer for the cpu compute buffer for faster copies to the gpu
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
		| @@ -16,9 +16,10 @@ extern "C" { | ||||
|     typedef void * ggml_backend_buffer_type_context_t; | ||||
|  | ||||
|     struct ggml_backend_buffer_type_i { | ||||
|         const char *          (*get_name)        (ggml_backend_buffer_type_t buft); | ||||
|         ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size); | ||||
|         size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment | ||||
|         size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding | ||||
|         size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding | ||||
|         bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend | ||||
|         // check if tensor data is in host memory | ||||
|         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) | ||||
| @@ -34,16 +35,15 @@ extern "C" { | ||||
|     typedef void * ggml_backend_buffer_context_t; | ||||
|  | ||||
|     struct ggml_backend_buffer_i { | ||||
|         void   (*free_buffer)    (ggml_backend_buffer_t buffer); | ||||
|         //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras | ||||
|         void * (*get_base)       (ggml_backend_buffer_t buffer); | ||||
|         void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); | ||||
|         void   (*set_tensor)     (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); | ||||
|         void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size); | ||||
|         // (optional) copy tensor between different buffer-type, allow for single-copy tranfers | ||||
|         void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); | ||||
|         void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); | ||||
|         void   (*clear)          (ggml_backend_buffer_t buffer, uint8_t value); | ||||
|         const char * (*get_name)   (ggml_backend_buffer_t buffer); | ||||
|         void         (*free_buffer)(ggml_backend_buffer_t buffer); | ||||
|         void *       (*get_base)   (ggml_backend_buffer_t buffer); | ||||
|         void         (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); | ||||
|         void         (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); | ||||
|         void         (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size); | ||||
|         bool         (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer | ||||
|         void         (*clear)      (ggml_backend_buffer_t buffer, uint8_t value); | ||||
|         void         (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras | ||||
|     }; | ||||
|  | ||||
|     struct ggml_backend_buffer { | ||||
| @@ -51,6 +51,7 @@ extern "C" { | ||||
|         ggml_backend_buffer_type_t    buft; | ||||
|         ggml_backend_buffer_context_t context; | ||||
|         size_t size; | ||||
|         enum ggml_backend_buffer_usage usage; | ||||
|     }; | ||||
|  | ||||
|     ggml_backend_buffer_t ggml_backend_buffer_init( | ||||
| @@ -59,6 +60,8 @@ extern "C" { | ||||
|                    ggml_backend_buffer_context_t   context, | ||||
|                    size_t                          size); | ||||
|  | ||||
|     // do not use directly, use ggml_backend_tensor_copy instead | ||||
|     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); | ||||
|  | ||||
|     // | ||||
|     // Backend | ||||
| @@ -74,22 +77,20 @@ extern "C" { | ||||
|         // buffer allocation | ||||
|         ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); | ||||
|  | ||||
|         // (optional) asynchroneous tensor data access | ||||
|         // (optional) asynchronous tensor data access | ||||
|         void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); | ||||
|         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size); | ||||
|         bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); | ||||
|  | ||||
|         // (optional) asynchroneous tensor copy | ||||
|         void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); | ||||
|         void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); | ||||
|  | ||||
|         // (optional) complete all pending operations | ||||
|         void (*synchronize)(ggml_backend_t backend); | ||||
|  | ||||
|         // compute graph with a plan | ||||
|         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); | ||||
|         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); | ||||
|         void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan); | ||||
|         void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); | ||||
|  | ||||
|         // compute graph without a plan | ||||
|         // compute graph without a plan (async) | ||||
|         bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); | ||||
|  | ||||
|         // check if the backend supports an operation | ||||
| @@ -102,7 +103,6 @@ extern "C" { | ||||
|         ggml_backend_context_t context; | ||||
|     }; | ||||
|  | ||||
|  | ||||
|     // | ||||
|     // Backend registry | ||||
|     // | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 slaren
					slaren