Mirror of https://github.com/ggml-org/llama.cpp.git
	move BLAS to a separate backend (#6210)
* move BLAS to a separate backend
* rename GGML_USE_OPENBLAS to GGML_USE_BLAS
* alloc : reuse the same buffer when the same buffer type is used multiple times
* set the number of threads automatically for openblas and blis
* sched : print assignments when the GGML_SCHED_DEBUG env variable is set
* sched : allow ops with weights on an incompatible buffer type

This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment (see the sketch below).

Author: slaren
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
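The last scheduling change is worth spelling out: when an op's weight lives in a buffer type that the chosen backend cannot use, the scheduler now falls back to copying the weight into a compatible buffer before running the op. The following is a minimal standalone sketch of that fallback, using hypothetical stand-in types rather than the real ggml scheduler code:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct backend { const char * name; } backend;
    typedef struct tensor  { const char * name; int buft_id; } tensor;

    /* stand-in for a supports_buft check: can this backend read tensors
       allocated in the given buffer type? (pretend only type 0 is compatible) */
    static bool supports_buft(const backend * b, int buft_id) {
        (void) b;
        return buft_id == 0;
    }

    /* the costly fallback path this commit allows: copy the weight into a
       buffer the backend can actually use before running the op */
    static void copy_to_compatible_buffer(const backend * b, tensor * t) {
        printf("copying weight %s to a buffer usable by %s (slow path)\n", t->name, b->name);
        t->buft_id = 0;
    }

    static void run_op(const backend * b, tensor * weight) {
        if (!supports_buft(b, weight->buft_id)) {
            copy_to_compatible_buffer(b, weight); /* very costly if it happens on every eval */
        }
        printf("running op with weight %s on %s\n", weight->name, b->name);
    }

    int main(void) {
        backend blas = { "BLAS" };
        tensor  w    = { "blk.0.ffn_up.weight", 1 }; /* weight in an incompatible buffer type */
        run_op(&blas, &w);
        return 0;
    }

This is why the commit message calls the fallback costly: the copy happens at compute time, instead of the weight simply being allocated in the right buffer type up front.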
@@ -17,13 +17,15 @@ extern "C" {
 
     struct ggml_backend_buffer_type_i {
         const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
     };
 
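To make the vtable pattern in this interface concrete, here is a self-contained sketch of a host (CPU-memory) buffer type implemented against a simplified version of the struct above. The types and functions are stand-ins for illustration, not the real ggml_backend_buffer_type_t or ggml API:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct buffer_type buffer_type; /* stand-in for ggml_backend_buffer_type_t */

    struct buffer_type_i {
        const char * (*get_name)     (const buffer_type * buft);
        size_t       (*get_alignment)(const buffer_type * buft); /* tensor alignment */
        size_t       (*get_max_size) (const buffer_type * buft); /* max allocatable buffer size */
        bool         (*is_host)      (const buffer_type * buft); /* data lives in host memory */
    };

    struct buffer_type { struct buffer_type_i iface; };

    /* a plain host-memory buffer type: malloc-backed, so is_host is true */
    static const char * host_get_name     (const buffer_type * buft) { (void) buft; return "host"; }
    static size_t       host_get_alignment(const buffer_type * buft) { (void) buft; return 64; }
    static size_t       host_get_max_size (const buffer_type * buft) { (void) buft; return SIZE_MAX; }
    static bool         host_is_host      (const buffer_type * buft) { (void) buft; return true; }

    int main(void) {
        buffer_type host = { { host_get_name, host_get_alignment, host_get_max_size, host_is_host } };
        printf("%s: alignment=%zu, is_host=%d\n",
               host.iface.get_name(&host), host.iface.get_alignment(&host), host.iface.is_host(&host));
        return 0;
    }

Note how the diff replaces the old per-buffer-type supports_backend hook with an is_host flag here and a supports_buft hook on the backend side (next hunk): the backend, not the buffer type, now decides what memory it can read.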
@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
+        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
-        // compute graph with a plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
         void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        // wait for an event on a different backend instance
         void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
 
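The event hooks at the end of the interface exist so that one backend can wait on work queued on another without blocking the host thread in between. A rough sketch of the intended record/wait pattern, again with hypothetical stand-in types rather than the real ggml event API:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct backend { const char * name; } backend;
    typedef struct event   { const char * owner; bool recorded; } event;

    static event event_new   (const backend * b)                  { return (event){ b->name, false }; }
    static void  event_record(event * e)                          { e->recorded = true; printf("[%s] event recorded\n", e->owner); }
    static void  event_wait  (const backend * b, const event * e) { printf("[%s] waiting on event from %s\n", b->name, e->owner); }

    int main(void) {
        backend gpu = { "gpu" }, cpu = { "cpu" };

        event done = event_new(&gpu); /* created on the backend that will record it */
        /* ... enqueue async work on gpu that produces a tensor ... */
        event_record(&done);          /* marks this point in gpu's work queue */
        event_wait(&cpu, &done);      /* cpu does not start the consumer until gpu reaches the mark */
        /* ... enqueue work on cpu that consumes the tensor ... */
        return 0;
    }

In the real interface the wait is enqueued on the waiting backend's stream (it is asynchronous); only event_synchronize blocks the calling thread until the event has actually been recorded.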