mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	 1607a5e5b0
			
		
	
	1607a5e5b0
	
	
	
		
			
			* backend-cpu: add online flow for aarch64 Q4_0 GEMV/GEMM kernels --------- Co-authored-by: Diego Devesa <slarengh@gmail.com>
		
			
				
	
	
		
			178 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			178 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #pragma once
 | |
| 
 | |
| #include "ggml.h"
 | |
| #include "ggml-backend.h"
 | |
| 
 | |
| #ifdef  __cplusplus
 | |
| extern "C" {
 | |
| #endif
 | |
| 
 | |
// Scheduling priorities
// Used as the type of ggml_threadpool_params.prio.
// The values are spelled out explicitly (they are the same values the
// enumerators would get implicitly, 0..3 in declaration order) so that
// inserting or reordering entries cannot silently renumber them.
enum ggml_sched_priority {
    GGML_SCHED_PRIO_NORMAL   = 0,
    GGML_SCHED_PRIO_MEDIUM   = 1,
    GGML_SCHED_PRIO_HIGH     = 2,
    GGML_SCHED_PRIO_REALTIME = 3
};
| 
 | |
|     // Threadpool params
 | |
|     // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
 | |
|     struct ggml_threadpool_params {
 | |
|         bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
 | |
|         int                 n_threads;                   // number of threads
 | |
|         enum ggml_sched_priority prio;                   // thread priority
 | |
|         uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
 | |
|         bool                strict_cpu;                  // strict cpu placement
 | |
|         bool                paused;                      // start in paused state
 | |
|     };
 | |
| 
 | |
// Opaque threadpool type: forward declaration only (see ggml.c), so code that
// includes this header can refer to it through pointers but not access members.
struct ggml_threadpool;

// Pointer alias for a threadpool handle.
typedef struct ggml_threadpool * ggml_threadpool_t;
| 
 | |
|     // the compute plan that needs to be prepared for ggml_graph_compute()
 | |
|     // since https://github.com/ggerganov/ggml/issues/287
 | |
|     struct ggml_cplan {
 | |
|         size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
 | |
|         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 | |
| 
 | |
|         int n_threads;
 | |
|         struct ggml_threadpool * threadpool;
 | |
| 
 | |
|         // abort ggml_graph_compute when true
 | |
|         ggml_abort_callback abort_callback;
 | |
|         void *              abort_callback_data;
 | |
|     };
 | |
| 
 | |
// numa strategies
// Argument type of ggml_numa_init().
enum ggml_numa_strategy {
    GGML_NUMA_STRATEGY_DISABLED   = 0,
    GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
    GGML_NUMA_STRATEGY_ISOLATE    = 2,
    GGML_NUMA_STRATEGY_NUMACTL    = 3,
    GGML_NUMA_STRATEGY_MIRROR     = 4,
    GGML_NUMA_STRATEGY_COUNT // no explicit value: one past the last strategy (currently 5); keep it last
};
| 
 | |
|     GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
 | |
|     GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 | |
| 
 | |
|     GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 | |
|     GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
 | |
| 
 | |
|     GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
 | |
|     GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 | |
| 
 | |
|     GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
 | |
|     GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 | |
| 
 | |
|     GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
 | |
|     GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
 | |
| 
 | |
|     GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
 | |
|     GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 | |
| 
 | |
|     GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
 | |
|     GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 | |
| 
 | |
|     GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
 | |
|     GGML_BACKEND_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
 | |
|     GGML_BACKEND_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
 | |
|     GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
 | |
|     GGML_BACKEND_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
 | |
|     GGML_BACKEND_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
 | |
|     GGML_BACKEND_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
 | |
|     GGML_BACKEND_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
 | |
| 
 | |
|     // ggml_graph_plan() has to be called before ggml_graph_compute()
 | |
|     // when plan.work_size > 0, caller must allocate memory for plan.work_data
 | |
|     GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
 | |
|                   const struct ggml_cgraph * cgraph,
 | |
|                                        int   n_threads, /* = GGML_DEFAULT_N_THREADS */
 | |
|                     struct ggml_threadpool * threadpool /* = NULL */ );
 | |
|     GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 | |
| 
 | |
|     // same as ggml_graph_compute() but the work data is allocated as a part of the context
 | |
|     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
 | |
|     GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 | |
| 
 | |
|     //
 | |
|     // system info
 | |
|     //
 | |
| 
 | |
|     // x86
 | |
|     GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_avx        (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_fma        (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
 | |
|     // ARM
 | |
|     GGML_BACKEND_API int ggml_cpu_has_neon       (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_sve        (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
 | |
|     // other
 | |
|     GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
 | |
|     GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
 | |
| 
 | |
|     // Internal types and functions exposed for tests and benchmarks
 | |
| 
 | |
|     typedef void (*ggml_from_float_to_mat_t)
 | |
|                                      (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
 | |
|     typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
 | |
|                                        const void * GGML_RESTRICT y, size_t by, int nrc);
 | |
|     typedef void (*ggml_gemv_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
 | |
|                                        const void * GGML_RESTRICT y, int nr, int nc);
 | |
|     typedef void (*ggml_gemm_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
 | |
|                                        const void * GGML_RESTRICT y, int nr, int nc);
 | |
| 
 | |
|     struct ggml_type_traits_cpu {
 | |
|         ggml_from_float_t        from_float;
 | |
|         ggml_from_float_to_mat_t from_float_to_mat;
 | |
|         ggml_vec_dot_t           vec_dot;
 | |
|         enum ggml_type           vec_dot_type;
 | |
|         int64_t                  nrows; // number of rows to process simultaneously
 | |
|         int64_t                  ncols; // number of columns to process simultaneously
 | |
|         ggml_gemv_t              gemv;
 | |
|         ggml_gemm_t              gemm;
 | |
|     };
 | |
| 
 | |
|     GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
 | |
| 
 | |
|     GGML_BACKEND_API void ggml_cpu_init(void);
 | |
| 
 | |
|     //
 | |
|     // CPU backend
 | |
|     //
 | |
| 
 | |
|     GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
 | |
| 
 | |
|     GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
 | |
|     GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
 | |
|     GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
 | |
|     GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 | |
| 
 | |
|     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 | |
| 
 | |
| #ifdef GGML_USE_CPU_HBM
 | |
|     GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 | |
| #endif
 | |
| 
 | |
|     GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
 | |
|     GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
 | |
| 
 | |
// closes the extern "C" region opened at the top of this header
#ifdef __cplusplus
}
#endif