mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	* ggml: add s390x ARCH_FLAGS for compilation
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add SIMD for s390x using vector intrinsics
SIMD is activated for:
* ggml_vec_dot_f32
* ggml_vec_dot_f16
* ggml_vec_mad_f32
* ggml_vec_mad_f16
* ggml_vec_mad_f32_unroll
* ggml_vec_scale_f32
* ggml_vec_scale_f16
SIMD is NOT activated for:
* ggml_vec_dot_f16_unroll (pending bugfix)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix missing escape character in GGML_F32x4_REDUCE
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add temporary patch for GGML_F32_ARR and GGML_F16_ARR
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix s390x GGML_F32x4_REDUCE
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: full SIMD activation for F32,F16 s390x
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add option to disable s390x VXE/VXE2
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: change vecintrin.h include to ggml-cpu-impl
* add __VXE__ and __VXE2__ macros
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* cmake: add s390x target detection for VX/VXE/VXE2
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: move s390x vector intrinsics to ggml-cpu-impl.h
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x Q8_0 SIMD
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: correct documentation for Q8_0
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x reduce code complexity Q8_0
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x bugfix typo Q8_0
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activated for Q4_1
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x inline vec_reve
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for Q4_0
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add VXE backend feature
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: remove test.py
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for quantize_row_q8_0
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for quantize_row_q8_1
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for iq4_xs
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: bugfix iq4_xs
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for iq4_nl
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add float, double, and long vector data type
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: clean up iq4_xs SIMD
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix improper use of restrict keyword
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: update warning message for ggml_vec_tbl
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: untested implementation of ggml_vec_dot_iq2_xxs_q8_K
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: update ggml_vec_dot_q4_1_q8_1 to use typedefs
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: switch to restrict for iq4_nl
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: slight dot product speed improvement for q4_1_q8_1
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for q6_K
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add missing `_t` to ggml_int8x16x4_t
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix missing `_t` for ggml_vec_xl_s8x4
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix more missing `_t`
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add unroll and prefetch to Q8_0
increase of 3.86% for prompt processing and 32.22% for token generation
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: patch Q8_0 to use proper vector sizes
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: optimise Q8_0 dot prod compute kernel further
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: add unroll and prefetch to Q4_1
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: refactor Q6_K variable naming for readability
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix Q6_K typos
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for Q5_K
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix wrong char*x16_t naming
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: Q5_K y0 wrong signedness
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix Q5_K invalid uchar type
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix Q5_K invalid uchar type
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: s390x SIMD activation for Q4_K
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: fix Q4_K invalid vector intrinsics
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: simplify ggml_padd_s16 compute kernel
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: correct ggml-cpu vxe wording
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: change ggml_aligned_malloc alignment to 256
256 is the cache line size for s390x platforms
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: resolve pr merge via cherry-pick 225bbbf
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml : fix LoongArch compile error with 128-bit SIMD (#11701)
* ggml: resolve pr merge via cherry-pick 4571953
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
* ggml: cmake remove fork when determining s390x machine type
thank you @ericcurtin
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
---------
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
Co-authored-by: Jinyang He <hejinyang@loongson.cn>
Co-authored-by: junchao-zhao <68935141+junchao-loongson@users.noreply.github.com>
		
	
		
			
				
	
	
		
			138 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			138 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
#pragma once
 | 
						|
 | 
						|
#include "ggml.h"
 | 
						|
#include "ggml-backend.h"
 | 
						|
 | 
						|
#ifdef  __cplusplus
 | 
						|
extern "C" {
 | 
						|
#endif
 | 
						|
 | 
						|
    // the compute plan that needs to be prepared for ggml_graph_compute()
 | 
						|
    // since https://github.com/ggml-org/ggml/issues/287
 | 
						|
    struct ggml_cplan {
 | 
						|
        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
 | 
						|
        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 | 
						|
 | 
						|
        int n_threads;
 | 
						|
        struct ggml_threadpool * threadpool;
 | 
						|
 | 
						|
        // abort ggml_graph_compute when true
 | 
						|
        ggml_abort_callback abort_callback;
 | 
						|
        void *              abort_callback_data;
 | 
						|
    };
 | 
						|
 | 
						|
    // numa strategies
    //
    // The first five enumerators carry explicit values 0..4. GGML_NUMA_STRATEGY_COUNT
    // has no initializer, so it takes the value after MIRROR (i.e. 5), which equals the
    // number of strategies listed before it. Keep it last when adding new entries.
    enum ggml_numa_strategy {
        GGML_NUMA_STRATEGY_DISABLED   = 0,
        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
        GGML_NUMA_STRATEGY_ISOLATE    = 2,
        GGML_NUMA_STRATEGY_NUMACTL    = 3,
        GGML_NUMA_STRATEGY_MIRROR     = 4,
        GGML_NUMA_STRATEGY_COUNT
    };
 | 
						|
 | 
						|
    GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
 | 
						|
    GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 | 
						|
 | 
						|
    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 | 
						|
    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
 | 
						|
 | 
						|
    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
 | 
						|
    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
 | 
						|
 | 
						|
    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
 | 
						|
    GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
 | 
						|
 | 
						|
    GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
 | 
						|
    GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
 | 
						|
 | 
						|
    GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
 | 
						|
    GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
 | 
						|
 | 
						|
    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
 | 
						|
    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 | 
						|
 | 
						|
    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params  * params);
 | 
						|
    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
 | 
						|
    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
 | 
						|
    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
 | 
						|
    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
 | 
						|
 | 
						|
    // ggml_graph_plan() has to be called before ggml_graph_compute()
 | 
						|
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
 | 
						|
    GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
 | 
						|
                  const struct ggml_cgraph * cgraph,
 | 
						|
                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
 | 
						|
                    struct ggml_threadpool * threadpool /* = NULL */ );
 | 
						|
    GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 | 
						|
 | 
						|
    // same as ggml_graph_compute() but the work data is allocated as a part of the context
 | 
						|
    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
 | 
						|
    GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
 | 
						|
 | 
						|
    //
 | 
						|
    // system info
 | 
						|
    //
 | 
						|
 | 
						|
    // x86
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
 | 
						|
    // ARM
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_sme        (void);
 | 
						|
    // other
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
 | 
						|
    GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
 | 
						|
 | 
						|
    // Internal types and functions exposed for tests and benchmarks
 | 
						|
 | 
						|
    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
 | 
						|
                                       const void * GGML_RESTRICT y, size_t by, int nrc);
 | 
						|
 | 
						|
    struct ggml_type_traits_cpu {
 | 
						|
        ggml_from_float_t        from_float;
 | 
						|
        ggml_vec_dot_t           vec_dot;
 | 
						|
        enum ggml_type           vec_dot_type;
 | 
						|
        int64_t                  nrows; // number of rows to process simultaneously
 | 
						|
    };
 | 
						|
 | 
						|
    GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
 | 
						|
 | 
						|
    GGML_BACKEND_API void ggml_cpu_init(void);
 | 
						|
 | 
						|
    //
 | 
						|
    // CPU backend
 | 
						|
    //
 | 
						|
 | 
						|
    GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
 | 
						|
 | 
						|
    GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
 | 
						|
    GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
 | 
						|
    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
 | 
						|
    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 | 
						|
 | 
						|
    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 | 
						|
 | 
						|
#ifdef __cplusplus
 | 
						|
}
 | 
						|
#endif
 |