mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	ggml : reuse quantum structs across backends (#5943)
* ggml : reuse quant blocks across backends ggml-ci * ggml : define helper constants only for CUDA and SYCL ggml-ci * ggml : define helper quantum constants for SYCL ggml-ci
This commit is contained in:
		
							
								
								
									
										409
									
								
								ggml-common.h
									
									
									
									
									
								
							
							
						
						
									
										409
									
								
								ggml-common.h
									
									
									
									
									
								
							@@ -1,4 +1,408 @@
 | 
			
		||||
#pragma once
 | 
			
		||||
#ifndef GGML_COMMON_DECL
 | 
			
		||||
 | 
			
		||||
#if defined(GGML_COMMON_DECL_C)
 | 
			
		||||
#include <stdint.h>
 | 
			
		||||
 | 
			
		||||
typedef uint16_t ggml_half;
 | 
			
		||||
typedef uint32_t ggml_half2;
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_AGGR
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_DECL
 | 
			
		||||
#elif defined(GGML_COMMON_DECL_METAL)
 | 
			
		||||
#include <metal_stdlib>
 | 
			
		||||
 | 
			
		||||
typedef half  ggml_half;
 | 
			
		||||
typedef half2 ggml_half2;
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_AGGR
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_DECL
 | 
			
		||||
#elif defined(GGML_COMMON_DECL_CUDA)
 | 
			
		||||
#include <cuda_fp16.h>
 | 
			
		||||
#include <cstdint>
 | 
			
		||||
 | 
			
		||||
typedef half  ggml_half;
 | 
			
		||||
typedef half2 ggml_half2;
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_AGGR data
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_DECL
 | 
			
		||||
#elif defined(GGML_COMMON_DECL_HIP)
 | 
			
		||||
#include <hip/hip_fp16.h>
 | 
			
		||||
#include <cstdint>
 | 
			
		||||
 | 
			
		||||
typedef half  ggml_half;
 | 
			
		||||
typedef half2 ggml_half2;
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_AGGR data
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_DECL
 | 
			
		||||
#elif defined(GGML_COMMON_DECL_SYCL)
 | 
			
		||||
#include <sycl/half_type.hpp>
 | 
			
		||||
#include <cstdint>
 | 
			
		||||
 | 
			
		||||
typedef sycl::half  ggml_half;
 | 
			
		||||
typedef sycl::half2 ggml_half2;
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_AGGR data
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_DECL
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(GGML_COMMON_DECL)
 | 
			
		||||
 | 
			
		||||
#ifndef __cplusplus
 | 
			
		||||
#ifndef static_assert
 | 
			
		||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 | 
			
		||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
 | 
			
		||||
#else
 | 
			
		||||
#define static_assert(cond, msg) struct global_scope_noop_trick
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
#endif // __cplusplus
 | 
			
		||||
 | 
			
		||||
// QK = number of values after dequantization
 | 
			
		||||
// QK_K = super-block size
 | 
			
		||||
 | 
			
		||||
#ifdef GGML_QKK_64
 | 
			
		||||
#define QK_K 64
 | 
			
		||||
#define K_SCALE_SIZE 4
 | 
			
		||||
#else
 | 
			
		||||
#define QK_K 256
 | 
			
		||||
#define K_SCALE_SIZE 12
 | 
			
		||||
#endif // GGML_QKK_64
 | 
			
		||||
 | 
			
		||||
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
 | 
			
		||||
// QR = QK / number of values before dequantization
 | 
			
		||||
// QI = number of 32 bit integers before dequantization
 | 
			
		||||
 | 
			
		||||
#define QI4_0 (QK4_0 / (4 * QR4_0))
 | 
			
		||||
#define QR4_0 2
 | 
			
		||||
 | 
			
		||||
#define QI4_1 (QK4_1 / (4 * QR4_1))
 | 
			
		||||
#define QR4_1 2
 | 
			
		||||
 | 
			
		||||
#define QI5_0 (QK5_0 / (4 * QR5_0))
 | 
			
		||||
#define QR5_0 2
 | 
			
		||||
 | 
			
		||||
#define QI5_1 (QK5_1 / (4 * QR5_1))
 | 
			
		||||
#define QR5_1 2
 | 
			
		||||
 | 
			
		||||
#define QI8_0 (QK8_0 / (4 * QR8_0))
 | 
			
		||||
#define QR8_0 1
 | 
			
		||||
 | 
			
		||||
#define QI8_1 (QK8_1 / (4 * QR8_1))
 | 
			
		||||
#define QR8_1 1
 | 
			
		||||
 | 
			
		||||
#define QI2_K (QK_K / (4*QR2_K))
 | 
			
		||||
#define QR2_K 4
 | 
			
		||||
 | 
			
		||||
#define QI3_K (QK_K / (4*QR3_K))
 | 
			
		||||
#define QR3_K 4
 | 
			
		||||
 | 
			
		||||
#define QI4_K (QK_K / (4*QR4_K))
 | 
			
		||||
#define QR4_K 2
 | 
			
		||||
 | 
			
		||||
#define QI5_K (QK_K / (4*QR5_K))
 | 
			
		||||
#define QR5_K 2
 | 
			
		||||
 | 
			
		||||
#define QI6_K (QK_K / (4*QR6_K))
 | 
			
		||||
#define QR6_K 2
 | 
			
		||||
 | 
			
		||||
#define QI2_XXS (QK_K / (4*QR2_XXS))
 | 
			
		||||
#define QR2_XXS 8
 | 
			
		||||
 | 
			
		||||
#define QI2_XS (QK_K / (4*QR2_XS))
 | 
			
		||||
#define QR2_XS 8
 | 
			
		||||
 | 
			
		||||
#define QI2_S (QK_K / (4*QR2_S))
 | 
			
		||||
#define QR2_S 8
 | 
			
		||||
 | 
			
		||||
#define QI3_XXS (QK_K / (4*QR3_XXS))
 | 
			
		||||
#define QR3_XXS 8
 | 
			
		||||
 | 
			
		||||
#define QI3_XS (QK_K / (4*QR3_XS))
 | 
			
		||||
#define QR3_XS 8
 | 
			
		||||
 | 
			
		||||
#define QI1_S (QK_K / (4*QR1_S))
 | 
			
		||||
#define QR1_S 8
 | 
			
		||||
 | 
			
		||||
#define QI4_NL (QK4_NL / (4*QR4_NL))
 | 
			
		||||
#define QR4_NL 2
 | 
			
		||||
 | 
			
		||||
#if QK_K == 64
 | 
			
		||||
#define QI4_XS QI4_NL
 | 
			
		||||
#define QR4_XS QR4_NL
 | 
			
		||||
#else
 | 
			
		||||
#define QI4_XS (QK_K / (4*QR4_XS))
 | 
			
		||||
#define QR4_XS 8
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 | 
			
		||||
 | 
			
		||||
#define QK4_0 32
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;           // delta
 | 
			
		||||
    uint8_t qs[QK4_0 / 2]; // nibbles / quants
 | 
			
		||||
} block_q4_0;
 | 
			
		||||
static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 block size/padding");
 | 
			
		||||
 | 
			
		||||
#define QK4_1 32
 | 
			
		||||
typedef struct {
 | 
			
		||||
    union {
 | 
			
		||||
        struct {
 | 
			
		||||
            ggml_half d; // delta
 | 
			
		||||
            ggml_half m; // min
 | 
			
		||||
        } GGML_COMMON_AGGR;
 | 
			
		||||
        ggml_half2 dm;
 | 
			
		||||
    };
 | 
			
		||||
    uint8_t qs[QK4_1 / 2]; // nibbles / quants
 | 
			
		||||
} block_q4_1;
 | 
			
		||||
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
 | 
			
		||||
 | 
			
		||||
#define QK5_0 32
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;           // delta
 | 
			
		||||
    uint8_t qh[4];         // 5-th bit of quants
 | 
			
		||||
    uint8_t qs[QK5_0 / 2]; // nibbles / quants
 | 
			
		||||
} block_q5_0;
 | 
			
		||||
static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
 | 
			
		||||
 | 
			
		||||
#define QK5_1 32
 | 
			
		||||
typedef struct {
 | 
			
		||||
    union {
 | 
			
		||||
        struct {
 | 
			
		||||
            ggml_half d; // delta
 | 
			
		||||
            ggml_half m; // min
 | 
			
		||||
        } GGML_COMMON_AGGR;
 | 
			
		||||
        ggml_half2 dm;
 | 
			
		||||
    };
 | 
			
		||||
    uint8_t qh[4];         // 5-th bit of quants
 | 
			
		||||
    uint8_t qs[QK5_1 / 2]; // nibbles / quants
 | 
			
		||||
} block_q5_1;
 | 
			
		||||
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 | 
			
		||||
 | 
			
		||||
#define QK8_0 32
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;       // delta
 | 
			
		||||
    int8_t  qs[QK8_0]; // quants
 | 
			
		||||
} block_q8_0;
 | 
			
		||||
static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block size/padding");
 | 
			
		||||
 | 
			
		||||
#define QK8_1 32
 | 
			
		||||
typedef struct {
 | 
			
		||||
    union {
 | 
			
		||||
        struct {
 | 
			
		||||
            ggml_half d; // delta
 | 
			
		||||
            ggml_half s; // d * sum(qs[i])
 | 
			
		||||
        } GGML_COMMON_AGGR;
 | 
			
		||||
        ggml_half2 ds;
 | 
			
		||||
    };
 | 
			
		||||
    int8_t qs[QK8_1]; // quants
 | 
			
		||||
} block_q8_1;
 | 
			
		||||
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
 | 
			
		||||
 | 
			
		||||
//
 | 
			
		||||
// Super-block quantization structures
 | 
			
		||||
//
 | 
			
		||||
 | 
			
		||||
// 2-bit quantization
 | 
			
		||||
// weight is represented as x = a * q + b
 | 
			
		||||
// 16 blocks of 16 elements each
 | 
			
		||||
// Effectively 2.625 bits per weight
 | 
			
		||||
typedef struct {
 | 
			
		||||
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
 | 
			
		||||
    uint8_t qs[QK_K/4];      // quants
 | 
			
		||||
    union {
 | 
			
		||||
        struct {
 | 
			
		||||
            ggml_half d;    // super-block scale for quantized scales
 | 
			
		||||
            ggml_half dmin; // super-block scale for quantized mins
 | 
			
		||||
        } GGML_COMMON_AGGR;
 | 
			
		||||
        ggml_half2 dm;
 | 
			
		||||
    };
 | 
			
		||||
} block_q2_K;
 | 
			
		||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
 | 
			
		||||
 | 
			
		||||
// 3-bit quantization
 | 
			
		||||
// weight is represented as x = a * q
 | 
			
		||||
// 16 blocks of 16 elements each
 | 
			
		||||
// Effectively 3.4375 bits per weight
 | 
			
		||||
#ifdef GGML_QKK_64
 | 
			
		||||
typedef struct {
 | 
			
		||||
    uint8_t hmask[QK_K/8]; // quants - high bit
 | 
			
		||||
    uint8_t qs[QK_K/4];    // quants - low 2 bits
 | 
			
		||||
    uint8_t scales[2];
 | 
			
		||||
    ggml_half d;           // super-block scale
 | 
			
		||||
} block_q3_K;
 | 
			
		||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
 | 
			
		||||
#else
 | 
			
		||||
typedef struct {
 | 
			
		||||
    uint8_t hmask[QK_K/8]; // quants - high bit
 | 
			
		||||
    uint8_t qs[QK_K/4];    // quants - low 2 bits
 | 
			
		||||
    uint8_t scales[12];    // scales, quantized with 6 bits
 | 
			
		||||
    ggml_half d;           // super-block scale
 | 
			
		||||
} block_q3_K;
 | 
			
		||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// 4-bit quantization
 | 
			
		||||
// 8 blocks of 32 elements each
 | 
			
		||||
// weight is represented as x = a * q + b
 | 
			
		||||
// Effectively 4.5 bits per weight
 | 
			
		||||
#ifdef GGML_QKK_64
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d[2];     // super-block scales/mins
 | 
			
		||||
    uint8_t scales[2];  // 4-bit block scales/mins
 | 
			
		||||
    uint8_t qs[QK_K/2]; // 4--bit quants
 | 
			
		||||
} block_q4_K;
 | 
			
		||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
 | 
			
		||||
#else
 | 
			
		||||
typedef struct {
 | 
			
		||||
    union {
 | 
			
		||||
        struct {
 | 
			
		||||
            ggml_half d;    // super-block scale for quantized scales
 | 
			
		||||
            ggml_half dmin; // super-block scale for quantized mins
 | 
			
		||||
        } GGML_COMMON_AGGR;
 | 
			
		||||
        ggml_half2 dm;
 | 
			
		||||
    };
 | 
			
		||||
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
 | 
			
		||||
    uint8_t qs[QK_K/2];           // 4--bit quants
 | 
			
		||||
} block_q4_K;
 | 
			
		||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// 5-bit quantization
 | 
			
		||||
// 8 blocks of 32 elements each
 | 
			
		||||
// weight is represented as x = a * q + b
 | 
			
		||||
// Effectively 5.5 bits per weight
 | 
			
		||||
#ifdef GGML_QKK_64
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;             // super-block scale
 | 
			
		||||
    int8_t  scales[QK_K/16]; // 8-bit block scales
 | 
			
		||||
    uint8_t qh[QK_K/8];      // quants, high bit
 | 
			
		||||
    uint8_t qs[QK_K/2];      // quants, low 4 bits
 | 
			
		||||
} block_q5_K;
 | 
			
		||||
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
 | 
			
		||||
#else
 | 
			
		||||
typedef struct {
 | 
			
		||||
    union {
 | 
			
		||||
        struct {
 | 
			
		||||
            ggml_half d;    // super-block scale for quantized scales
 | 
			
		||||
            ggml_half dmin; // super-block scale for quantized mins
 | 
			
		||||
        } GGML_COMMON_AGGR;
 | 
			
		||||
        ggml_half2 dm;
 | 
			
		||||
    };
 | 
			
		||||
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
 | 
			
		||||
    uint8_t qh[QK_K/8];           // quants, high bit
 | 
			
		||||
    uint8_t qs[QK_K/2];           // quants, low 4 bits
 | 
			
		||||
} block_q5_K;
 | 
			
		||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
// 6-bit quantization
 | 
			
		||||
// weight is represented as x = a * q
 | 
			
		||||
// 16 blocks of 16 elements each
 | 
			
		||||
// Effectively 6.5625 bits per weight
 | 
			
		||||
typedef struct {
 | 
			
		||||
    uint8_t ql[QK_K/2];      // quants, lower 4 bits
 | 
			
		||||
    uint8_t qh[QK_K/4];      // quants, upper 2 bits
 | 
			
		||||
    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
 | 
			
		||||
    ggml_half d;             // super-block scale
 | 
			
		||||
} block_q6_K;
 | 
			
		||||
static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
 | 
			
		||||
 | 
			
		||||
// This is only used for intermediate quantization and dot products
 | 
			
		||||
typedef struct {
 | 
			
		||||
    float   d;              // delta
 | 
			
		||||
    int8_t  qs[QK_K];       // quants
 | 
			
		||||
    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
 | 
			
		||||
} block_q8_K;
 | 
			
		||||
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
 | 
			
		||||
 | 
			
		||||
// (Almost) "true" 2-bit quantization.
 | 
			
		||||
// Due to the need to use blocks as per ggml design, it ends up using
 | 
			
		||||
// 2.0625 bpw because of the 16-bit scale for each block of 256.
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint16_t qs[QK_K/8];
 | 
			
		||||
} block_iq2_xxs;
 | 
			
		||||
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
 | 
			
		||||
 | 
			
		||||
// 2.3125 bpw quants
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint16_t qs[QK_K/8];
 | 
			
		||||
    uint8_t  scales[QK_K/32];
 | 
			
		||||
} block_iq2_xs;
 | 
			
		||||
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
 | 
			
		||||
 | 
			
		||||
// 2.5625 bpw quants
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint8_t qs[QK_K/4];
 | 
			
		||||
    uint8_t qh[QK_K/32];
 | 
			
		||||
    uint8_t scales[QK_K/32];
 | 
			
		||||
} block_iq2_s;
 | 
			
		||||
static_assert(sizeof(block_iq2_s) == sizeof(ggml_half) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
 | 
			
		||||
 | 
			
		||||
// (Almost) "true" 3-bit quantization.
 | 
			
		||||
// Due to the need to use blocks as per ggml design, it ends up using
 | 
			
		||||
// 3.0625 bpw because of the 16-bit scale for each block of 256.
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint8_t qs[3*QK_K/8];
 | 
			
		||||
} block_iq3_xxs;
 | 
			
		||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 | 
			
		||||
 | 
			
		||||
// 3.4375 bpw
 | 
			
		||||
#if QK_K == 64
 | 
			
		||||
#define IQ3S_N_SCALE 2
 | 
			
		||||
#else
 | 
			
		||||
#define IQ3S_N_SCALE QK_K/64
 | 
			
		||||
#endif
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint8_t qs[QK_K/4];
 | 
			
		||||
    uint8_t qh[QK_K/32];
 | 
			
		||||
    uint8_t signs[QK_K/8];
 | 
			
		||||
    uint8_t scales[IQ3S_N_SCALE];
 | 
			
		||||
} block_iq3_s;
 | 
			
		||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint8_t  qs[QK_K/8];
 | 
			
		||||
    uint16_t qh[QK_K/32];
 | 
			
		||||
} block_iq1_s;
 | 
			
		||||
static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 | 
			
		||||
 | 
			
		||||
// Non-linear quants
 | 
			
		||||
#define QK4_NL 32
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint8_t qs[QK4_NL/2];
 | 
			
		||||
} block_iq4_nl;
 | 
			
		||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
 | 
			
		||||
 | 
			
		||||
#if QK_K == 64
 | 
			
		||||
#define block_iq4_xs block_iq4_nl
 | 
			
		||||
#else
 | 
			
		||||
typedef struct {
 | 
			
		||||
    ggml_half d;
 | 
			
		||||
    uint16_t scales_h;
 | 
			
		||||
    uint8_t  scales_l[QK_K/64];
 | 
			
		||||
    uint8_t  qs[QK_K/2];
 | 
			
		||||
} block_iq4_xs;
 | 
			
		||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif // GGML_COMMON_DECL
 | 
			
		||||
#endif // GGML_COMMON_DECL
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
#ifndef GGML_COMMON_IMPL
 | 
			
		||||
 | 
			
		||||
#if defined(GGML_COMMON_IMPL_C)
 | 
			
		||||
#include <stdint.h>
 | 
			
		||||
@@ -14,7 +418,7 @@
 | 
			
		||||
#define GGML_TABLE_END() };
 | 
			
		||||
 | 
			
		||||
#define GGML_COMMON_IMPL
 | 
			
		||||
#elif defined(GGML_COMMON_IMPL_CUDA)
 | 
			
		||||
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
 | 
			
		||||
#include <cstdint>
 | 
			
		||||
 | 
			
		||||
#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
 | 
			
		||||
@@ -1423,3 +1827,4 @@ GGML_TABLE_END()
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif // GGML_COMMON_IMPL
 | 
			
		||||
#endif // GGML_COMMON_IMPL
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user