mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	k-quants : support for super-block size of 64 (#2001)
* k_quants: WIP super-blocks with 64 weights * k_quants: WIP super-blocks with 64 weights Q6_K scalar and AVX2 works * k_quants: WIP super-blocks with 64 weights Q4_K scalar and AVX2 works * k_quants: WIP super-blocks with 64 weights Q2_K scalar and AVX2 works. Q2_K is way too slow (it is actually slower than the scalar implementation) * k_quants: WIP super-blocks with 64 weights Q3_K scalar and AVX2 works. * k_quants: WIP super-blocks with 64 weights Q5_K scalar and AVX2 works, and with that all k_quants are done on AVX2 and scalar * k_quants: WIP super-blocks with 64 weights Q6_K working on CUDA. Cannot make it run quite as fast as with super-blocks with 256 weights: 8% slower on 4080, 20% slower on the 1660 (but there we fit 1 less layer on the GPU because of the larger model size), so some fraction of these 20% is due to that. * k_quants: WIP super-blocks with 64 weights Q4_K working on CUDA. ~10% slower on GTX-1660, 16% slower on 4080. * k_quants: WIP super-blocks with 64 weights Q2_K working on CUDA. ~3% slower on GTX-1660, 10% slower on 4080. * k_quants: WIP super-blocks with 64 weights Q3_K working on CUDA. * k_quants: WIP super-blocks with 64 weights Q5_K working on CUDA, and with this CUDA is done. * k_quants: WIP super-blocks with 64 weights Q6_K working on ARM_NEON * k_quants: WIP super-blocks with 64 weights Q4_K working on ARM_NEON, but quite a bit slower than 256 weights * k_quants: WIP super-blocks with 64 weights Q2_K working on ARM_NEON, but quite a bit slower than 256 weights * k_quants: WIP super-blocks with 64 weights Q3_K working on ARM_NEON, but quite a bit slower than 256 weights. * k_quants: WIP super-blocks with 64 weights Q5_K working on ARM_NEON, but quite a bit slower than 256 weights. With that, we have full support for ARM_NEON, although performance is not quite there. 
* k_quants: WIP super-blocks with 64 weights Slightly more efficient Q3_K and Q5_K * k_quants: WIP super-blocks with 64 weights Another small improvement for Q3_K and Q5_K on ARM_NEON * k_quants: WIP super-blocks with 64 weights Yet another speedup for Q5_K on ARM_NEON. We are now within 10% of the QK_K = 256 version. * k_quants: WIP super-blocks with 64 weights * We are able to pass preprocessor macros to the Metal compiler * Q6_K works and is actually slightly more efficient than the QK_K = 256 version (25.2 ms vs 25.8 ms) * k_quants: WIP super-blocks with 64 weights Q4_K works on Metal and is actually slightly faster than QK_K = 256 (21.95 ms vs 24.0 ms). * k_quants: WIP super-blocks with 64 weights Q2_K works on Metal and is very slightly faster than QK_K = 256 (23.8 ms vs 24.2 ms). * k_quants: WIP super-blocks with 64 weights Q3_K works on Metal and is slightly faster than QK_K = 256 (26.6 ms vs 28.3 ms). * k_quants: WIP super-blocks with 64 weights Q5_K works on Metal and is slightly faster than QK_K = 256 (23.7 ms vs 26.3 ms). * k_quants: call them _K, not _k, also on Metal * k_quants: correctly define QK_K in llama.cpp * Fixed bug in q4_K quantization added with the 64-block addition * Simplify via lambda * k_quants: switch Q3_K to 4-bit scales when QK_K = 64 Otherwise there isn't much benefit from this quantization type. There is some very slight loss in accuracy, but we reduce size by ~7%. E.g., for OpenLLaMA-3B, Q3_K_S perplexity is 8.6131 with 8-bit scales and 8.6352 with 4-bit, while file size decreases from 1.53G to 1.44G. * k_quants: switch Q4_K to 4-bit scales when QK_K = 64 Here the loss in accuracy is greater than for Q3_K, but the Q4_K points still move further to the left on the perplexity vs size curve. 
* k_quants: forgot to add the Metal changes in last commit * k_quants: change Q5_K to be type 0 when QK_K = 64 Still needs AVX2 implementation * k_quants: AVX2 implementation for new 64-weight Q5_K * k_quants: 10% faster ARM_NEON Q5_K dot product * k_quants: fixed issue caused by merging with master --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
		| @@ -75,6 +75,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for | ||||
| option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF) | ||||
| option(LLAMA_METAL                           "llama: use Metal"                                 OFF) | ||||
| option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON) | ||||
| option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF) | ||||
|  | ||||
| option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE}) | ||||
| option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE}) | ||||
| @@ -225,6 +226,14 @@ if (LLAMA_BLAS) | ||||
|     endif() | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_K_QUANTS) | ||||
|     set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) | ||||
|     add_compile_definitions(GGML_USE_K_QUANTS) | ||||
|     if (LLAMA_QKK_64) | ||||
|         add_compile_definitions(GGML_QKK_64) | ||||
|     endif() | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_CUBLAS) | ||||
|     cmake_minimum_required(VERSION 3.17) | ||||
|  | ||||
| @@ -289,11 +298,6 @@ if (LLAMA_METAL) | ||||
|         ) | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_K_QUANTS) | ||||
|     set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) | ||||
|     add_compile_definitions(GGML_USE_K_QUANTS) | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_CLBLAST) | ||||
|     find_package(CLBlast) | ||||
|     if (CLBlast_FOUND) | ||||
|   | ||||
							
								
								
									
										9
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										9
									
								
								Makefile
									
									
									
									
									
								
							| @@ -43,8 +43,11 @@ endif | ||||
|  | ||||
| # keep standard at C11 and C++11 | ||||
| # -Ofast tends to produce faster code, but may not be available for some compilers. | ||||
| #OPT = -Ofast | ||||
| ifdef LLAMA_FAST | ||||
| OPT = -Ofast | ||||
| else | ||||
| OPT = -O3 | ||||
| endif | ||||
| CFLAGS   = -I.              $(OPT) -std=c11   -fPIC | ||||
| CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC | ||||
| LDFLAGS  = | ||||
| @@ -131,6 +134,10 @@ ifndef LLAMA_NO_K_QUANTS | ||||
| 	CFLAGS   += -DGGML_USE_K_QUANTS | ||||
| 	CXXFLAGS += -DGGML_USE_K_QUANTS | ||||
| 	OBJS     += k_quants.o | ||||
| ifdef LLAMA_QKK_64 | ||||
| 	CFLAGS   += -DGGML_QKK_64 | ||||
| 	CXXFLAGS += -DGGML_QKK_64 | ||||
| endif | ||||
| endif | ||||
|  | ||||
| ifndef LLAMA_NO_ACCELERATE | ||||
|   | ||||
							
								
								
									
										370
									
								
								ggml-cuda.cu
									
									
									
									
									
								
							
							
						
						
									
										370
									
								
								ggml-cuda.cu
									
									
									
									
									
								
							| @@ -117,7 +117,13 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo | ||||
|  | ||||
| //================================= k-quants | ||||
|  | ||||
| #ifdef GGML_QKK_64 | ||||
| #define QK_K 64 | ||||
| #define K_SCALE_SIZE 4 | ||||
| #else | ||||
| #define QK_K 256 | ||||
| #define K_SCALE_SIZE 12 | ||||
| #endif | ||||
|  | ||||
| typedef struct { | ||||
|     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits | ||||
| @@ -128,13 +134,25 @@ typedef struct { | ||||
| static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); | ||||
|  | ||||
| typedef struct { | ||||
|     uint8_t hmask[QK_K/8]; | ||||
|     uint8_t qs[QK_K/4]; // nibbles / quants | ||||
|     uint8_t scales[3*QK_K/64]; | ||||
|     half d; | ||||
|     uint8_t hmask[QK_K/8];     // quants - high bit | ||||
|     uint8_t qs[QK_K/4];        // quants - low 2 bits | ||||
| #ifdef GGML_QKK_64 | ||||
|     uint8_t scales[2]; // scales, quantized with 4 bits (two per byte: low nibble, high nibble) | ||||
| #else | ||||
|     uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits | ||||
| #endif | ||||
|     half d;             // super-block scale | ||||
| } block_q3_K; | ||||
| static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); | ||||
| //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); | ||||
|  | ||||
| #ifdef GGML_QKK_64 | ||||
| typedef struct { | ||||
|     half    d[2];              // d[0]: super-block scale, d[1]: super-block min | ||||
|     uint8_t scales[2];         // one byte per 32 weights: low nibble = 4-bit scale, high nibble = 4-bit min | ||||
|     uint8_t qs[QK_K/2];        // 4-bit quants | ||||
| } block_q4_K; | ||||
| static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); | ||||
| #else | ||||
| typedef struct { | ||||
|     half d;                    // super-block scale for quantized scales | ||||
|     half dmin;                 // super-block scale for quantized mins | ||||
| @@ -142,15 +160,26 @@ typedef struct { | ||||
|     uint8_t qs[QK_K/2];        // 4--bit quants | ||||
| } block_q4_K; | ||||
| static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); | ||||
| #endif | ||||
|  | ||||
| #ifdef GGML_QKK_64 | ||||
| typedef struct { | ||||
|     half    d;                   // super-block scale for quantized scales | ||||
|     half    dmin;                // super-block scale for quantized mins | ||||
|     uint8_t scales[3*QK_K/64];   // scales, quantized with 6 bits | ||||
|     half d;                  // super-block scale | ||||
|     int8_t scales[QK_K/16];  // block scales | ||||
|     uint8_t qh[QK_K/8];      // quants, high bit | ||||
|     uint8_t qs[QK_K/2];      // quants, low 4 bits | ||||
| } block_q5_K; | ||||
| static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); | ||||
| #else | ||||
| typedef struct { | ||||
|     half d;               // super-block scale for quantized scales | ||||
|     half dmin;            // super-block scale for quantized mins | ||||
|     uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits | ||||
|     uint8_t qh[QK_K/8];          // quants, high bit | ||||
|     uint8_t qs[QK_K/2];          // quants, low 4 bits | ||||
| } block_q5_K; | ||||
| static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); | ||||
| static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); | ||||
| #endif | ||||
|  | ||||
| typedef struct { | ||||
|     uint8_t ql[QK_K/2];   // quants, lower 4 bits | ||||
| @@ -349,13 +378,14 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in | ||||
| static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { | ||||
|  | ||||
|     const int i   = blockIdx.x; | ||||
|     const block_q2_K * x = (const block_q2_K *) vx; | ||||
|  | ||||
|     const int tid = threadIdx.x; | ||||
| #if QK_K == 256 | ||||
|     const int n   = tid/32; | ||||
|     const int l   = tid - 32*n; | ||||
|     const int is  = 8*n + l/16; | ||||
|  | ||||
|     const block_q2_K * x = (const block_q2_K *) vx; | ||||
|  | ||||
|     const uint8_t q = x[i].qs[32*n + l]; | ||||
|     float * y = yy + i*QK_K + 128*n; | ||||
|  | ||||
| @@ -365,21 +395,32 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) { | ||||
|     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); | ||||
|     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); | ||||
|     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); | ||||
| #else | ||||
|     const int is = tid/16;  // 0 or 1 | ||||
|     const int il = tid%16;  // 0...15 | ||||
|     const uint8_t q = x[i].qs[il] >> (2*is); | ||||
|     float * y = yy + i*QK_K + 16*is + il; | ||||
|     float dall = x[i].d; | ||||
|     float dmin = x[i].dmin; | ||||
|     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); | ||||
|     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { | ||||
|  | ||||
|     int r = threadIdx.x/4; | ||||
|     int i = blockIdx.x; | ||||
|     int tid = r/2; | ||||
|     int is0 = r%2; | ||||
|     int l0 = 16*is0 + 4*(threadIdx.x%4); | ||||
|     int n = tid / 4; | ||||
|     int j = tid - 4*n; | ||||
|  | ||||
|     const int i = blockIdx.x; | ||||
|     const block_q3_K * x = (const block_q3_K *) vx; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const int r = threadIdx.x/4; | ||||
|     const int tid = r/2; | ||||
|     const int is0 = r%2; | ||||
|     const int l0 = 16*is0 + 4*(threadIdx.x%4); | ||||
|     const int n = tid / 4; | ||||
|     const int j = tid - 4*n; | ||||
|  | ||||
|     uint8_t m = 1 << (4*n + j); | ||||
|     int is = 8*n + 2*j + is0; | ||||
|     int shift = 2*j; | ||||
| @@ -396,9 +437,31 @@ static __global__ void dequantize_block_q3_K(const void * vx, float * yy) { | ||||
|     const uint8_t * hm = x[i].hmask; | ||||
|  | ||||
|     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); | ||||
| #else | ||||
|     const int tid = threadIdx.x; | ||||
|     const int is  = tid/16;  // 0 or 1 | ||||
|     const int il  = tid%16;  // 0...15 | ||||
|     const int im  = il/8;    // 0...1 | ||||
|     const int in  = il%8;    // 0...7 | ||||
|  | ||||
|     float * y = yy + i*QK_K + 16*is + il; | ||||
|  | ||||
|     const uint8_t q = x[i].qs[il] >> (2*is); | ||||
|     const uint8_t h = x[i].hmask[in] >> (2*is + im); | ||||
|     const float   d = (float)x[i].d; | ||||
|  | ||||
|     if (is == 0) { | ||||
|         y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); | ||||
|         y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); | ||||
|     } else { | ||||
|         y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); | ||||
|         y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); | ||||
|     } | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| #if QK_K == 256 | ||||
| static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { | ||||
|     if (j < 4) { | ||||
|         d = q[j] & 63; m = q[j + 4] & 63; | ||||
| @@ -407,19 +470,14 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t | ||||
|         m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4); | ||||
|     } | ||||
| } | ||||
| #endif | ||||
|  | ||||
| static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { | ||||
|     const block_q4_K * x = (const block_q4_K *) vx; | ||||
|  | ||||
|     const int i = blockIdx.x; | ||||
|  | ||||
|     //// assume 64 threads - this is very slightly better than the one below | ||||
|     //const int tid = threadIdx.x; | ||||
|     //const int il  = tid/16; | ||||
|     //const int ir  = tid%16; | ||||
|     //const int is  = 2*il; | ||||
|     //const int n   = 2; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     // assume 32 threads | ||||
|     const int tid = threadIdx.x; | ||||
|     const int il  = tid/8; | ||||
| @@ -443,6 +501,15 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) { | ||||
|         y[l + 0] = d1 * (q[l] & 0xF) - m1; | ||||
|         y[l +32] = d2 * (q[l] >>  4) - m2; | ||||
|     } | ||||
| #else | ||||
|     const int tid = threadIdx.x; | ||||
|     const uint8_t * q = x[i].qs; | ||||
|     float * y = yy + i*QK_K; | ||||
|     const float d = (float)x[i].d[0]; | ||||
|     const float m = (float)x[i].d[1]; | ||||
|     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); | ||||
|     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { | ||||
| @@ -450,6 +517,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { | ||||
|  | ||||
|     const int i = blockIdx.x; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     // assume 64 threads - this is very slightly better than the one below | ||||
|     const int tid = threadIdx.x; | ||||
|     const int il  = tid/16;   // il is in 0...3 | ||||
| @@ -476,12 +544,25 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) { | ||||
|     hm <<= 1; | ||||
|     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2; | ||||
|     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2; | ||||
| #else | ||||
|     const int tid = threadIdx.x; | ||||
|     const uint8_t q = x[i].qs[tid]; | ||||
|     const int im = tid/8;  // 0...3 | ||||
|     const int in = tid%8;  // 0...7 | ||||
|     const int is = tid/16; // 0 or 1 | ||||
|     const uint8_t h = x[i].qh[in] >> im; | ||||
|     const float d = x[i].d; | ||||
|     float * y = yy + i*QK_K + tid; | ||||
|     y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); | ||||
|     y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { | ||||
|     const block_q6_K * x = (const block_q6_K *) vx; | ||||
|  | ||||
|     const int i = blockIdx.x; | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     // assume 64 threads - this is very slightly better than the one below | ||||
|     const int tid = threadIdx.x; | ||||
| @@ -501,6 +582,24 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) { | ||||
|     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); | ||||
|     y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32); | ||||
|     y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32); | ||||
| #else | ||||
|  | ||||
|     // assume 32 threads | ||||
|     const int tid = threadIdx.x; | ||||
|     const int ip  = tid/16;         // 0 or 1 | ||||
|     const int il  = tid - 16*ip;    // 0...15 | ||||
|  | ||||
|     float * y = yy + i*QK_K + 16*ip + il; | ||||
|  | ||||
|     const float d = x[i].d; | ||||
|  | ||||
|     const uint8_t   ql = x[i].ql[16*ip + il]; | ||||
|     const uint8_t   qh = x[i].qh[il] >> (2*ip); | ||||
|     const int8_t  * sc = x[i].scales; | ||||
|  | ||||
|     y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); | ||||
|     y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { | ||||
| @@ -515,6 +614,9 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float | ||||
|  | ||||
|     const block_q2_K * x = (const block_q2_K *)vx + ib0; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15 | ||||
|     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1 | ||||
|  | ||||
| @@ -528,8 +630,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float | ||||
|     const int s_offset = 8*im; | ||||
|     const int y_offset = 128*im + l0; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
|     uint32_t aux[4]; | ||||
|     const uint8_t * d = (const uint8_t *)aux; | ||||
|     const uint8_t * m = (const uint8_t *)(aux + 2); | ||||
| @@ -565,6 +665,39 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float | ||||
|         tmp += dall * sum1 - dmin * sum2; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7 | ||||
|     const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3 | ||||
|     const int offset = tid * K_QUANTS_PER_ITERATION; | ||||
|  | ||||
|     uint32_t uaux[2]; | ||||
|     const uint8_t * d = (const uint8_t *)uaux; | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|  | ||||
|         const float   * y = yy + i * QK_K + offset; | ||||
|         const uint8_t * q = x[i].qs + offset; | ||||
|         const uint32_t * s = (const uint32_t *)x[i].scales; | ||||
|  | ||||
|         uaux[0] = s[0] & 0x0f0f0f0f; | ||||
|         uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; | ||||
|  | ||||
|         const half2 * dh = (const half2 *)&x[i].d; | ||||
|  | ||||
|         const float2 dall = __half22float2(dh[0]); | ||||
|  | ||||
|         float sum1 = 0, sum2 = 0; | ||||
|         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { | ||||
|             const uint8_t ql = q[l]; | ||||
|             sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) | ||||
|                   + y[l+16] * d[1] * ((ql >> 2) & 3) | ||||
|                   + y[l+32] * d[2] * ((ql >> 4) & 3) | ||||
|                   + y[l+48] * d[3] * ((ql >> 6) & 3); | ||||
|             sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; | ||||
|         } | ||||
|         tmp += dall.x * sum1 - dall.y * sum2; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
|     __syncthreads(); | ||||
| @@ -573,16 +706,13 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float | ||||
|         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); | ||||
|     } | ||||
|  | ||||
|     if (tid == 0) { | ||||
|     if (threadIdx.x == 0) { | ||||
|         dst[row] = tmp; | ||||
|     } | ||||
| } | ||||
|  | ||||
| static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { | ||||
|  | ||||
|     const uint16_t kmask1 = 0x0303; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|  | ||||
|     const int row = blockIdx.y*blockDim.y + threadIdx.y; | ||||
|     if (row > nrows) return; | ||||
|  | ||||
| @@ -591,6 +721,13 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float | ||||
|  | ||||
|     const block_q3_K * x = (const block_q3_K *)vx + ib0; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const uint16_t kmask1 = 0x0303; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|  | ||||
|     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16 | ||||
|     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1 | ||||
|  | ||||
| @@ -610,8 +747,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float | ||||
|  | ||||
|     const uint16_t s_shift = 4*im; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { | ||||
|  | ||||
|         const float   * y  = yy + i * QK_K + y_offset; | ||||
| @@ -640,6 +775,34 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float | ||||
|         tmp += d * sum; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|  | ||||
|     const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7 | ||||
|     const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3 | ||||
|     const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14 | ||||
|     const int in = offset/8;                                 // 0 or 1 | ||||
|     const int im = offset%8;                                 // 0...7 | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|  | ||||
|         const float   * y = yy + i * QK_K + offset; | ||||
|         const uint8_t * q = x[i].qs + offset; | ||||
|         const uint8_t * s = x[i].scales; | ||||
|  | ||||
|         const float dall = (float)x[i].d; | ||||
|  | ||||
|         float sum = 0; | ||||
|         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { | ||||
|             const uint8_t hl = x[i].hmask[im+l] >> in; | ||||
|             const uint8_t ql = q[l]; | ||||
|             sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) | ||||
|                  + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) | ||||
|                  + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) | ||||
|                  + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); | ||||
|         } | ||||
|         tmp += sum; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
|     __syncthreads(); | ||||
| @@ -648,22 +811,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float | ||||
|         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); | ||||
|     } | ||||
|  | ||||
|     if (tid == 0) { | ||||
|     if (threadIdx.x == 0) { | ||||
|         dst[row] = tmp; | ||||
|     } | ||||
| } | ||||
|  | ||||
| static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { | ||||
|  | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     const int row = blockIdx.y*blockDim.y + threadIdx.y; | ||||
|     if (row > nrows) return; | ||||
|     const int num_blocks_per_row = ncols / QK_K; | ||||
|     const int ib0 = row*num_blocks_per_row; | ||||
|  | ||||
|     const block_q4_K * x = (const block_q4_K *)vx + ib0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16 | ||||
|     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0,1 | ||||
|  | ||||
| @@ -683,8 +849,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float | ||||
|     uint16_t aux[4]; | ||||
|     const uint8_t * sc = (const uint8_t *)aux; | ||||
|  | ||||
|     const block_q4_K * x = (const block_q4_K *)vx + ib0; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { | ||||
| @@ -713,6 +877,36 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float | ||||
|         tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 | ||||
|     const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION); | ||||
|  | ||||
|     const int step = tid * K_QUANTS_PER_ITERATION; | ||||
|  | ||||
|     uint16_t aux16[2]; | ||||
|     const uint8_t * s = (const uint8_t *)aux16; | ||||
|  | ||||
|     float tmp = 0; | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|         const uint8_t * q = x[i].qs + step; | ||||
|         const float   * y = yy + i*QK_K + step; | ||||
|         const uint16_t * a = (const uint16_t *)x[i].scales; | ||||
|         aux16[0] = a[0] & 0x0f0f; | ||||
|         aux16[1] = (a[0] >> 4) & 0x0f0f; | ||||
|         const float d = (float)x[i].d[0]; | ||||
|         const float m = (float)x[i].d[1]; | ||||
|         float sum = 0.f; | ||||
|         for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { | ||||
|             sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) | ||||
|                  + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) | ||||
|                  + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3]) | ||||
|                  + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]); | ||||
|         } | ||||
|         tmp += sum; | ||||
|     } | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
|     __syncthreads(); | ||||
| @@ -728,15 +922,19 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float | ||||
|  | ||||
| static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) { | ||||
|  | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     //const int row = blockIdx.x*blockDim.y + threadIdx.y; | ||||
|     const int row = blockIdx.x; | ||||
|     const int num_blocks_per_row = ncols / QK_K; | ||||
|     const int ib0 = row*num_blocks_per_row; | ||||
|  | ||||
|     const block_q5_K * x = (const block_q5_K *)vx + ib0; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     const int tid = threadIdx.x/2;  // 0...15 | ||||
|     const int ix  = threadIdx.x%2; | ||||
|  | ||||
| @@ -757,10 +955,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float | ||||
|     uint16_t aux[4]; | ||||
|     const uint8_t * sc = (const uint8_t *)aux; | ||||
|  | ||||
|     const block_q5_K * x = (const block_q5_K *)vx + ib0; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2) { | ||||
|  | ||||
|         const uint8_t * ql1 = x[i].qs + q_offset; | ||||
| @@ -793,9 +987,32 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float | ||||
|                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; | ||||
|         } | ||||
|         tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; | ||||
|  | ||||
|     } | ||||
|  | ||||
| #else | ||||
|     const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 | ||||
|     const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION); | ||||
|     const int step = tid * K_QUANTS_PER_ITERATION; | ||||
|     const int im = step/8; | ||||
|     const int in = step%8; | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|         const uint8_t * q = x[i].qs + step; | ||||
|         const int8_t  * s = x[i].scales; | ||||
|         const float   * y = yy + i*QK_K + step; | ||||
|         const float     d = x[i].d; | ||||
|         float sum = 0.f; | ||||
|         for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { | ||||
|             const uint8_t h = x[i].qh[in+j] >> im; | ||||
|             sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) | ||||
|                  + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) | ||||
|                  + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16)) | ||||
|                  + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16)); | ||||
|         } | ||||
|         tmp += sum; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
|     __syncthreads(); | ||||
| #pragma unroll | ||||
| @@ -803,7 +1020,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float | ||||
|         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); | ||||
|     } | ||||
|  | ||||
|     if (tid == 0) { | ||||
|     if (threadIdx.x == 0) { | ||||
|         dst[row] = tmp; | ||||
|     } | ||||
| } | ||||
| @@ -820,6 +1037,8 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float | ||||
|  | ||||
|     const block_q6_K * x = (const block_q6_K *)vx + ib0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16 | ||||
|     const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1 | ||||
|  | ||||
| @@ -874,6 +1093,37 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float | ||||
|  | ||||
|     } | ||||
|  | ||||
| #else | ||||
|  | ||||
|     const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7 | ||||
|     const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3 | ||||
|  | ||||
|     const int step = tid * K_QUANTS_PER_ITERATION; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|  | ||||
|         const float   * y  = yy + i * QK_K + step; | ||||
|         const uint8_t * ql = x[i].ql + step; | ||||
|         const uint8_t * qh = x[i].qh + step; | ||||
|         const int8_t  * s  = x[i].scales; | ||||
|  | ||||
|         const float d = x[i+0].d; | ||||
|  | ||||
|         float sum = 0; | ||||
|         for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { | ||||
|             sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) | ||||
|                  + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) | ||||
|                  + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32) | ||||
|                  + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32); | ||||
|         } | ||||
|         tmp += sum; | ||||
|  | ||||
|     } | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
|     __syncthreads(); | ||||
| #pragma unroll | ||||
| @@ -1252,12 +1502,20 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, const int k, cu | ||||
|  | ||||
| static void dequantize_row_q2_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y); | ||||
| #else | ||||
|     dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static void dequantize_row_q3_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y); | ||||
| #else | ||||
|     dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { | ||||
| @@ -1267,12 +1525,20 @@ static void dequantize_row_q4_K_cuda(const void * vx, float * y, const int k, cu | ||||
|  | ||||
| static void dequantize_row_q5_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y); | ||||
| #else | ||||
|     dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y); | ||||
| #else | ||||
|     dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { | ||||
|   | ||||
							
								
								
									
										66
									
								
								ggml-metal.m
									
									
									
									
									
								
							
							
						
						
									
										66
									
								
								ggml-metal.m
									
									
									
									
									
								
							| @@ -51,21 +51,21 @@ struct ggml_metal_context { | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_f16); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q4_0); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q4_1); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q2_k); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q3_k); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q4_k); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q5_k); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q6_k); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q2_K); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q3_K); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q4_K); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q5_K); | ||||
|     GGML_METAL_DECL_KERNEL(get_rows_q6_K); | ||||
|     GGML_METAL_DECL_KERNEL(rms_norm); | ||||
|     GGML_METAL_DECL_KERNEL(norm); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); | ||||
|     GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); | ||||
|     GGML_METAL_DECL_KERNEL(rope); | ||||
|     GGML_METAL_DECL_KERNEL(alibi_f32); | ||||
|     GGML_METAL_DECL_KERNEL(cpy_f32_f16); | ||||
| @@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) { | ||||
|             exit(1); | ||||
|         } | ||||
|  | ||||
| #ifdef GGML_QKK_64 | ||||
|         MTLCompileOptions* options = [MTLCompileOptions new]; | ||||
|         options.preprocessorMacros = @{ @"QK_K" : @(64) }; | ||||
|         ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error]; | ||||
| #else | ||||
|         ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; | ||||
| #endif | ||||
|         if (error) { | ||||
|             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); | ||||
|             exit(1); | ||||
| @@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) { | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_f16); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q4_0); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q4_1); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q2_k); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q3_k); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q4_k); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q5_k); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q6_k); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q2_K); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q3_K); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q4_K); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q5_K); | ||||
|         GGML_METAL_ADD_KERNEL(get_rows_q6_K); | ||||
|         GGML_METAL_ADD_KERNEL(rms_norm); | ||||
|         GGML_METAL_ADD_KERNEL(norm); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); | ||||
|         GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); | ||||
|         GGML_METAL_ADD_KERNEL(rope); | ||||
|         GGML_METAL_ADD_KERNEL(alibi_f32); | ||||
|         GGML_METAL_ADD_KERNEL(cpy_f32_f16); | ||||
| @@ -662,7 +668,7 @@ void ggml_metal_graph_compute( | ||||
|  | ||||
|                                             nth0 = 4; | ||||
|                                             nth1 = 16; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32]; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; | ||||
|                                         } break; | ||||
|                                     case GGML_TYPE_Q3_K: | ||||
|                                         { | ||||
| @@ -671,7 +677,7 @@ void ggml_metal_graph_compute( | ||||
|  | ||||
|                                             nth0 = 4; | ||||
|                                             nth1 = 16; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32]; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32]; | ||||
|                                         } break; | ||||
|                                     case GGML_TYPE_Q4_K: | ||||
|                                         { | ||||
| @@ -680,7 +686,7 @@ void ggml_metal_graph_compute( | ||||
|  | ||||
|                                             nth0 = 4; | ||||
|                                             nth1 = 16; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32]; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; | ||||
|                                         } break; | ||||
|                                     case GGML_TYPE_Q5_K: | ||||
|                                         { | ||||
| @@ -689,7 +695,7 @@ void ggml_metal_graph_compute( | ||||
|  | ||||
|                                             nth0 = 4; | ||||
|                                             nth1 = 16; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32]; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32]; | ||||
|                                         } break; | ||||
|                                     case GGML_TYPE_Q6_K: | ||||
|                                         { | ||||
| @@ -698,7 +704,7 @@ void ggml_metal_graph_compute( | ||||
|  | ||||
|                                             nth0 = 4; | ||||
|                                             nth1 = 16; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32]; | ||||
|                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; | ||||
|                                         } break; | ||||
|                                     default: | ||||
|                                         { | ||||
| @@ -750,11 +756,11 @@ void ggml_metal_graph_compute( | ||||
|                                 case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; | ||||
|                                 case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; | ||||
|                                 case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break; | ||||
|                                 case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break; | ||||
|                                 case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break; | ||||
|                                 case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break; | ||||
|                                 case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break; | ||||
|                                 case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break; | ||||
|                                 case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break; | ||||
|                                 case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break; | ||||
|                                 case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break; | ||||
|                                 case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break; | ||||
|                                 case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; | ||||
|                                 default: GGML_ASSERT(false && "not implemented"); | ||||
|                             } | ||||
|  | ||||
|   | ||||
							
								
								
									
										414
									
								
								ggml-metal.metal
									
									
									
									
									
								
							
							
						
						
									
										414
									
								
								ggml-metal.metal
									
									
									
									
									
								
							| @@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32( | ||||
|     } | ||||
|     threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|     if (ith == 0) { | ||||
|         for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; | ||||
|         for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; | ||||
|         dst[r1*ne0 + r0] = sum[0]; | ||||
|     } | ||||
| } | ||||
| @@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32( | ||||
|     } | ||||
|     threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|     if (ith == 0) { | ||||
|         for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; | ||||
|         for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; | ||||
|         dst[r1*ne0 + r0] = sum[0]; | ||||
|     } | ||||
| } | ||||
| @@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32( | ||||
|  | ||||
| //============================================ k-quants ====================================================== | ||||
|  | ||||
| #ifndef QK_K | ||||
| #define QK_K 256 | ||||
| #else | ||||
| static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64"); | ||||
| #endif | ||||
|  | ||||
| #if QK_K == 256 | ||||
| #define K_SCALE_SIZE 12 | ||||
| #else | ||||
| #define K_SCALE_SIZE 4 | ||||
| #endif | ||||
|  | ||||
| typedef struct { | ||||
|     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits | ||||
|     uint8_t qs[QK_K/4];      // quants | ||||
|     half d;           // super-block scale for quantized scales | ||||
|     half dmin;        // super-block scale for quantized mins | ||||
| } block_q2_k; | ||||
| } block_q2_K; | ||||
| // 84 bytes / block when QK_K == 256 (24 bytes / block when QK_K == 64) | ||||
|  | ||||
| typedef struct { | ||||
|     uint8_t hmask[QK_K/8];     // quants - high bit | ||||
|     uint8_t qs[QK_K/4];        // quants - low 2 bits | ||||
|     uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits | ||||
|     half d;                    // super-block scale | ||||
| } block_q3_k; | ||||
| // 110 bytes / block | ||||
| #if QK_K == 64 | ||||
|     uint8_t scales[2]; | ||||
| #else | ||||
|     uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits | ||||
| #endif | ||||
|     half d;             // super-block scale | ||||
| } block_q3_K; | ||||
|  | ||||
| #if QK_K == 64 | ||||
| typedef struct { | ||||
|     half    d[2];          // super-block scales/mins | ||||
|     uint8_t scales[2]; | ||||
|     uint8_t qs[QK_K/2];    // 4-bit quants | ||||
| } block_q4_K; | ||||
| #else | ||||
| typedef struct { | ||||
|     half d;             // super-block scale for quantized scales | ||||
|     half dmin;          // super-block scale for quantized mins | ||||
|     uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits | ||||
|     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits | ||||
|     uint8_t qs[QK_K/2];        // 4--bit quants | ||||
| } block_q4_k; | ||||
| // 144 bytes / block | ||||
| } block_q4_K; | ||||
| #endif | ||||
|  | ||||
| #if QK_K == 64 | ||||
| typedef struct { | ||||
|     half  d;                     // super-block scales/mins | ||||
|     int8_t  scales[QK_K/16];     // 8-bit block scales | ||||
|     uint8_t qh[QK_K/8];          // quants, high bit | ||||
|     uint8_t qs[QK_K/2];          // quants, low 4 bits | ||||
| } block_q5_K; | ||||
| #else | ||||
| typedef struct { | ||||
|     half d;                      // super-block scale for quantized scales | ||||
|     half dmin;                   // super-block scale for quantized mins | ||||
|     uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits | ||||
|     uint8_t qh[QK_K/8];          // quants, high bit | ||||
|     uint8_t qs[QK_K/2];          // quants, low 4 bits | ||||
| } block_q5_k; | ||||
| } block_q5_K; | ||||
| // 176 bytes / block | ||||
| #endif | ||||
|  | ||||
| typedef struct { | ||||
|     uint8_t ql[QK_K/2];      // quants, lower 4 bits | ||||
|     uint8_t qh[QK_K/4];      // quants, upper 2 bits | ||||
|     int8_t  scales[QK_K/16]; // scales, quantized with 8 bits | ||||
|     half d;                  // super-block scale | ||||
| } block_q6_k; | ||||
| } block_q6_K; | ||||
| // 210 bytes / block when QK_K == 256 (54 bytes / block when QK_K == 64) | ||||
|  | ||||
| static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { | ||||
| @@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { | ||||
|  | ||||
| //========================================== dequantization ============================= | ||||
|  | ||||
| static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) { | ||||
| static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) { | ||||
|     assert(k % QK_K == 0); | ||||
|     const int nb = k / QK_K; | ||||
|  | ||||
| @@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|         int is = 0; | ||||
|         float dl, ml; | ||||
|         for (int n = 0; n < QK_K; n += 128) { | ||||
| @@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i | ||||
|             } | ||||
|             q += 32; | ||||
|         } | ||||
| #else | ||||
|         float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); | ||||
|         float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); | ||||
|         float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); | ||||
|         float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); | ||||
|         for (int l = 0; l < 16; ++l) { | ||||
|             y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1; | ||||
|             y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2; | ||||
|             y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3; | ||||
|             y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4; | ||||
|         } | ||||
|         y += QK_K; | ||||
| #endif | ||||
|  | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) { | ||||
| static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) { | ||||
|     assert(k % QK_K == 0); | ||||
|     const int nb = k / QK_K; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const uint16_t kmask1 = 0x0303; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|  | ||||
| @@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i | ||||
|             } | ||||
|             q += 32; | ||||
|         } | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     for (int i = 0; i < nb; i++) { | ||||
|  | ||||
|         const float d_all = (float)(x[i].d); | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs; | ||||
|         device const uint8_t * hm = x[i].hmask; | ||||
|  | ||||
|         const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); | ||||
|         const float d2 = d_all * ((x[i].scales[0] >>  4) - 8); | ||||
|         const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); | ||||
|         const float d4 = d_all * ((x[i].scales[1] >>  4) - 8); | ||||
|  | ||||
|         for (int l = 0; l < 8; ++l) { | ||||
|             uint8_t h = hm[l]; | ||||
|             y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); | ||||
|             y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); | ||||
|             y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); | ||||
|             y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); | ||||
|             y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); | ||||
|             y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); | ||||
|             y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); | ||||
|             y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); | ||||
|         } | ||||
|         y += QK_K; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) { | ||||
| static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) { | ||||
|     assert(k % QK_K == 0); | ||||
|     const int nb = k / QK_K; | ||||
|  | ||||
|  | ||||
|     for (int i = 0; i < nb; i++) { | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|         const float d = x[i].d; | ||||
|         const float min = x[i].dmin; | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs; | ||||
|         device const uint8_t * scales = x[i].scales; | ||||
|  | ||||
|         int is = 0; | ||||
| @@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i | ||||
|             for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l]  >> 4) - m2; | ||||
|             q += 32; is += 2; | ||||
|         } | ||||
| #else | ||||
|         device const uint8_t * s = x[i].scales; | ||||
|         device const half2 * dh = (device const half2 *)x[i].d; | ||||
|         const float2 d = (float2)dh[0]; | ||||
|         const float d1 = d[0] * (s[0] & 0xF); | ||||
|         const float d2 = d[0] * (s[1] & 0xF); | ||||
|         const float m1 = d[1] * (s[0] >>  4); | ||||
|         const float m2 = d[1] * (s[1] >>  4); | ||||
|         for (int l = 0; l < 32; ++l) { | ||||
|             y[l+ 0] = d1 * (q[l] & 0xF) - m1; | ||||
|             y[l+32] = d2 * (q[l] >>  4) - m2; | ||||
|         } | ||||
|         y += QK_K; | ||||
| #endif | ||||
|  | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) { | ||||
| static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) { | ||||
|     assert(k % QK_K == 0); | ||||
|     const int nb = k / QK_K; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|    for (int i = 0; i < nb; i++) { | ||||
|  | ||||
|         const float d = (float)(x[i].d); | ||||
| @@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i | ||||
|             u1 <<= 2; u2 <<= 2; | ||||
|         } | ||||
|     } | ||||
| #else | ||||
|     for (int i = 0; i < nb; i++) { | ||||
|  | ||||
|         const float d = (float)x[i].d; | ||||
|  | ||||
|         device const uint8_t * ql = x[i].qs; | ||||
|         device const uint8_t * qh = x[i].qh; | ||||
|         device const int8_t  * sc = x[i].scales; | ||||
|  | ||||
|         for (int l = 0; l < 8; ++l) { | ||||
|             y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); | ||||
|             y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); | ||||
|             y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); | ||||
|             y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); | ||||
|             y[l+32] = d * sc[2] * ((ql[l+ 0] >>  4) - (qh[l] & 0x10 ? 0 : 16)); | ||||
|             y[l+40] = d * sc[2] * ((ql[l+ 8] >>  4) - (qh[l] & 0x20 ? 0 : 16)); | ||||
|             y[l+48] = d * sc[3] * ((ql[l+16] >>  4) - (qh[l] & 0x40 ? 0 : 16)); | ||||
|             y[l+56] = d * sc[3] * ((ql[l+24] >>  4) - (qh[l] & 0x80 ? 0 : 16)); | ||||
|         } | ||||
|         y += QK_K; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) { | ||||
| static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) { | ||||
|     assert(k % QK_K == 0); | ||||
|     const int nb = k / QK_K; | ||||
|  | ||||
| @@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i | ||||
|  | ||||
|         const float d = x[i].d; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|         for (int n = 0; n < QK_K; n += 128) { | ||||
|             for (int l = 0; l < 32; ++l) { | ||||
|                 int is = l/16; | ||||
| @@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i | ||||
|             qh += 32; | ||||
|             sc += 8; | ||||
|         } | ||||
| #else | ||||
|         for (int l = 0; l < 16; ++l) { | ||||
|             const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; | ||||
|             const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; | ||||
|             const int8_t q3 = (int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; | ||||
|             const int8_t q4 = (int8_t)((ql[l+16]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; | ||||
|             y[l+ 0] = d * sc[0] * q1; | ||||
|             y[l+16] = d * sc[1] * q2; | ||||
|             y[l+32] = d * sc[2] * q3; | ||||
|             y[l+48] = d * sc[3] * q4; | ||||
|         } | ||||
|         y  += 64; | ||||
| #endif | ||||
|     } | ||||
| } | ||||
|  | ||||
| kernel void kernel_get_rows_q2_k( | ||||
| kernel void kernel_get_rows_q2_K( | ||||
|         device const  void * src0, | ||||
|         device const   int * src1, | ||||
|         device       float * dst, | ||||
| @@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k( | ||||
|     const int i = tpig; | ||||
|     const int r = ((device int32_t *) src1)[i]; | ||||
|  | ||||
|     dequantize_row_q2_k( | ||||
|             (device const block_q2_k *) ((device char *) src0 + r*nb01), | ||||
|     dequantize_row_q2_K( | ||||
|             (device const block_q2_K *) ((device char *) src0 + r*nb01), | ||||
|                        (device float *) ((device char *)  dst + i*nb1), ne00); | ||||
| } | ||||
|  | ||||
| kernel void kernel_get_rows_q3_k( | ||||
| kernel void kernel_get_rows_q3_K( | ||||
|         device const  void * src0, | ||||
|         device const   int * src1, | ||||
|         device       float * dst, | ||||
| @@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k( | ||||
|     const int i = tpig; | ||||
|     const int r = ((device int32_t *) src1)[i]; | ||||
|  | ||||
|     dequantize_row_q3_k( | ||||
|             (device const block_q3_k *) ((device char *) src0 + r*nb01), | ||||
|     dequantize_row_q3_K( | ||||
|             (device const block_q3_K *) ((device char *) src0 + r*nb01), | ||||
|                        (device float *) ((device char *)  dst + i*nb1), ne00); | ||||
| } | ||||
|  | ||||
| kernel void kernel_get_rows_q4_k( | ||||
| kernel void kernel_get_rows_q4_K( | ||||
|         device const  void * src0, | ||||
|         device const   int * src1, | ||||
|         device       float * dst, | ||||
| @@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k( | ||||
|     const int i = tpig; | ||||
|     const int r = ((device int32_t *) src1)[i]; | ||||
|  | ||||
|     dequantize_row_q4_k( | ||||
|             (device const block_q4_k *) ((device char *) src0 + r*nb01), | ||||
|     dequantize_row_q4_K( | ||||
|             (device const block_q4_K *) ((device char *) src0 + r*nb01), | ||||
|                        (device float *) ((device char *)  dst + i*nb1), ne00); | ||||
| } | ||||
|  | ||||
| kernel void kernel_get_rows_q5_k( | ||||
| kernel void kernel_get_rows_q5_K( | ||||
|         device const  void * src0, | ||||
|         device const   int * src1, | ||||
|         device       float * dst, | ||||
| @@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k( | ||||
|     const int i = tpig; | ||||
|     const int r = ((device int32_t *) src1)[i]; | ||||
|  | ||||
|     dequantize_row_q5_k( | ||||
|             (device const block_q5_k *) ((device char *) src0 + r*nb01), | ||||
|     dequantize_row_q5_K( | ||||
|             (device const block_q5_K *) ((device char *) src0 + r*nb01), | ||||
|                        (device float *) ((device char *)  dst + i*nb1), ne00); | ||||
| } | ||||
|  | ||||
| kernel void kernel_get_rows_q6_k( | ||||
| kernel void kernel_get_rows_q6_K( | ||||
|         device const  void * src0, | ||||
|         device const   int * src1, | ||||
|         device       float * dst, | ||||
| @@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k( | ||||
|     const int i = tpig; | ||||
|     const int r = ((device int32_t *) src1)[i]; | ||||
|  | ||||
|     dequantize_row_q6_k( | ||||
|             (device const block_q6_k *) ((device char *) src0 + r*nb01), | ||||
|     dequantize_row_q6_K( | ||||
|             (device const block_q6_K *) ((device char *) src0 + r*nb01), | ||||
|                        (device float *) ((device char *)  dst + i*nb1), ne00); | ||||
| } | ||||
|  | ||||
| //====================================== dot products ========================= | ||||
|  | ||||
| kernel void kernel_mul_mat_q2_k_f32( | ||||
| kernel void kernel_mul_mat_q2_K_f32( | ||||
|         device const  void * src0, | ||||
|         device const float * src1, | ||||
|         device       float * dst, | ||||
| @@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32( | ||||
|     const int64_t r0 = tgpig.x; | ||||
|     const int64_t r1 = tgpig.y; | ||||
|  | ||||
|     device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb; | ||||
|     device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb; | ||||
|     device const float     * yy = (device const float      *) src1 + r1*ne10; | ||||
|  | ||||
|     const int nth = tptg.x*tptg.y; | ||||
|     const int ith = tptg.y*tpitg.x + tpitg.y; | ||||
|  | ||||
|     float sumf = 0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const int tid = tpitg.y;    // 0...16 | ||||
|     const int il  = tid/4;      // 0...3 | ||||
|     const int ir  = tid%4;      // 0...3 | ||||
| @@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32( | ||||
|     const int y_offset = 64*il + n*ir; | ||||
|     const int q_offset = 32*ip + n*ir; | ||||
|  | ||||
|     sum[ith] = 0.0f; | ||||
|  | ||||
|     float sumf = 0; | ||||
|     for (int i = tpitg.x; i < nb; i += tptg.x) { | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs + q_offset; | ||||
| @@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32( | ||||
|  | ||||
|         device const float   * y = yy + i*QK_K + y_offset; | ||||
|  | ||||
|         //float4 s = {0.f, 0.f, 0.f, 0.f}; | ||||
|         float2 s = {0.f, 0.f}; | ||||
|         float smin = 0; | ||||
|         for (int l = 0; l < n; ++l) { | ||||
| @@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32( | ||||
|         sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     const int il = 4 * tpitg.x; | ||||
|  | ||||
|     uint32_t aux[2]; | ||||
|     thread const uint8_t * d = (thread const uint8_t *)aux; | ||||
|     thread const uint8_t * m = (thread const uint8_t *)aux + 4; | ||||
|  | ||||
|     for (int i = tpitg.y; i < nb; i += tptg.y) { | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs + il; | ||||
|         device const float   * y = yy + i*QK_K + il; | ||||
|  | ||||
|         const float dall = (float)x[i].d; | ||||
|         const float dmin = (float)x[i].dmin; | ||||
|  | ||||
|         device const uint32_t * a = (device const uint32_t *)x[i].scales; | ||||
|         aux[0] = a[0] & 0x0f0f0f0f; | ||||
|         aux[1] = (a[0] >> 4) & 0x0f0f0f0f; | ||||
|  | ||||
|         for (int l = 0; l < 4; ++l) { | ||||
|             sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0]) | ||||
|                   + y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1]) | ||||
|                   + y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2]) | ||||
|                   + y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]); | ||||
|         } | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     sum[ith] = sumf; | ||||
|  | ||||
|     //int mask1 = (ith%4 == 0); | ||||
|     //int mask2 = (ith%16 == 0); | ||||
|  | ||||
|     //threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|     //for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i]; | ||||
|     //threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|     //for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i]; | ||||
|     //threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|     //if (ith == 0) { | ||||
|     //    for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; | ||||
|     //    dst[r1*ne0 + r0] = sum[0]; | ||||
|     //} | ||||
|  | ||||
|     // | ||||
|     // Accumulate the sum from all threads in the threadgroup | ||||
|     // This version is slightly faster than the commented out one below, | ||||
|     // which I copy-pasted from ggerganov's q4_0 dot product for metal. | ||||
|     // | ||||
|     threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
|     if (ith%4 == 0) { | ||||
| @@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32( | ||||
|     } | ||||
| } | ||||
|  | ||||
| kernel void kernel_mul_mat_q3_k_f32( | ||||
| kernel void kernel_mul_mat_q3_K_f32( | ||||
|         device const  void * src0, | ||||
|         device const float * src1, | ||||
|         device       float * dst, | ||||
| @@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32( | ||||
|         uint2 tpitg[[thread_position_in_threadgroup]], | ||||
|         uint2  tptg[[threads_per_threadgroup]]) { | ||||
|  | ||||
|     const uint16_t kmask1 = 0x0303; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|  | ||||
|     const uint8_t m3 = 3; | ||||
|     const int8_t  m4 = 4; | ||||
|  | ||||
|     const int nb = ne00/QK_K; | ||||
|  | ||||
|     const int64_t r0 = tgpig.x; | ||||
|     const int64_t r1 = tgpig.y; | ||||
|  | ||||
|     device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb; | ||||
|     device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb; | ||||
|     device const float     * yy = (device const float      *) src1 + r1*ne10; | ||||
|  | ||||
|     const int nth = tptg.x*tptg.y; | ||||
|     const int ith = tptg.y*tpitg.x + tpitg.y; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const uint8_t m3 = 3; | ||||
|     const int8_t  m4 = 4; | ||||
|  | ||||
|     const uint16_t kmask1 = 0x0303; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|  | ||||
|     const int tid = tpitg.y;        // expecting 16 | ||||
|     const int ip  = tid/8;          // 0 or 1 | ||||
|     const int il  = tid/2 - 4*ip;   // 0...3 | ||||
| @@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32( | ||||
|  | ||||
|     //sum[ith] = sumf; | ||||
|     sum[ith] = sumf1 - 32.f*sumf2; | ||||
| #else | ||||
|     const int il = 4 * tpitg.x;  // 0, 4, 8, 12 | ||||
|     const int im = il/8;         // 0, 0, 1, 1 | ||||
|     const int in = il%8;         // 0, 4, 0, 4 | ||||
|  | ||||
|     float sumf = 0; | ||||
|  | ||||
|     for (int i = tpitg.y; i < nb; i += tptg.y) { | ||||
|  | ||||
|         const float d_all = (float)(x[i].d); | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs + il; | ||||
|         device const uint8_t * h = x[i].hmask + in; | ||||
|         device const float   * y = yy + i * QK_K + il; | ||||
|  | ||||
|         const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); | ||||
|         const float d2 = d_all * ((x[i].scales[0] >>  4) - 8); | ||||
|         const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); | ||||
|         const float d4 = d_all * ((x[i].scales[1] >>  4) - 8); | ||||
|  | ||||
|         for (int l = 0; l < 4; ++l) { | ||||
|             const uint8_t hm = h[l] >> im; | ||||
|             sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4)) | ||||
|                   + y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4)) | ||||
|                   + y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4)) | ||||
|                   + y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4)); | ||||
|         } | ||||
|  | ||||
|     } | ||||
|  | ||||
|     sum[ith] = sumf; | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     // | ||||
|     // Accumulate the sum from all threads in the threadgroup | ||||
| @@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32( | ||||
|  | ||||
| } | ||||
|  | ||||
| kernel void kernel_mul_mat_q4_k_f32( | ||||
| kernel void kernel_mul_mat_q4_K_f32( | ||||
|         device const  void * src0, | ||||
|         device const float * src1, | ||||
|         device       float * dst, | ||||
| @@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32( | ||||
|         uint2 tpitg[[thread_position_in_threadgroup]], | ||||
|         uint2  tptg[[threads_per_threadgroup]]) { | ||||
|  | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     const int nb = ne00/QK_K; | ||||
|  | ||||
|     const int64_t r0 = tgpig.x; | ||||
|     const int64_t r1 = tgpig.y; | ||||
|  | ||||
|     device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb; | ||||
|     device const float     * yy = (device const float      *) src1 + r1*ne10; | ||||
|  | ||||
|     const int nth = tptg.x*tptg.y; | ||||
|     const int ith = tptg.y*tpitg.x + tpitg.y; | ||||
|  | ||||
|     device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb; | ||||
|     device const float     * yy = (device const float      *) src1 + r1*ne10; | ||||
|  | ||||
|     float sumf = 0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     const int tid = tpitg.y;   // 0...16 | ||||
|     const int il  = tid/4;     // 0...3 | ||||
|     const int ir  = tid - 4*il;// 0...3 | ||||
| @@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32( | ||||
|     const int q_offset = 32*im + l0; | ||||
|     const int y_offset = 64*im + l0; | ||||
|  | ||||
|     sum[ith] = 0.0f; | ||||
|  | ||||
|     uchar2 sc1, sc2, sc3, sc4; | ||||
|  | ||||
|     float sumf = 0; | ||||
|     for (int i = tpitg.x; i < nb; i += tptg.x) { | ||||
|  | ||||
|         device const uint8_t * q1 = (x + i)->qs + q_offset; | ||||
| @@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32( | ||||
|         sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     uint16_t aux16[2]; | ||||
|     thread const uint8_t * scales = (thread const uint8_t *)aux16; | ||||
|  | ||||
|     const int il  = 4*tpitg.x; | ||||
|  | ||||
|     for (int i = tpitg.y; i < nb; i += tptg.y) { | ||||
|  | ||||
|         device const uint8_t * q = x[i].qs + il; | ||||
|         device const float   * y = yy + i * QK_K + il; | ||||
|  | ||||
|         const float d = (float)x[i].d[0]; | ||||
|         const float m = (float)x[i].d[1]; | ||||
|  | ||||
|         device const uint16_t * a = (device const uint16_t *)x[i].scales; | ||||
|         aux16[0] = a[0] & 0x0f0f; | ||||
|         aux16[1] = (a[0] >> 4) & 0x0f0f; | ||||
|  | ||||
|         for (int l = 0; l < 4; ++l) { | ||||
|             sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16]) | ||||
|                   + d * scales[1] * (y[l+32] * (q[l] >>  4) + y[l+48] * (q[l+16] >>  4)) - m * scales[3] * (y[l+32] + y[l+48]); | ||||
|         } | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     sum[ith] = sumf; | ||||
|  | ||||
| @@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32( | ||||
|     //} | ||||
| } | ||||
|  | ||||
| kernel void kernel_mul_mat_q5_k_f32( | ||||
| kernel void kernel_mul_mat_q5_K_f32( | ||||
|         device const  void * src0, | ||||
|         device const float * src1, | ||||
|         device       float * dst, | ||||
| @@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32( | ||||
|         uint2 tpitg[[thread_position_in_threadgroup]], | ||||
|         uint2  tptg[[threads_per_threadgroup]]) { | ||||
|  | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     const int nb = ne00/QK_K; | ||||
|  | ||||
|     const int64_t r0 = tgpig.x; | ||||
|     const int64_t r1 = tgpig.y; | ||||
|  | ||||
|     device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb; | ||||
|     device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb; | ||||
|     device const float     * yy = (device const float      *) src1 + r1*ne10; | ||||
|  | ||||
|     const int nth = tptg.x*tptg.y; | ||||
|     const int ith = tptg.y*tpitg.x + tpitg.y; | ||||
|  | ||||
|     float sumf = 0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
|  | ||||
|     const int tid = tpitg.y;   // 0...16 | ||||
|     const int il  = tid/4;     // 0...3 | ||||
|     const int ir  = tid - 4*il;// 0...3 | ||||
| @@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32( | ||||
|  | ||||
|     uchar2 sc1, sc2, sc3, sc4; | ||||
|  | ||||
|     float sumf = 0; | ||||
|     for (int i = tpitg.x; i < nb; i += tptg.x) { | ||||
|  | ||||
|         device const uint8_t * q1 = (x + i)->qs + q_offset; | ||||
| @@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32( | ||||
|         sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     const int il  = 4 * tpitg.x;  // 0, 4, 8, 12 | ||||
|     const int im  = il/8;         // 0, 0, 1, 1 | ||||
|     const int in  = il%8;         // 0, 4, 0, 4 | ||||
|  | ||||
|     for (int i = tpitg.y; i < nb; i += tptg.y) { | ||||
|  | ||||
|         const float d = (float)x[i].d; | ||||
|         device const uint8_t * q = x[i].qs + il; | ||||
|         device const uint8_t * h = x[i].qh + in; | ||||
|         device const int8_t  * s = x[i].scales; | ||||
|         device const float   * y = yy + i*QK_K + il; | ||||
|  | ||||
|         for (int l = 0; l < 4; ++l) { | ||||
|             const uint8_t hl = h[l] >> im; | ||||
|             sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16)) | ||||
|                   + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16)) | ||||
|                   + y[l+32] * d * s[2] * ((q[l+ 0] >>  4) - (hl & 0x10 ? 0 : 16)) | ||||
|                   + y[l+48] * d * s[3] * ((q[l+16] >>  4) - (hl & 0x40 ? 0 : 16)); | ||||
|         } | ||||
|     } | ||||
| #endif | ||||
|     sum[ith] = sumf; | ||||
|  | ||||
|     // | ||||
| @@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32( | ||||
|  | ||||
| } | ||||
|  | ||||
| kernel void kernel_mul_mat_q6_k_f32( | ||||
| kernel void kernel_mul_mat_q6_K_f32( | ||||
|         device const  void * src0, | ||||
|         device const float * src1, | ||||
|         device       float * dst, | ||||
| @@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32( | ||||
|     const int64_t r0 = tgpig.x; | ||||
|     const int64_t r1 = tgpig.y; | ||||
|  | ||||
|     device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb; | ||||
|     device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb; | ||||
|     device const float     * yy = (device const float      *) src1 + r1*ne10; | ||||
|  | ||||
|     const int nth = tptg.x*tptg.y; | ||||
|     const int ith = tptg.y*tpitg.x + tpitg.y; | ||||
|  | ||||
|     float sumf = 0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     // Note: we absolutely assume that tptg.y = 16 and QK_K = 256! | ||||
|     const int iqs  = 16 * tpitg.y; | ||||
|     const int ip   = iqs / 128;         // 0 or 1 | ||||
| @@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32( | ||||
|     const int q_offset_l = 64*ip + l0; | ||||
|     const int q_offset_h = 32*ip + l0; | ||||
|  | ||||
|     float sumf = 0; | ||||
|     for (int i = tpitg.x; i < nb; i += tptg.x) { | ||||
|  | ||||
|         device const uint8_t * ql = x[i].ql + q_offset_l; | ||||
| @@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32( | ||||
|         sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]); | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     const int il  = 4*tpitg.x;    // 0, 4, 8, 12 | ||||
|  | ||||
|     for (int i = tpitg.y; i < nb; i += tptg.y) { | ||||
|         device const float * y = yy + i * QK_K + il; | ||||
|         device const uint8_t * ql = x[i].ql + il; | ||||
|         device const uint8_t * qh = x[i].qh + il; | ||||
|         device const int8_t  * s  = x[i].scales; | ||||
|  | ||||
|         const float d = x[i].d; | ||||
|  | ||||
|         float4 sums = {0.f, 0.f, 0.f, 0.f}; | ||||
|         for (int l = 0; l < 4; ++l) { | ||||
|             sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32); | ||||
|             sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32); | ||||
|             sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32); | ||||
|             sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32); | ||||
|         } | ||||
|         sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]); | ||||
|     } | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     sum[ith] = sumf; | ||||
|  | ||||
|   | ||||
							
								
								
									
										1140
									
								
								k_quants.c
									
									
									
									
									
								
							
							
						
						
									
										1140
									
								
								k_quants.c
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										51
									
								
								k_quants.h
									
									
									
									
									
								
							
							
						
						
									
										51
									
								
								k_quants.h
									
									
									
									
									
								
							| @@ -7,7 +7,13 @@ | ||||
| #include <stddef.h> | ||||
|  | ||||
| // Super-block size | ||||
| #ifdef GGML_QKK_64 | ||||
| #define QK_K 64 | ||||
| #define K_SCALE_SIZE 4 | ||||
| #else | ||||
| #define QK_K 256 | ||||
| #define K_SCALE_SIZE 12 | ||||
| #endif | ||||
|  | ||||
| // | ||||
| // Super-block quantization structures | ||||
| @@ -29,38 +35,67 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w | ||||
| // weight is represented as x = a * q | ||||
| // 16 blocks of 16 elements each | ||||
| // Effectively 3.4375 bits per weight | ||||
| #ifdef GGML_QKK_64 | ||||
| typedef struct { | ||||
|     uint8_t hmask[QK_K/8];     // quants - high bit | ||||
|     uint8_t qs[QK_K/4];        // quants - low 2 bits | ||||
|     uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits | ||||
|     uint8_t scales[2]; | ||||
|     ggml_fp16_t d;             // super-block scale | ||||
| } block_q3_K; | ||||
| static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding"); | ||||
| static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); | ||||
| #else | ||||
| typedef struct { | ||||
|     uint8_t hmask[QK_K/8];     // quants - high bit | ||||
|     uint8_t qs[QK_K/4];        // quants - low 2 bits | ||||
|     uint8_t scales[12];        // scales, quantized with 6 bits | ||||
|     ggml_fp16_t d;             // super-block scale | ||||
| } block_q3_K; | ||||
| static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); | ||||
| #endif | ||||
|  | ||||
| // 4-bit quantization | ||||
| // 8 blocks of 32 elements each (for QK_K = 256) | ||||
| // weight is represented as x = a * q + b | ||||
| // Effectively 4.5 bits per weight | ||||
| #ifdef GGML_QKK_64 | ||||
| typedef struct { | ||||
|     ggml_fp16_t d[2];          // super-block scales/mins | ||||
|     uint8_t scales[2];         // 4-bit block scales/mins | ||||
|     uint8_t qs[QK_K/2];        // 4-bit quants | ||||
| } block_q4_K; | ||||
| static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); | ||||
| #else | ||||
| typedef struct { | ||||
|     ggml_fp16_t d;             // super-block scale for quantized scales | ||||
|     ggml_fp16_t dmin;          // super-block scale for quantized mins | ||||
|     uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits | ||||
|     uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits | ||||
|     uint8_t qs[QK_K/2];        // 4-bit quants | ||||
| } block_q4_K; | ||||
| static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); | ||||
| static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); | ||||
| #endif | ||||
|  | ||||
| // 5-bit quantization | ||||
| // 8 blocks of 32 elements each (for QK_K = 256) | ||||
| // weight is represented as x = a * q + b | ||||
| // Effectively 5.5 bits per weight | ||||
| #ifdef GGML_QKK_64 | ||||
| typedef struct { | ||||
|     ggml_fp16_t d;               // super-block scale for quantized scales | ||||
|     ggml_fp16_t dmin;            // super-block scale for quantized mins | ||||
|     uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits | ||||
|     ggml_fp16_t d;               // super-block scale | ||||
|     int8_t  scales[QK_K/16];     // 8-bit block scales | ||||
|     uint8_t qh[QK_K/8];          // quants, high bit | ||||
|     uint8_t qs[QK_K/2];          // quants, low 4 bits | ||||
| } block_q5_K; | ||||
| static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); | ||||
| static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); | ||||
| #else | ||||
| typedef struct { | ||||
|     ggml_fp16_t d;               // super-block scale for quantized scales | ||||
|     ggml_fp16_t dmin;            // super-block scale for quantized mins | ||||
|     uint8_t scales[K_SCALE_SIZE];   // scales and mins, quantized with 6 bits | ||||
|     uint8_t qh[QK_K/8];          // quants, high bit | ||||
|     uint8_t qs[QK_K/2];          // quants, low 4 bits | ||||
| } block_q5_K; | ||||
| static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); | ||||
| #endif | ||||
|  | ||||
| // 6-bit quantization | ||||
| // weight is represented as x = a * q | ||||
|   | ||||
							
								
								
									
										17
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -21,9 +21,13 @@ | ||||
| #endif | ||||
| #ifdef GGML_USE_K_QUANTS | ||||
| #ifndef QK_K | ||||
| #ifdef GGML_QKK_64 | ||||
| #define QK_K 64 | ||||
| #else | ||||
| #define QK_K 256 | ||||
| #endif | ||||
| #endif | ||||
| #endif | ||||
|  | ||||
| #include <array> | ||||
| #include <ctime> | ||||
| @@ -2470,6 +2474,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||
|     std::vector<std::thread> workers; | ||||
|     std::mutex mutex; | ||||
|  | ||||
|     auto use_more_bits = [] (int i_layer, int num_layers) -> bool { | ||||
|         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2; | ||||
|     }; | ||||
|  | ||||
|     size_t idx = 0; | ||||
|     for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) { | ||||
|         llama_buffer read_data; | ||||
| @@ -2524,15 +2532,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||
|                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; | ||||
|                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; | ||||
|                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && | ||||
|                          (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 || | ||||
|                          (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K; | ||||
|                         use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K; | ||||
|                 else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && | ||||
|                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; | ||||
|                 ++i_attention_wv; | ||||
|             } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) { | ||||
|                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; | ||||
|                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; | ||||
|                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && | ||||
|                          (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 || | ||||
|                          (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K; | ||||
|                          use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; | ||||
|                 //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K; | ||||
|                 ++i_feed_forward_w2; | ||||
|             } else if (tensor.name.find("attention.wo.weight") != std::string::npos) { | ||||
|                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Kawrakow
					Kawrakow