mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml : drop support for QK_K=64 (#7473)
* ggml : drop support for QK_K=64 ggml-ci * opencl : restore QK_K=256 define
This commit is contained in:
		
							
								
								
									
										472
									
								
								ggml-sycl.cpp
									
									
									
									
									
								
							
							
						
						
									
										472
									
								
								ggml-sycl.cpp
									
									
									
									
									
								
							| @@ -4197,7 +4197,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri | ||||
|     const block_q2_K * x = (const block_q2_K *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int n   = tid/32; | ||||
|     const int l   = tid - 32*n; | ||||
|     const int is  = 8*n + l/16; | ||||
| @@ -4211,18 +4210,6 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri | ||||
|     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); | ||||
|     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); | ||||
|     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); | ||||
| #else | ||||
|     const int is = tid/16;  // 0 or 1 | ||||
|     const int il = tid%16;  // 0...15 | ||||
|     const uint8_t q = x[i].qs[il] >> (2*is); | ||||
|     dst_t * y = yy + i*QK_K + 16*is + il; | ||||
|  | ||||
|     float dall = x[i].dm[0]; | ||||
|     float dmin = x[i].dm[1]; | ||||
|     y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); | ||||
|     y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| template<typename dst_t> | ||||
| @@ -4232,7 +4219,6 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri | ||||
|     const int i = item_ct1.get_group(2); | ||||
|     const block_q3_K * x = (const block_q3_K *) vx; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const int r = item_ct1.get_local_id(2) / 4; | ||||
|     const int tid = r/2; | ||||
|     const int is0 = r%2; | ||||
| @@ -4256,31 +4242,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri | ||||
|     const uint8_t * hm = x[i].hmask; | ||||
|  | ||||
|     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); | ||||
| #else | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
|     const int is  = tid/16;  // 0 or 1 | ||||
|     const int il  = tid%16;  // 0...15 | ||||
|     const int im  = il/8;    // 0...1 | ||||
|     const int in  = il%8;    // 0...7 | ||||
|  | ||||
|     dst_t * y = yy + i*QK_K + 16*is + il; | ||||
|  | ||||
|     const uint8_t q = x[i].qs[il] >> (2*is); | ||||
|     const uint8_t h = x[i].hmask[in] >> (2*is + im); | ||||
|     const float   d = (float)x[i].d; | ||||
|  | ||||
|     if (is == 0) { | ||||
|         y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); | ||||
|         y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); | ||||
|     } else { | ||||
|         y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); | ||||
|         y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); | ||||
|     } | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| #if QK_K == 256 | ||||
| static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { | ||||
|     if (j < 4) { | ||||
|         d = q[j] & 63; m = q[j + 4] & 63; | ||||
| @@ -4289,7 +4252,6 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8 | ||||
|         m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4); | ||||
|     } | ||||
| } | ||||
| #endif | ||||
|  | ||||
| template<typename dst_t> | ||||
| static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, | ||||
| @@ -4298,7 +4260,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri | ||||
|  | ||||
|     const int i = item_ct1.get_group(2); | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     // assume 32 threads | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
|     const int il  = tid/8; | ||||
| @@ -4322,15 +4283,6 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri | ||||
|         y[l + 0] = d1 * (q[l] & 0xF) - m1; | ||||
|         y[l +32] = d2 * (q[l] >>  4) - m2; | ||||
|     } | ||||
| #else | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
|     const uint8_t * q = x[i].qs; | ||||
|     dst_t * y = yy + i*QK_K; | ||||
|     const float d = (float)x[i].dm[0]; | ||||
|     const float m = (float)x[i].dm[1]; | ||||
|     y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); | ||||
|     y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template<typename dst_t> | ||||
| @@ -4340,7 +4292,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri | ||||
|  | ||||
|     const int i = item_ct1.get_group(2); | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     // assume 64 threads - this is very slightly better than the one below | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
|     const int il  = tid/16;   // il is in 0...3 | ||||
| @@ -4367,18 +4318,6 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri | ||||
|     hm <<= 1; | ||||
|     y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2; | ||||
|     y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2; | ||||
| #else | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
|     const uint8_t q = x[i].qs[tid]; | ||||
|     const int im = tid/8;  // 0...3 | ||||
|     const int in = tid%8;  // 0...7 | ||||
|     const int is = tid/16; // 0 or 1 | ||||
|     const uint8_t h = x[i].qh[in] >> im; | ||||
|     const float d = x[i].d; | ||||
|     dst_t * y = yy + i*QK_K + tid; | ||||
|     y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); | ||||
|     y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template<typename dst_t> | ||||
| @@ -4387,7 +4326,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri | ||||
|     const block_q6_K * x = (const block_q6_K *) vx; | ||||
|  | ||||
|     const int i = item_ct1.get_group(2); | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     // assume 64 threads - this is very slightly better than the one below | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| @@ -4407,24 +4345,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri | ||||
|     y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); | ||||
|     y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32); | ||||
|     y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32); | ||||
| #else | ||||
|  | ||||
|     // assume 32 threads | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
|     const int ip  = tid/16;         // 0 or 1 | ||||
|     const int il  = tid - 16*ip;    // 0...15 | ||||
|  | ||||
|     dst_t * y = yy + i*QK_K + 16*ip + il; | ||||
|  | ||||
|     const float d = x[i].d; | ||||
|  | ||||
|     const uint8_t   ql = x[i].ql[16*ip + il]; | ||||
|     const uint8_t   qh = x[i].qh[il] >> (2*ip); | ||||
|     const int8_t  * sc = x[i].scales; | ||||
|  | ||||
|     y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); | ||||
|     y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template<typename dst_t> | ||||
| @@ -4438,7 +4358,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res | ||||
|     const block_iq2_xxs * x = (const block_iq2_xxs  *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int il = tid/8; // 0...3 | ||||
|     const int ib = tid%8; // 0...7 | ||||
|     dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||
| @@ -4449,10 +4368,6 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res | ||||
|     const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; | ||||
|     const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; | ||||
|     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f); | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| template<typename dst_t> | ||||
| @@ -4466,7 +4381,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest | ||||
|     const block_iq2_xs * x = (const block_iq2_xs *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int il = tid/8; // 0...3 | ||||
|     const int ib = tid%8; // 0...7 | ||||
|     dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||
| @@ -4475,10 +4389,6 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest | ||||
|     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; | ||||
|     const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; | ||||
|     for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| @@ -4490,7 +4400,6 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|     const block_iq2_s * x = (const block_iq2_s *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int il = tid/8; // 0...3 | ||||
|     const int ib = tid%8; // 0...7 | ||||
|     dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||
| @@ -4498,13 +4407,9 @@ dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; | ||||
|     const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; | ||||
| #pragma unroll | ||||
|     for (int j = 0; j < 8; ++j) | ||||
|     for (int j = 0; j < 8; ++j) { | ||||
|         y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); | ||||
| #else | ||||
|     assert(false); | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     } | ||||
| } | ||||
|  | ||||
| template<typename dst_t> | ||||
| @@ -4518,7 +4423,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res | ||||
|     const block_iq3_xxs * x = (const block_iq3_xxs  *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int il = tid/8; // 0...3 | ||||
|     const int ib = tid%8; // 0...7 | ||||
|     dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||
| @@ -4533,10 +4437,6 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res | ||||
|         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); | ||||
|         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); | ||||
|     } | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| @@ -4549,7 +4449,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|     const block_iq3_s * x = (const block_iq3_s *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int il = tid/8; // 0...3 | ||||
|     const int ib = tid%8; // 0...7 | ||||
|     dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||
| @@ -4563,10 +4462,6 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|         y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); | ||||
|         y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); | ||||
|     } | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| @@ -4579,7 +4474,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|     const block_iq1_s * x = (const block_iq1_s  *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int il = tid/8; // 0...3 | ||||
|     const int ib = tid%8; // 0...7 | ||||
|     dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||
| @@ -4593,10 +4487,6 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|     for (int j = 0; j < 8; ++j) { | ||||
|         y[j] = d * (q[j] + delta); | ||||
|     } | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| @@ -4609,7 +4499,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|     const block_iq1_m * x = (const block_iq1_m  *) vx; | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2); | ||||
| #if QK_K == 256 | ||||
|     const int il = tid/8; // 0...3 | ||||
|     const int ib = tid%8; // 0...7 | ||||
|     dst_t * y = yy + i*QK_K + 32*ib + 8*il; | ||||
| @@ -4627,10 +4516,6 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, | ||||
|     for (int j = 0; j < 8; ++j) { | ||||
|         y[j] = d * (q[j] + delta); | ||||
|     } | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
|  | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| @@ -4704,7 +4589,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const int tid = | ||||
|         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 | ||||
|     const int ix = | ||||
| @@ -4755,42 +4639,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, | ||||
|         tmp += dall * sum1 - dmin * sum2; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     const int tid = item_ct1.get_local_id(2) / | ||||
|                     (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 | ||||
|     const int ix = item_ct1.get_local_id(2) % | ||||
|                    (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 | ||||
|     const int offset = tid * K_QUANTS_PER_ITERATION; | ||||
|  | ||||
|     uint32_t uaux[2]; | ||||
|     const uint8_t * d = (const uint8_t *)uaux; | ||||
|  | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|  | ||||
|         const float   * y = yy + i * QK_K + offset; | ||||
|         const uint8_t * q = x[i].qs + offset; | ||||
|         const uint32_t * s = (const uint32_t *)x[i].scales; | ||||
|  | ||||
|         uaux[0] = s[0] & 0x0f0f0f0f; | ||||
|         uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; | ||||
|  | ||||
|         const sycl::float2 dall = | ||||
|             x[i].dm.convert<float, sycl::rounding_mode::automatic>(); | ||||
|  | ||||
|         float sum1 = 0, sum2 = 0; | ||||
|         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { | ||||
|             const uint8_t ql = q[l]; | ||||
|             sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) | ||||
|                   + y[l+16] * d[1] * ((ql >> 2) & 3) | ||||
|                   + y[l+32] * d[2] * ((ql >> 4) & 3) | ||||
|                   + y[l+48] * d[3] * ((ql >> 6) & 3); | ||||
|             sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; | ||||
|         } | ||||
|         tmp += dall.x() * sum1 - dall.y() * sum2; | ||||
|     } | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
| #pragma unroll | ||||
| @@ -4828,8 +4676,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const uint16_t kmask1 = 0x0303; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|  | ||||
| @@ -4882,34 +4728,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, | ||||
|         tmp += d * sum; | ||||
|  | ||||
|     } | ||||
| #else | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7 | ||||
|     const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3 | ||||
|     const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14 | ||||
|     const int in = offset/8;                                 // 0 or 1 | ||||
|     const int im = offset%8;                                 // 0...7 | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|  | ||||
|         const float   * y = yy + i * QK_K + offset; | ||||
|         const uint8_t * q = x[i].qs + offset; | ||||
|         const uint8_t * s = x[i].scales; | ||||
|  | ||||
|         const float dall = (float)x[i].d; | ||||
|  | ||||
|         float sum = 0; | ||||
|         for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { | ||||
|             const uint8_t hl = x[i].hmask[im+l] >> in; | ||||
|             const uint8_t ql = q[l]; | ||||
|             sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) | ||||
|                  + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) | ||||
|                  + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) | ||||
|                  + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); | ||||
|         } | ||||
|         tmp += sum; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
| #pragma unroll | ||||
| @@ -4944,7 +4762,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, | ||||
|  | ||||
|     const block_q4_K * x = (const block_q4_K *)vx + ib0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
| @@ -5033,36 +4850,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, | ||||
| #endif | ||||
|  | ||||
|     } | ||||
| #else | ||||
|     const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 | ||||
|     const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); | ||||
|  | ||||
|     const int step = tid * K_QUANTS_PER_ITERATION; | ||||
|  | ||||
|     uint16_t aux16[2]; | ||||
|     const uint8_t * s = (const uint8_t *)aux16; | ||||
|  | ||||
|     float tmp = 0; | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|         const uint8_t * q = x[i].qs + step; | ||||
|         const float   * y = yy + i*QK_K + step; | ||||
|         const uint16_t * a = (const uint16_t *)x[i].scales; | ||||
|         aux16[0] = a[0] & 0x0f0f; | ||||
|         aux16[1] = (a[0] >> 4) & 0x0f0f; | ||||
|         const float d = (float)x[i].dm[0]; | ||||
|         const float m = (float)x[i].dm[1]; | ||||
|         float sum = 0.f; | ||||
|         for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { | ||||
|             sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) | ||||
|                  + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) | ||||
|                  + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3]) | ||||
|                  + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]); | ||||
|         } | ||||
|         tmp += sum; | ||||
|     } | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
| #pragma unroll | ||||
| @@ -5097,7 +4884,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const uint16_t kmask1 = 0x3f3f; | ||||
|     const uint16_t kmask2 = 0x0f0f; | ||||
|     const uint16_t kmask3 = 0xc0c0; | ||||
| @@ -5174,30 +4960,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, | ||||
|                dmin * smin; | ||||
|     } | ||||
|  | ||||
| #else | ||||
|     const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...15 | ||||
|     const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); | ||||
|     const int step = tid * K_QUANTS_PER_ITERATION; | ||||
|     const int im = step/8; | ||||
|     const int in = step%8; | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|         const uint8_t * q = x[i].qs + step; | ||||
|         const int8_t  * s = x[i].scales; | ||||
|         const float   * y = yy + i*QK_K + step; | ||||
|         const float     d = x[i].d; | ||||
|         float sum = 0.f; | ||||
|         for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { | ||||
|             const uint8_t h = x[i].qh[in+j] >> im; | ||||
|             sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) | ||||
|                  + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) | ||||
|                  + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16)) | ||||
|                  + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16)); | ||||
|         } | ||||
|         tmp += sum; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
| #pragma unroll | ||||
|     for (int mask = 16; mask > 0; mask >>= 1) { | ||||
| @@ -5224,8 +4986,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa | ||||
|  | ||||
|     const block_q6_K * x = (const block_q6_K *)vx + ib0; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     const int tid = | ||||
|         item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 | ||||
|     const int ix = | ||||
| @@ -5282,37 +5042,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa | ||||
|  | ||||
|     } | ||||
|  | ||||
| #else | ||||
|  | ||||
|     const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION);  // 0...7 | ||||
|     const int ix  = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION);  // 0...3 | ||||
|  | ||||
|     const int step = tid * K_QUANTS_PER_ITERATION; | ||||
|  | ||||
|     float tmp = 0; // partial sum for thread in warp | ||||
|  | ||||
|     for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { | ||||
|  | ||||
|         const float   * y  = yy + i * QK_K + step; | ||||
|         const uint8_t * ql = x[i].ql + step; | ||||
|         const uint8_t * qh = x[i].qh + step; | ||||
|         const int8_t  * s  = x[i].scales; | ||||
|  | ||||
|         const float d = x[i+0].d; | ||||
|  | ||||
|         float sum = 0; | ||||
|         for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { | ||||
|             sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) | ||||
|                  + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) | ||||
|                  + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32) | ||||
|                  + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32); | ||||
|         } | ||||
|         tmp += sum; | ||||
|  | ||||
|     } | ||||
|  | ||||
| #endif | ||||
|  | ||||
|     // sum up partial sums and write back result | ||||
| #pragma unroll | ||||
|     for (int mask = 16; mask > 0; mask >>= 1) { | ||||
| @@ -6857,7 +6586,6 @@ static __dpct_inline__ float | ||||
| vec_dot_q4_K_q8_1(const void *__restrict__ vbq, | ||||
|                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) { | ||||
|  | ||||
| #ifndef GGML_QKK_64 | ||||
|     const block_q4_K * bq4_K = (const block_q4_K *) vbq; | ||||
|  | ||||
|     int    v[2]; | ||||
| @@ -6899,52 +6627,6 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq, | ||||
|     } | ||||
|  | ||||
|     return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); | ||||
|  | ||||
| #else | ||||
|  | ||||
| #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics | ||||
|     const block_q4_K * bq4_K = (const block_q4_K *) vbq; | ||||
|  | ||||
|     float sumf_d = 0.0f; | ||||
|     float sumf_m = 0.0f; | ||||
|  | ||||
|     uint16_t aux16[2]; | ||||
|     const uint8_t * s = (const uint8_t *)aux16; | ||||
|  | ||||
|     const uint16_t * a = (const uint16_t *)bq4_K->scales; | ||||
|     aux16[0] = a[0] & 0x0f0f; | ||||
|     aux16[1] = (a[0] >> 4) & 0x0f0f; | ||||
|  | ||||
|     const float dall = bq4_K->dm[0]; | ||||
|     const float dmin = bq4_K->dm[1]; | ||||
|  | ||||
|     const float d8_1 = bq8_1[0].ds[0]; | ||||
|     const float d8_2 = bq8_1[1].ds[1]; | ||||
|  | ||||
|     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); | ||||
|     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); | ||||
|     const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); | ||||
|     const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); | ||||
|  | ||||
|     const int * q4 = (const int *)bq4_K->qs + (iqs/2); | ||||
|     const int v1 = q4[0]; | ||||
|     const int v2 = q4[4]; | ||||
|  | ||||
|     const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0)); | ||||
|     const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); | ||||
|     const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0)); | ||||
|     const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0)); | ||||
|  | ||||
|     sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); | ||||
|     sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); | ||||
|  | ||||
|     return dall * sumf_d - dmin * sumf_m; | ||||
|  | ||||
| #else | ||||
|     bad_arch(); | ||||
| #endif // __SYCL_ARCH__ >= VER_4VEC | ||||
|  | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <int mmq_y> | ||||
| @@ -7003,11 +6685,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, | ||||
|  | ||||
|         const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; | ||||
| #else | ||||
|         x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; | ||||
| #endif | ||||
|     } | ||||
|  | ||||
| #pragma unroll | ||||
| @@ -7050,7 +6728,6 @@ static __dpct_inline__ float | ||||
| vec_dot_q5_K_q8_1(const void *__restrict__ vbq, | ||||
|                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) { | ||||
|  | ||||
| #ifndef GGML_QKK_64 | ||||
|     const block_q5_K * bq5_K = (const block_q5_K *) vbq; | ||||
|  | ||||
|     int   vl[2]; | ||||
| @@ -7092,48 +6769,6 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq, | ||||
|     } | ||||
|  | ||||
|     return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); | ||||
|  | ||||
| #else | ||||
|  | ||||
| #if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics | ||||
|     const block_q5_K * bq5_K = (const block_q5_K *) vbq; | ||||
|  | ||||
|     const int8_t * s = bq5_K->scales; | ||||
|  | ||||
|     const float d = bq5_K->d; | ||||
|  | ||||
|     const float d8_1 = bq8_1[0].ds[0]; | ||||
|     const float d8_2 = bq8_1[1].ds[1]; | ||||
|  | ||||
|     const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); | ||||
|     const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); | ||||
|     const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); | ||||
|     const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); | ||||
|  | ||||
|     const int * ql = (const int *)bq5_K->qs + (iqs/2); | ||||
|     const int vl1 = ql[0]; | ||||
|     const int vl2 = ql[4]; | ||||
|  | ||||
|     const int step = 4 * (iqs/2); // 0, 4, 8, 12 | ||||
|     const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 | ||||
|     const int in = step%8; // 0, 4, 0, 4 | ||||
|     const int vh = (*((const int *)(bq5_K->qh + in))) >> im; | ||||
|  | ||||
|     const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); | ||||
|     const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); | ||||
|     const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); | ||||
|     const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); | ||||
|  | ||||
|     const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1]) | ||||
|                        + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]); | ||||
|  | ||||
|     return d * sumf_d; | ||||
|  | ||||
| #else | ||||
|     bad_arch(); | ||||
| #endif // __SYCL_ARCH__ >= VER_4VEC | ||||
|  | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <int mmq_y> | ||||
| @@ -7205,9 +6840,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql, | ||||
|  | ||||
|         const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; | ||||
|  | ||||
| #if QK_K == 256 | ||||
|         x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; | ||||
| #endif | ||||
|     } | ||||
|  | ||||
| #pragma unroll | ||||
| @@ -7387,7 +7020,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, | ||||
|                      const block_q8_1 *__restrict__ bq8_1, const int &iqs, | ||||
|                      const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs, | ||||
|                      const uint8_t *kmask_iq2xs) { | ||||
| #if QK_K == 256 | ||||
|     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; | ||||
|  | ||||
| #if QR2_XXS == 8 | ||||
| @@ -7428,10 +7060,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq, | ||||
|     } | ||||
|     return d * (sumi1 + sumi2); | ||||
| #endif | ||||
| #else | ||||
|     assert(false); | ||||
|     return 0.f; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __dpct_inline__ float | ||||
| @@ -7440,7 +7068,6 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, | ||||
|                     const uint64_t *iq2xs_grid, const uint64_t *ksigns64) { | ||||
| #if DPCT_COMPATIBILITY_TEMP >=                                                 \ | ||||
|     MIN_CC_DP4A // lowest compute capability for integer intrinsics | ||||
| #if QK_K == 256 | ||||
|     const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; | ||||
|  | ||||
|     const int ib32 = iqs; | ||||
| @@ -7478,16 +7105,11 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, | ||||
|     assert(false); | ||||
|     return 0.f; | ||||
| #endif | ||||
| #else | ||||
|     assert(false); | ||||
|     return 0.f; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __dpct_inline__ float | ||||
| vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, | ||||
|                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) { | ||||
| #if QK_K == 256 | ||||
|     const block_iq2_s * bq2 = (const block_iq2_s *) vbq; | ||||
|  | ||||
|     const int ib32 = iqs; | ||||
| @@ -7531,9 +7153,6 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, | ||||
|     } | ||||
|     const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; | ||||
|     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __dpct_inline__ float | ||||
| @@ -7542,7 +7161,6 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, | ||||
|                      const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) { | ||||
| #if DPCT_COMPATIBILITY_TEMP >=                                                 \ | ||||
|     MIN_CC_DP4A // lowest compute capability for integer intrinsics | ||||
| #if QK_K == 256 | ||||
|     const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; | ||||
|  | ||||
|     const int ib32 = iqs; | ||||
| @@ -7570,17 +7188,12 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, | ||||
|     assert(false); | ||||
|     return 0.f; | ||||
| #endif | ||||
| #else | ||||
|     assert(false); | ||||
|     return 0.f; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __dpct_inline__ float | ||||
| vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, | ||||
|                    const block_q8_1 *__restrict__ bq8_1, const int &iqs, | ||||
|                    const uint32_t *iq3s_grid) { | ||||
| #if QK_K == 256 | ||||
|     const block_iq3_s * bq2 = (const block_iq3_s *) vbq; | ||||
|  | ||||
|     const int ib32 = iqs; | ||||
| @@ -7609,16 +7222,12 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, | ||||
|         (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * | ||||
|         bq8_1[ib32].ds[0]; | ||||
|     return d * sumi; | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __dpct_inline__ float | ||||
| vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, | ||||
|                    const block_q8_1 *__restrict__ bq8_1, const int &iqs, | ||||
|                    const uint32_t *iq1s_grid_gpu) { | ||||
| #if QK_K == 256 | ||||
|     const block_iq1_s * bq1 = (const block_iq1_s *) vbq; | ||||
|  | ||||
|     const int ib32 = iqs; | ||||
| @@ -7637,15 +7246,11 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, | ||||
|     const float d = d1q * bq8_1[ib32].ds[0]; | ||||
|     const float m = d1q * bq8_1[ib32].ds[1]; | ||||
|     return d * sumi + m * delta; | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __dpct_inline__ float | ||||
| vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, | ||||
|                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) { | ||||
| #if QK_K == 256 | ||||
|     const block_iq1_m * bq1 = (const block_iq1_m *) vbq; | ||||
|  | ||||
|     const int ib32 = iqs; | ||||
| @@ -7670,9 +7275,6 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, | ||||
|     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); | ||||
|     const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; | ||||
|     return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, | ||||
| @@ -7720,7 +7322,6 @@ static __dpct_inline__ float | ||||
| vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, | ||||
|                     const block_q8_1 *__restrict__ bq8_1, const int &iqs) { | ||||
|  | ||||
| #if QK_K == 256 | ||||
|     const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; | ||||
|     const uint8_t * values = (const uint8_t *)kvalues_iq4nl; | ||||
|  | ||||
| @@ -7738,9 +7339,6 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, | ||||
|         sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); | ||||
|     } | ||||
|     return d * (sumi1 + sumi2); | ||||
| #else | ||||
|     assert(false); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, | ||||
| @@ -10203,7 +9801,6 @@ template <typename dst_t> | ||||
| static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                      dpct::queue_ptr stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
| @@ -10215,27 +9812,12 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                  dequantize_block_q2_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
| #else | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
|  | ||||
|         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * | ||||
|                                                    sycl::range<3>(1, 1, 32), | ||||
|                                                sycl::range<3>(1, 1, 32)), | ||||
|                              [=](sycl::nd_item<3> item_ct1) { | ||||
|                                  dequantize_block_q2_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
|  | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                      dpct::queue_ptr stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
| @@ -10247,19 +9829,6 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                  dequantize_block_q3_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
| #else | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
|  | ||||
|         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * | ||||
|                                                    sycl::range<3>(1, 1, 32), | ||||
|                                                sycl::range<3>(1, 1, 32)), | ||||
|                              [=](sycl::nd_item<3> item_ct1) { | ||||
|                                  dequantize_block_q3_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| @@ -10320,7 +9889,6 @@ template <typename dst_t> | ||||
| static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                      dpct::queue_ptr stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
| @@ -10332,27 +9900,12 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                  dequantize_block_q5_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
| #else | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
|  | ||||
|         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * | ||||
|                                                    sycl::range<3>(1, 1, 32), | ||||
|                                                sycl::range<3>(1, 1, 32)), | ||||
|                              [=](sycl::nd_item<3> item_ct1) { | ||||
|                                  dequantize_block_q5_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
|  | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                      dpct::queue_ptr stream) { | ||||
|     const int nb = k / QK_K; | ||||
| #if QK_K == 256 | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
| @@ -10364,20 +9917,6 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                  dequantize_block_q6_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
| #else | ||||
|     { | ||||
|         dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                      {sycl::aspect::fp16}); | ||||
|  | ||||
|         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * | ||||
|                                                    sycl::range<3>(1, 1, 32), | ||||
|                                                sycl::range<3>(1, 1, 32)), | ||||
|                              [=](sycl::nd_item<3> item_ct1) { | ||||
|                                  dequantize_block_q6_K(vx, y, item_ct1); | ||||
|                              }); | ||||
|     } | ||||
|  | ||||
| #endif | ||||
| } | ||||
|  | ||||
| template <typename dst_t> | ||||
| @@ -10529,9 +10068,6 @@ template <typename dst_t> | ||||
| static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, | ||||
|                                        dpct::queue_ptr stream) { | ||||
|     const int nb = (k + QK_K - 1) / QK_K; | ||||
| #if QK_K == 64 | ||||
|     dequantize_row_iq4_nl_sycl(vx, y, k, stream); | ||||
| #else | ||||
|       { | ||||
|             dpct::has_capability_or_fail(stream->get_device(), | ||||
|                                          {sycl::aspect::fp16}); | ||||
| @@ -10546,7 +10082,6 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, | ||||
|                       }); | ||||
|             }); | ||||
|       } | ||||
| #endif | ||||
| } | ||||
|  | ||||
|  | ||||
| @@ -12051,8 +11586,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, | ||||
|                                         const int nrows_y, const int nrows_dst, | ||||
|                                         dpct::queue_ptr stream) try { | ||||
|  | ||||
| #if QK_K == 256 | ||||
|  | ||||
|     int id; | ||||
|     SYCL_CHECK( | ||||
|         CHECK_TRY_ERROR(id = get_current_device_id())); | ||||
| @@ -12167,7 +11700,6 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, | ||||
|             }); | ||||
|         } | ||||
|     } | ||||
| #endif | ||||
| } | ||||
| catch (sycl::exception const &exc) { | ||||
|   std::cerr << exc.what() << "Exception caught at file:" << __FILE__ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov