Mirror of https://github.com/ggml-org/llama.cpp.git — synced 2025-11-04 09:32:00 +00:00.
			
		
		
		
	gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0. This does not change anything for ternary models, since their values should never end up in halfway cases anyway.
This commit is contained in:
		@@ -3330,7 +3330,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
 | 
				
			|||||||
            for (size_t m = 0; m < 32; ++m) {
 | 
					            for (size_t m = 0; m < 32; ++m) {
 | 
				
			||||||
                uint8_t q = 0;
 | 
					                uint8_t q = 0;
 | 
				
			||||||
                for (size_t n = 0; n < 5; ++n) {
 | 
					                for (size_t n = 0; n < 5; ++n) {
 | 
				
			||||||
                    int xi = nearest_int(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2
 | 
					                    int xi = lroundf(x[m + n*32] * id) + 1; // -1, 0, 1 -> 0, 1, 2
 | 
				
			||||||
                    q *= 3;
 | 
					                    q *= 3;
 | 
				
			||||||
                    q += xi;
 | 
					                    q += xi;
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
@@ -3345,7 +3345,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
 | 
				
			|||||||
            for (size_t m = 0; m < 16; ++m) {
 | 
					            for (size_t m = 0; m < 16; ++m) {
 | 
				
			||||||
                uint8_t q = 0;
 | 
					                uint8_t q = 0;
 | 
				
			||||||
                for (size_t n = 0; n < 5; ++n) {
 | 
					                for (size_t n = 0; n < 5; ++n) {
 | 
				
			||||||
                    int xi = nearest_int(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2
 | 
					                    int xi = lroundf(x[m + n*16] * id) + 1; // -1, 0, 1 -> 0, 1, 2
 | 
				
			||||||
                    q *= 3;
 | 
					                    q *= 3;
 | 
				
			||||||
                    q += xi;
 | 
					                    q += xi;
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
@@ -3360,7 +3360,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y,
 | 
				
			|||||||
            uint8_t q = 0;
 | 
					            uint8_t q = 0;
 | 
				
			||||||
            for (size_t m = 0; m < 4; ++m) {
 | 
					            for (size_t m = 0; m < 4; ++m) {
 | 
				
			||||||
                // -1, 0, 1 -> 0, 1, 2
 | 
					                // -1, 0, 1 -> 0, 1, 2
 | 
				
			||||||
                int xi = nearest_int(x[j + m*sizeof(y->qh)] * id) + 1;
 | 
					                int xi = lroundf(x[j + m*sizeof(y->qh)] * id) + 1;
 | 
				
			||||||
                q *= 3;
 | 
					                q *= 3;
 | 
				
			||||||
                q += xi;
 | 
					                q += xi;
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
@@ -3396,7 +3396,7 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y,
 | 
				
			|||||||
                uint8_t q = 0;
 | 
					                uint8_t q = 0;
 | 
				
			||||||
                for (size_t n = 0; n < 4; ++n) {
 | 
					                for (size_t n = 0; n < 4; ++n) {
 | 
				
			||||||
                    // -1, 0, 1 -> 0, 1, 2
 | 
					                    // -1, 0, 1 -> 0, 1, 2
 | 
				
			||||||
                    int xi = nearest_int(x[m + n*32] * id) + 1;
 | 
					                    int xi = lroundf(x[m + n*32] * id) + 1;
 | 
				
			||||||
                    q += (xi & 3) << (2*n);
 | 
					                    q += (xi & 3) << (2*n);
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
                y[i].qs[j + m] = q;
 | 
					                y[i].qs[j + m] = q;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -574,6 +574,87 @@ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
 | 
				
			|||||||
        return (d * q).reshape((n_blocks, QK_K))
 | 
					        return (d * q).reshape((n_blocks, QK_K))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
    """Ternary quantization at ~1.6875 bits per weight.

    Each value is one of {-1, 0, 1} times a per-block float16 scale.
    Five ternary digits are packed into each byte via a base-3 encoding
    that is then ceil-scaled by 256/243 so dequantization can extract a
    digit with a single multiply-and-shift.
    """

    @classmethod
    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        # blocks: (n_blocks, QK_K) floats. The slicing below covers
        # 32*5 + 16*5 + 16 = 256 elements, so it assumes QK_K == 256
        # — TODO confirm against the QK_K constant defined elsewhere.
        n_blocks = blocks.shape[0]

        # Per-block scale: largest magnitude in the block.
        d = abs(blocks).max(axis=-1, keepdims=True)
        with np.errstate(divide="ignore"):
            # Inverse scale; all-zero blocks get id == 0 so they quantize to 0.
            id = np.where(d == 0, 0, 1 / d)
        # Round to the nearest ternary level (-1, 0, 1), then shift to 0, 1, 2.
        qs = np_roundf(blocks * id)
        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)

        # Three regions with different interleaving group widths:
        # first 160 values in groups of 32, next 80 in groups of 16,
        # the remaining 16 in groups of 4.
        qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):]
        # Pack 5 ternary digits per byte: q = 81*t0 + 27*t1 + 9*t2 + 3*t3 + t4 (< 243).
        qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1))
        qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1))
        # qh bytes hold only 4 digits each (weights 81, 27, 9, 3).
        qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))
        qh = np.sum(qh, axis=-2).reshape((n_blocks, -1))
        qs = np.concatenate([qs0, qs1, qh], axis=-1)
        # Ceil-scale the base-3 value from [0, 243) into [0, 256):
        # q' = ceil(q * 256 / 243). This lets dequantize_blocks pull out
        # each digit with (q' * 3) >> 8.
        qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243

        qs = qs.astype(np.uint8)
        # Append the float16 scale, byte-wise, after the packed digits.
        d = d.astype(np.float16).view(np.uint8)

        return np.concatenate([qs, d], axis=-1)

    @classmethod
    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
        # blocks: (n_blocks, block_size) uint8, laid out [qs | qh | d]
        # — the inverse of quantize_blocks above.
        n_blocks = blocks.shape[0]

        qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
        qh, d = np.hsplit(rest, [QK_K // 64])

        d = d.view(np.float16).astype(np.float32)

        qs0, qs1 = qs[..., :32], qs[..., 32:]
        # Multiplying by 3**k (in uint8, wrapping mod 256 on purpose) moves
        # the k-th ternary digit into the top of an 8.8 fixed-point value;
        # higher digits are discarded by the wraparound.
        qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs0 = qs0.reshape((n_blocks, -1))
        qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
        qs1 = qs1.reshape((n_blocks, -1))
        qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1))
        qh = qh.reshape((n_blocks, -1))
        qs = np.concatenate([qs0, qs1, qh], axis=-1)
        # (q * 3) >> 8 reads the current digit (0..2) out of the top byte;
        # subtracting 1 maps it back to -1, 0, 1.
        qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)

        return (d * qs.astype(np.float32))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 | 
				
			||||||
 | 
					        n_blocks = blocks.shape[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        d = abs(blocks).max(axis=-1, keepdims=True)
 | 
				
			||||||
 | 
					        with np.errstate(divide="ignore"):
 | 
				
			||||||
 | 
					            id = np.where(d == 0, 0, 1 / d)
 | 
				
			||||||
 | 
					        qs = np_roundf(blocks * id)
 | 
				
			||||||
 | 
					        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
 | 
				
			||||||
 | 
					        qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :]
 | 
				
			||||||
 | 
					        qs = qs.reshape((n_blocks, -1))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        d = d.astype(np.float16).view(np.uint8)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return np.concatenate([qs, d], axis=-1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @classmethod
 | 
				
			||||||
 | 
					    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
 | 
				
			||||||
 | 
					        n_blocks = blocks.shape[0]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        qs, d = np.hsplit(blocks, [QK_K // 4])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        d = d.view(np.float16).astype(np.float32)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
 | 
				
			||||||
 | 
					        qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return (d * qs.astype(np.float32))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
 | 
					class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
 | 
				
			||||||
    ksigns: bytes = (
 | 
					    ksigns: bytes = (
 | 
				
			||||||
        b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
 | 
					        b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -66,6 +66,7 @@ class GGMLQuants:
 | 
				
			|||||||
        for t in (
 | 
					        for t in (
 | 
				
			||||||
            "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
 | 
					            "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
 | 
				
			||||||
            "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
 | 
					            "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
 | 
				
			||||||
 | 
					            "tq1_0", "tq2_0",
 | 
				
			||||||
            "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
 | 
					            "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
 | 
				
			||||||
            "iq4_nl", "iq4_xs",
 | 
					            "iq4_nl", "iq4_xs",
 | 
				
			||||||
        ):
 | 
					        ):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user