mirror of https://github.com/ggml-org/llama.cpp.git

Merge branch 'master' into compilade/refactor-kv-cache
@@ -1327,6 +1327,8 @@ class GGMLQuantizationType(IntEnum):
     Q4_0_4_4 = 31
     Q4_0_4_8 = 32
     Q4_0_8_8 = 33
+    TQ1_0 = 34
+    TQ2_0 = 35
 
 
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
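The two new entries are the ternary ("1.58-bit"-style) quantization types. The numeric values are the on-disk type ids in GGUF, so they resolve back to names when inspecting a file. A quick illustration (plain Python, not part of the diff, assuming gguf-py is installed):

from gguf.constants import GGMLQuantizationType

# Enum members round-trip between the GGUF type id and the name.
print(GGMLQuantizationType(34).name)  # TQ1_0
print(GGMLQuantizationType(35).name)  # TQ2_0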
@@ -1371,6 +1373,8 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q4_0_4_4 = 33  # except 1d tensors
     MOSTLY_Q4_0_4_8 = 34  # except 1d tensors
     MOSTLY_Q4_0_8_8 = 35  # except 1d tensors
+    MOSTLY_TQ1_0    = 36  # except 1d tensors
+    MOSTLY_TQ2_0    = 37  # except 1d tensors
 
     GUESSED = 1024  # not specified in the model file
 
@@ -1447,6 +1451,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
     GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
     GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
+    GGMLQuantizationType.TQ1_0:    (256, 2 + 4 * 13),
+    GGMLQuantizationType.TQ2_0:    (256, 2 + 64),
 }
 
 
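Each GGML_QUANT_SIZES entry is (elements per block, bytes per block). A standalone sanity check of the effective bits per weight implied by the two new entries (plain Python, not from the diff):

# Effective bits per weight from the (block_size, type_size) pairs above.
for name, (block_size, type_size) in {
    "TQ1_0": (256, 2 + 4 * 13),  # 54 bytes per 256 weights
    "TQ2_0": (256, 2 + 64),      # 66 bytes per 256 weights
}.items():
    print(name, 8 * type_size / block_size, "bits per weight")
# TQ1_0 1.6875 bits per weight
# TQ2_0 2.0625 bits per weight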
@@ -574,6 +574,87 @@ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
         return (d * q).reshape((n_blocks, QK_K))
 
 
+class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d = abs(blocks).max(axis=-1, keepdims=True)
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np_roundf(blocks * id)
+        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
+
+        qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):]
+        qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1))
+        qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1))
+        qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = np.sum(qh, axis=-2).reshape((n_blocks, -1))
+        qs = np.concatenate([qs0, qs1, qh], axis=-1)
+        qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243
+
+        qs = qs.astype(np.uint8)
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([qs, d], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
+        qh, d = np.hsplit(rest, [QK_K // 64])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs0, qs1 = qs[..., :32], qs[..., 32:]
+        qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs0 = qs0.reshape((n_blocks, -1))
+        qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
+        qs1 = qs1.reshape((n_blocks, -1))
+        qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qh = qh.reshape((n_blocks, -1))
+        qs = np.concatenate([qs0, qs1, qh], axis=-1)
+        qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)
+
+        return (d * qs.astype(np.float32))
+
+
+class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
+    @classmethod
+    def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        d = abs(blocks).max(axis=-1, keepdims=True)
+        with np.errstate(divide="ignore"):
+            id = np.where(d == 0, 0, 1 / d)
+        qs = np_roundf(blocks * id)
+        qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
+
+        qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :]
+        qs = qs.reshape((n_blocks, -1))
+
+        d = d.astype(np.float16).view(np.uint8)
+
+        return np.concatenate([qs, d], axis=-1)
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_blocks = blocks.shape[0]
+
+        qs, d = np.hsplit(blocks, [QK_K // 4])
+
+        d = d.view(np.float16).astype(np.float32)
+
+        qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
+        qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1)
+
+        return (d * qs.astype(np.float32))
+
+
 class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
     ksigns: bytes = (
         b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
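TQ1_0 above packs ternary weights in base 3: groups of five weights, shifted from {-1, 0, 1} to {0, 1, 2}, are combined with the powers [81, 27, 9, 3, 1] into a value of at most 242, then rescaled by 256/243 with rounding up. That rescale is what lets dequantize_blocks recover each trit with just `(qs * 3) >> 8`: after scaling, the digits appear in the high byte of successive multiplications by 3, so decoding needs no division. A minimal single-byte illustration of the same trick (plain Python, not from the diff):

# Pack five ternary digits ("trits") into one byte the way TQ1_0 does,
# then decode them with multiply-by-3 / take-high-byte.
trits = [-1, 0, 1, 1, -1]                             # weights in {-1, 0, 1}
u = [t + 1 for t in trits]                            # shift to {0, 1, 2}

q = sum(d * p for d, p in zip(u, [81, 27, 9, 3, 1]))  # base-3 pack, q <= 242
q = (q * 256 + 242) // 243                            # rescale by 256/243; still fits a byte

decoded = []
for _ in range(5):
    q *= 3
    decoded.append((q >> 8) - 1)                      # high byte is the next trit
    q &= 0xFF
assert decoded == trits

TQ2_0 keeps the same per-block f16 scale but spends a full 2 bits per weight, four weights per byte via the [0, 2, 4, 6] shifts, so decoding is just a shift and mask at the cost of a slightly larger 2.0625 bpw versus TQ1_0's 1.6875.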
@@ -23,6 +23,7 @@ python = ">=3.8"
 numpy = ">=1.17"
 tqdm = ">=4.27"
 pyyaml = ">=5.1"
+sentencepiece = ">=0.1.98,<=0.2.0"
 
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
@@ -66,6 +66,7 @@ class GGMLQuants:
         for t in (
             "q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
             "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
+            "tq1_0", "tq2_0",
             "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
             "iq4_nl", "iq4_xs",
         ):
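Adding "tq1_0" and "tq2_0" to this list makes the test harness exercise the new types alongside the existing ones, comparing the Python (de)quantization above against the C implementation. The same code paths are also reachable through gguf-py's module-level helpers; a hedged round-trip sketch (quantize and dequantize do exist in gguf.quants, but this particular smoke test is illustrative, not from the diff):

import numpy as np
import gguf

# Quantize then dequantize random data with the new ternary types.
rng = np.random.default_rng(0)
data = rng.uniform(-1.0, 1.0, size=(4, 256)).astype(np.float32)  # 256 = block size

for qtype in (gguf.GGMLQuantizationType.TQ1_0, gguf.GGMLQuantizationType.TQ2_0):
    packed = gguf.quants.quantize(data, qtype)
    restored = gguf.quants.dequantize(packed, qtype)
    assert restored.shape == data.shape
    # Ternary weights times a per-block f16 scale: coarse, but bounded error.
    assert np.max(np.abs(restored - data)) <= 1.0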