mirror of https://github.com/ggml-org/llama.cpp.git
convert : remove unused field ModelTensorInfo.src_qtype
@@ -65,8 +65,7 @@ class ModelTensorInfo:
     load: Callable[[], Tensor]
     size: int  # in elements
     src_type: str
-    src_qtype: gguf.GGMLQuantizationType | None = None
-    dst_qtype: gguf.GGMLQuantizationType | None = None
+    auto_qtype: gguf.GGMLQuantizationType | None = None
 
 
 class ModelBase:
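For reference, a minimal sketch of how the ModelTensorInfo record looks after this change; the @dataclass decorator and the import lines are assumptions inferred from the field syntax in the hunk above, not shown in the diff:

from __future__ import annotations

from dataclasses import dataclass
from typing import Callable

import gguf
from torch import Tensor


@dataclass
class ModelTensorInfo:
    load: Callable[[], Tensor]
    size: int  # in elements
    src_type: str
    # one auto-selected target quantization type; the separate
    # src_qtype/dst_qtype pair from before this commit is gone
    auto_qtype: gguf.GGMLQuantizationType | None = None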
@@ -139,17 +138,17 @@ class ModelBase:
             # find out the most common type
             hist: dict[gguf.GGMLQuantizationType, int] = {}
             for t in self.model_tensors.values():
-                if t.dst_qtype is not None:
-                    if t.dst_qtype not in hist:
-                        hist[t.dst_qtype] = 0
-                    hist[t.dst_qtype] += t.size
+                if t.auto_qtype is not None:
+                    if t.auto_qtype not in hist:
+                        hist[t.auto_qtype] = 0
+                    hist[t.auto_qtype] += t.size
             max_qtype = gguf.GGMLQuantizationType.F32
             max_size = 0
             for qtype, size in hist.items():
                 if size > max_size:
                     max_qtype = qtype
                     max_size = size
-            # TODO: add more type if they're used as dst_qtypes
+            # TODO: add more type if they're used as auto_qtype
             if max_qtype == gguf.GGMLQuantizationType.F32:
                 self.ftype = gguf.LlamaFileType.ALL_F32
             elif max_qtype == gguf.GGMLQuantizationType.F16:
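The hunk above picks the file's overall type by a size-weighted vote: every tensor contributes its element count to its auto_qtype, and the heaviest type wins. A standalone restatement of that idea (the helper name and the empty-histogram default are assumptions; the diff falls back to F32 in the same way):

import gguf


def most_common_qtype(tensor_infos) -> gguf.GGMLQuantizationType:
    # size-weighted histogram: bigger tensors have a proportionally bigger vote
    hist: dict[gguf.GGMLQuantizationType, int] = {}
    for t in tensor_infos:
        if t.auto_qtype is not None:
            hist[t.auto_qtype] = hist.get(t.auto_qtype, 0) + t.size
    # fall back to F32 when no tensor carried an auto_qtype
    return max(hist, key=hist.get, default=gguf.GGMLQuantizationType.F32)

This is the same logic as the max_size/max_qtype loop in the hunk, just expressed with max().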
@@ -200,8 +199,7 @@ class ModelBase:
                     load=lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r),
                     size=math.prod(remote_tensor.shape),
                     src_type=str(dtype),
-                    src_qtype=qtype,
-                    dst_qtype=qtype,
+                    auto_qtype=qtype,
                 )
 
         return tensors
@@ -265,8 +263,7 @@ class ModelBase:
                     load=data_gen,
                     size=size,
                     src_type=str(dtype),
-                    src_qtype=qtype,
-                    dst_qtype=qtype,
+                    auto_qtype=qtype,
                 )
 
         # verify tensor name presence and identify potentially missing files
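Both this hunk and the remote-tensor one before it now pass a single auto_qtype when building a ModelTensorInfo. A hedged sketch of that construction, assuming the dataclass sketched earlier; data_gen, shape, dtype, and the helper name are stand-ins rather than names from the diff:

import math


def make_tensor_info(data_gen, shape, dtype, qtype):
    # a single qtype records the auto-selected conversion target;
    # there is no separate dst_qtype to keep in sync anymore
    return ModelTensorInfo(
        load=data_gen,
        size=math.prod(shape),
        src_type=str(dtype),
        auto_qtype=qtype,
    )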
@@ -370,8 +367,7 @@ class ModelBase:
                         load=lambda w=w, s=s: dequant_bitnet(w.load(), s.load()),
                         size=w.size,
                         src_type="bitnet",
-                        src_qtype=gguf.GGMLQuantizationType.F32,
-                        dst_qtype=gguf.GGMLQuantizationType.TQ1_0,
+                        auto_qtype=gguf.GGMLQuantizationType.TQ1_0,
                     )
                     tensors_to_remove.append(name)
                 elif quant_method == "fp8":
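A side note on the load= lambdas in these hunks: w=w, s=s bind the current loop variables as default arguments, so each closure keeps its own tensors instead of all closures sharing the last iteration's values. A tiny illustration of the idiom, with hypothetical names:

loaders = []
for val in (1, 2, 3):
    # without v=val, every lambda would see val == 3 after the loop ends
    loaders.append(lambda v=val: v * 10)

assert [f() for f in loaders] == [10, 20, 30]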
@@ -384,8 +380,7 @@ class ModelBase:
                         load=lambda w=w, s=s: dequant_simple(w.load(), s.load()),
                         size=w.size,
                         src_type=w.src_type,
-                        src_qtype=gguf.GGMLQuantizationType.F32,
-                        dst_qtype=gguf.GGMLQuantizationType.BF16,  # TODO: change to FP8 once natively supported
+                        auto_qtype=gguf.GGMLQuantizationType.BF16,  # TODO: change to FP8 once natively supported
                     )
                     tensors_to_remove.append(name)
                 elif quant_method == "gptq":
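The fp8 path dequantizes through dequant_simple(w, s) at load time and records BF16 as the auto_qtype until FP8 is natively supported. The diff does not show dequant_simple itself; a minimal sketch of what a plain scale-based dequantization could look like (elementwise scaling is an assumption here, the real function may handle block-wise scales):

import torch


def dequant_simple_sketch(weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # upcast to float32 and apply the (possibly broadcast) scale
    return weight.to(torch.float32) * scale.to(torch.float32)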
@@ -403,8 +398,7 @@ class ModelBase:
                         ),
                         size=qweight.size,  # TODO: use more accurate value
                         src_type=f"GPTQ-{bits}bit",
-                        src_qtype=gguf.GGMLQuantizationType.F32,
-                        dst_qtype=gguf.GGMLQuantizationType.Q8_0 if bits == 8 else gguf.GGMLQuantizationType.Q4_1,
+                        auto_qtype=gguf.GGMLQuantizationType.Q8_0 if bits == 8 else gguf.GGMLQuantizationType.Q4_1,
                     )
                     tensors_to_remove += [
                         base_name + n
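Taken together, the bitnet, fp8, and gptq hunks map each pre-quantized source format to the auto_qtype it should convert to by default. The same mapping pulled out into one place for readability; this helper is illustrative only, in the diff the choices live inline in each branch:

import gguf


def auto_qtype_for(quant_method: str, bits: int | None = None) -> gguf.GGMLQuantizationType:
    if quant_method == "bitnet":
        return gguf.GGMLQuantizationType.TQ1_0
    if quant_method == "fp8":
        # TODO in the diff: change to FP8 once natively supported
        return gguf.GGMLQuantizationType.BF16
    if quant_method == "gptq":
        return gguf.GGMLQuantizationType.Q8_0 if bits == 8 else gguf.GGMLQuantizationType.Q4_1
    raise ValueError(f"unsupported quant_method: {quant_method!r}")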
@@ -569,7 +563,7 @@ class ModelBase:
             # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
             if isinstance(data_qtype, bool):
                 if self.ftype_guessed:
-                    data_qtype = old_qtype if tensor_info is None or tensor_info.dst_qtype is None else tensor_info.dst_qtype
+                    data_qtype = old_qtype if tensor_info is None or tensor_info.auto_qtype is None else tensor_info.auto_qtype
                 elif self.ftype == gguf.LlamaFileType.ALL_F32:
                     data_qtype = gguf.GGMLQuantizationType.F32
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
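The hunk above is the consumer of auto_qtype: when the overall file type was only guessed, a tensor falls back to its own auto_qtype, or to the type it already had if none was recorded. Restated as a small standalone function (the function name is mine, not from the diff):

import gguf


def resolve_guessed_qtype(tensor_info, old_qtype: gguf.GGMLQuantizationType) -> gguf.GGMLQuantizationType:
    # prefer the tensor's auto-selected type; otherwise keep the existing one
    if tensor_info is not None and tensor_info.auto_qtype is not None:
        return tensor_info.auto_qtype
    return old_qtype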
@@ -8942,7 +8936,7 @@ class LazyTorchTensor(gguf.LazyBase):
             meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
             args=(self,),
             func=(lambda s: s.numpy()),
-            ranges=self._ranges
+            ranges=self._ranges,
         )
 
     @classmethod