convert : handle compressed-tensors quant method (#17069)

* convert : handle compressed-tensors quant method

* convert : handle int-quantized models

* convert : handle naive-quantized models

* gguf-py : __pos__ is also unary

* convert : fix flake8 lint

* convert : use F32 for dequant of pack-quantized tensors
This commit is contained in:
compilade
2025-11-09 09:45:50 -05:00
committed by GitHub
parent cb1adf8851
commit 1c07c0c68c
2 changed files with 92 additions and 10 deletions

View File

@@ -48,13 +48,18 @@ class LazyMeta(ABCMeta):
# NOTE: doing this from a metaclass is very convenient
# TODO: make this even more comprehensive
for binary_op in (
"lt", "le", "eq", "ne", "ge", "gt", "not"
"abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
"neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
"lt", "le", "eq", "ne", "ge", "gt",
"add", "and", "floordiv", "lshift", "mod", "mul", "matmul",
"or", "pow", "rshift", "sub", "truediv", "xor",
"iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
"radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
):
attr_name = f"__{binary_op}__"
# evaluation on the meta tensor is needed in case there's broadcasting
namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
for unary_op in ("not", "abs", "invert", "neg", "pos"):
attr_name = f"__{unary_op}__"
# the result of these operators usually has the same shape and dtype as the input,
# so evaluation on the meta tensor can be skipped.
namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)