Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-11-12 10:47:01 +00:00
* oai moe
* compat with new checkpoint
* add attn sink impl
* add rope scaling yarn
* logits match with latest transformers code
* wip chat template
* rm trailing space
* use ggml_scale_bias
* rm redundant is_swa_all
* convert interleaved gate_up
* graph : fix activation function to match reference (#7)
* vocab : handle o200k_harmony special tokens
* ggml : add attention sinks support (#1)
* llama : add attn sinks
* ggml : add attn sinks
* cuda : add attn sinks
* vulkan : add support for sinks in softmax
  remove unnecessary return
* ggml : add fused swiglu_oai op (#11)
* ggml : add fused swiglu_oai op
* Update ggml/src/ggml-cpu/ops.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* update CUDA impl
* cont : metal impl
* add vulkan impl
* test-backend-ops : more test cases, clean up
* llama : remove unfused impl
* remove extra lines

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>

* repack mxfp4 upon conversion
* clean up a bit
* enable thinking
* add quick hack to render only some special tokens
* fix bf16 conversion
* remove vocab hack
* webui ok
* support chat parsing for gpt-oss
* fix webui
* direct mapping mxfp4, FINALLY
* force using mxfp4
* properly use lazy tensor
* ggml : add mxfp4
  ggml : use e8m0 conversion instead of powf
  Co-authored-by: Diego Devesa <slarengh@gmail.com>
  change kvalues_mxfp4 table to match e2m1 (#6)
  metal : remove quantization for now (not used)
  cuda : fix disabled CUDA graphs due to ffn moe bias
  vulkan : add support for mxfp4
  cont : add cm2 dequant
* ggml : add ggml_add_id (#13)
* ggml : add ggml_add_id
* add cuda impl
* llama : add weight support check for add_id
* perf opt
* add vulkan impl
* rename cuda files
* add metal impl
* allow in-place ggml_add_id
* llama : keep biases on CPU with --cpu-moe
* llama : fix compile error
  ggml-ci
* cuda : add fallback for __nv_cvt_e8m0_to_bf16raw
  ggml-ci
* cleanup
  ggml-ci
* sycl : fix supports_op for MXFP4
  ggml-ci
* fix Unknown reasoning format
* ggml-cpu : fix AVX build
  ggml-ci
* fix hip build
  ggml-ci
* cuda : add mxfp4 dequantization support for cuBLAS
  ggml-ci
* ggml-cpu : fix mxfp4 fallback definitions for some architectures
  ggml-ci
* cuda : fix version required for __nv_cvt_e8m0_to_bf16raw

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: slaren <slarengh@gmail.com>
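The mxfp4 items above refer to the OCP microscaling FP4 format: 32 values per block, each stored as a 4-bit E2M1 code, sharing one E8M0 (power-of-two) scale byte. As a rough illustration only, not ggml's implementation, the sketch below expands one such block on the CPU; the function name, the lookup table, and the low/high-nibble ordering are assumptions made for the example.

#include <math.h>
#include <stdint.h>

#define MXFP4_BLOCK_SIZE 32

/* signed E2M1 values indexed by the 4-bit code (MSB is the sign bit) */
static const float e2m1_values[16] = {
     0.0f,  0.5f,  1.0f,  1.5f,  2.0f,  3.0f,  4.0f,  6.0f,
    -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f,
};

/* scale: E8M0 byte, i.e. a power-of-two scale 2^(scale - 127);
 * qs: 16 bytes holding 32 packed E2M1 nibbles (ordering assumed) */
static void dequant_mxfp4_block(uint8_t scale, const uint8_t qs[16],
                                float out[MXFP4_BLOCK_SIZE]) {
    const float d = ldexpf(1.0f, (int) scale - 127);
    for (int i = 0; i < 16; ++i) {
        out[i]      = e2m1_values[qs[i] & 0x0F] * d; /* low nibble  -> first half of block  */
        out[i + 16] = e2m1_values[qs[i] >> 4]   * d; /* high nibble -> second half of block */
    }
}

The QUANT_R == 2 store pattern in the shader below performs the same kind of low/high-nibble de-interleaving on the GPU.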
#version 450

#include "types.comp"
#include "generic_unary_head.comp"
#include "dequant_funcs.comp"

#if defined(DATA_A_IQ4_NL) || defined(DATA_A_MXFP4)
// 16 invocations needed for init_iq_shmem
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
#else
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
#endif

void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
    init_iq_shmem(gl_WorkGroupSize);
    // only the first invocation of the workgroup does the actual dequantization
    if (gl_LocalInvocationIndex.x != 0) {
        return;
    }
#endif

    // each workgroup handles one block of QUANT_K elements; the y and z dispatch
    // dimensions extend the range with strides of 512 and 512*512 elements
    const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;

    if (idx >= p.ne) {
        return;
    }

    uint dst_idx = get_doffset() + dst_idx(idx);
    uint src_idx = src0_idx_quant(idx, QUANT_K);

    const uint a_offset = 0;
    const uint ib = src_idx;
    const vec2 dm = get_dm(ib, a_offset);

    [[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
        vec4 v = dequantize4(ib, j / QUANT_R, a_offset);
        // apply the per-block scale (dm.x) and offset (dm.y)
        v = v * dm.x + vec4(dm.y);

#if QUANT_R == 2
        // two values per byte: low nibbles fill the first half of the block,
        // high nibbles the second half
        data_d[dst_idx + j/2 + 0] = v[0];
        data_d[dst_idx + j/2 + QUANT_K/2 + 0] = v[1];
        data_d[dst_idx + j/2 + 1] = v[2];
        data_d[dst_idx + j/2 + QUANT_K/2 + 1] = v[3];
#else
        data_d[dst_idx + j + 0] = v[0];
        data_d[dst_idx + j + 1] = v[1];
        data_d[dst_idx + j + 2] = v[2];
        data_d[dst_idx + j + 3] = v[3];
#endif
    }
}