	llama : add gpt-oss (#15091)
* oai moe
* compat with new checkpoint
* add attn sink impl
* add rope scaling yarn
* logits match with latest transformers code
* wip chat template
* rm trailing space
* use ggml_scale_bias
* rm redundant is_swa_all
* convert interleaved gate_up
* graph : fix activation function to match reference (#7)
* vocab : handle o200k_harmony special tokens
* ggml : add attention sinks support (#1)
* llama : add attn sinks
* ggml : add attn sinks
* cuda : add attn sinks
* vulkan : add support for sinks in softmax
  remove unnecessary return
* ggml : add fused swiglu_oai op (#11)
* ggml : add fused swiglu_oai op
* Update ggml/src/ggml-cpu/ops.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* update CUDA impl
* cont : metal impl
* add vulkan impl
* test-backend-ops : more test cases, clean up
* llama : remove unfused impl
* remove extra lines
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---------
Co-authored-by: slaren <slarengh@gmail.com>
* repack mxfp4 upon conversion
* clean up a bit
* enable thinking
* add quick hack to render only some special tokens
* fix bf16 conversion
* remove vocab hack
* webui ok
* support chat parsing for gpt-oss
* fix webui
* direct mapping mxfp4, FINALLY
* force using mxfp4
* properly use lazy tensor
* ggml : add mxfp4
  ggml : use e8m0 conversion instead of powf
  Co-authored-by: Diego Devesa <slarengh@gmail.com>
  change kvalues_mxfp4 table to match e2m1 (#6)
  metal : remove quantization for now (not used)
  cuda : fix disabled CUDA graphs due to ffn moe bias
  vulkan : add support for mxfp4
  cont : add cm2 dequant
* ggml : add ggml_add_id (#13)
* ggml : add ggml_add_id
* add cuda impl
* llama : add weight support check for add_id
* perf opt
* add vulkan impl
* rename cuda files
* add metal impl
* allow in-place ggml_add_id
* llama : keep biases on CPU with --cpu-moe
* llama : fix compile error ggml-ci
* cuda : add fallback for __nv_cvt_e8m0_to_bf16raw ggml-ci
* cleanup ggml-ci
* sycl : fix supports_op for MXFP4 ggml-ci
* fix Unknown reasoning format
* ggml-cpu : fix AVX build ggml-ci
* fix hip build ggml-ci
* cuda : add mxfp4 dequantization support for cuBLAS ggml-ci
* ggml-cpu : fix mxfp4 fallback definitions for some architectures ggml-ci
* cuda : fix version required for __nv_cvt_e8m0_to_bf16raw
---------
Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: slaren <slarengh@gmail.com>
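Among the changes above, "attention sinks" give each attention head a learned logit that takes part in the softmax normalization but never contributes to the output, so a head can effectively attend to nothing. A minimal illustrative sketch of that idea in plain C++ (not the actual ggml_soft_max_ext kernels, which each backend implements separately):

// Illustrative sketch: softmax over one row of attention scores with a
// per-head "sink" logit. The sink joins the denominator, so the remaining
// probabilities can sum to less than 1 for this row.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> softmax_with_sink(const std::vector<float> & scores, float sink) {
    float max_val = sink;
    for (float s : scores) {
        max_val = std::max(max_val, s);      // subtract the max for numerical stability
    }
    float denom = std::exp(sink - max_val);  // the sink contributes to the normalization ...
    std::vector<float> probs(scores.size());
    for (size_t i = 0; i < scores.size(); ++i) {
        probs[i] = std::exp(scores[i] - max_val);
        denom   += probs[i];
    }
    for (float & p : probs) {
        p /= denom;                          // ... but is never multiplied with V
    }
    return probs;
}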
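The MXFP4 work mentioned above stores the MoE expert weights in the OCP microscaling FP4 format: blocks of 32 E2M1 values sharing one E8M0 scale (a pure power of two). A hedged sketch of the decoding step under that assumption — the block_mxfp4 layout and kernels in ggml may pack things differently, and the struct below is hypothetical:

// Hedged sketch of MXFP4 dequantization: 32 FP4 (E2M1) elements per block,
// one shared E8M0 exponent-only scale. Using ldexp instead of powf mirrors
// the "use e8m0 conversion instead of powf" note in the commit message.
#include <cmath>
#include <cstdint>

// magnitudes representable by E2M1 (the sign lives in the top bit of each nibble)
static const float kE2M1[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};

struct mxfp4_block_sketch {   // hypothetical layout, for illustration only
    uint8_t e;                // E8M0 scale: value = 2^(e - 127); the NaN code is ignored here
    uint8_t qs[16];           // 32 4-bit codes, two per byte
};

void dequant_mxfp4(const mxfp4_block_sketch & b, float out[32]) {
    const float d = std::ldexp(1.0f, (int) b.e - 127);          // exact power-of-two scale
    for (int i = 0; i < 16; ++i) {
        for (int nib = 0; nib < 2; ++nib) {
            const uint8_t q   = (b.qs[i] >> (4 * nib)) & 0x0F;
            const float   mag = kE2M1[q & 0x07];                 // low 3 bits: exponent + mantissa
            out[2 * i + nib]  = (q & 0x08) ? -d * mag : d * mag; // bit 3: sign
        }
    }
}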
@@ -88,6 +88,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
     { LLM_ARCH_HUNYUAN_DENSE,    "hunyuan-dense"    },
     { LLM_ARCH_SMOLLM3,          "smollm3"          },
+    { LLM_ARCH_OPENAI_MOE,       "gpt-oss"          },
     { LLM_ARCH_LFM2,             "lfm2"             },
     { LLM_ARCH_DREAM,            "dream"            },
     { LLM_ARCH_SMALLTHINKER,     "smallthinker"     },
@@ -1971,6 +1972,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OPENAI_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_POST_NORM,     "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_SINKS,         "blk.%d.attn_sinks" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_LFM2,
         {
@@ -2086,6 +2106,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_SINKS,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
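For reference, the "%d" placeholders in the new LLM_ARCH_OPENAI_MOE table above (e.g. "blk.%d.attn_sinks") are filled with the layer index when tensors are looked up in the GGUF file; llama.cpp uses its own LLM_TN helper for this, which as far as I understand also appends a suffix such as ".weight". A trivial sketch of the substitution, with a hypothetical helper name:

// Minimal sketch: expanding a per-layer tensor name template from the table above.
#include <cstdio>
#include <string>

static std::string blk_tensor_name(const char * fmt, int layer, const char * suffix) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, layer);  // substitute the layer index for "%d"
    return std::string(buf) + "." + suffix;
}

// blk_tensor_name("blk.%d.attn_sinks", 0, "weight") -> "blk.0.attn_sinks.weight"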