mirror of https://github.com/ggml-org/llama.cpp.git
	Move GLM4 f32 attention fix to the correct function (#13750)
@@ -1287,6 +1287,10 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (wo_b) {
@@ -1367,10 +1371,6 @@ ggml_tensor * llm_graph_context::build_attn(
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
-        if (arch == LLM_ARCH_GLM4) {
-            // GLM4 seems to have numerical issues with half-precision accumulators
-            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
-        }
     }
 
     if (wo_b) {
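The fix itself is a single call to ggml_mul_mat_set_prec, which marks one matrix-multiplication node so that its result is accumulated in F32 rather than half precision. The sketch below is a minimal, standalone illustration of that call against the ggml C API; it is not code from this commit, and the tensor shapes, thread count, and header layout are assumptions made for the example.

// Minimal sketch (not from llama.cpp): force F32 accumulation for one matmul node.
#include <string.h>
#include "ggml.h"
#include "ggml-cpu.h"   // ggml_graph_compute_with_ctx in recent ggml source layouts

int main(void) {
    // Small CPU context; sizes are illustrative, not the GLM4 dimensions.
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // An F16 projection weight (playing the role of wo) and an F32 activation tensor.
    struct ggml_tensor * wo  = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 256, 256);
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 8);
    memset(wo->data,  0, ggml_nbytes(wo));
    memset(cur->data, 0, ggml_nbytes(cur));

    // The output projection: cur = wo * cur.
    cur = ggml_mul_mat(ctx, wo, cur);

    // Request F32 accumulators for this node only; by default, backends may
    // accumulate an F16 matmul in half precision.
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, cur);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 2);

    ggml_free(ctx);
    return 0;
}

In the commit, the call is applied conditionally, only when arch == LLM_ARCH_GLM4, immediately after the build_lora_mm(wo, cur) output projection in the build_attn overload shown in the first hunk.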