Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	llama : do not cap thread count when MoE on CPU (#5419)
* Not capping thread count when MoE inference is running on CPU
* Whitespace
Author: Paul Tsochantaris
Committed by: GitHub
parent e4124c2477
commit e5ca3937c6
@@ -7285,7 +7285,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     //       with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
 
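For context, here is a minimal standalone sketch of the decision this patch changes. The helpers `cpu_has_blas()` / `cpu_has_gpublas()` and the `effective_threads()` wrapper are hypothetical stand-ins for the checks inside `llama_decode_internal`; only the condition itself mirrors the patched code.

    // Sketch (not llama.cpp code): cap threads only for large dense batches that
    // will be handed to CPU BLAS; MoE models (n_expert > 0) keep all threads,
    // since per the commit Accelerate/BLAS is not involved for MoE processing.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for ggml_cpu_has_blas() / ggml_cpu_has_gpublas().
    static bool cpu_has_blas()    { return true;  }  // e.g. Accelerate on Apple Silicon
    static bool cpu_has_gpublas() { return false; }  // no GPU-backed BLAS

    static int effective_threads(int n_threads, int n_tokens, uint32_t n_expert) {
        if (n_tokens >= 32 && n_expert == 0 && cpu_has_blas() && !cpu_has_gpublas()) {
            n_threads = std::min(4, n_threads);  // leave the CPU to BLAS
        }
        return n_threads;
    }

    int main() {
        // Dense model, large batch: capped to 4 threads so they do not interfere with BLAS.
        printf("dense: %d\n", effective_threads(8, 512, /*n_expert=*/0));  // -> 4
        // MoE model (e.g. 8 experts): full thread count is kept.
        printf("moe:   %d\n", effective_threads(8, 512, /*n_expert=*/8));  // -> 8
        return 0;
    }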