Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	llama : do not cap thread count when MoE on CPU (#5419)
* Not capping thread count when MoE inference is running on CPU
* Whitespace
Paul Tsochantaris, committed by GitHub
parent e4124c2477
commit e5ca3937c6
@@ -7285,7 +7285,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     //       with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
 
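For context, below is a minimal standalone sketch of the thread-count decision after this change. It is not the llama.cpp implementation: the function name decode_thread_count and its parameters (n_expert, has_blas, has_gpublas) are illustrative assumptions standing in for hparams.n_expert, ggml_cpu_has_blas() and ggml_cpu_has_gpublas().

    // Hedged sketch of the patched condition, not the actual llama_decode_internal code.
    #include <algorithm>
    #include <cstdint>

    static int decode_thread_count(int n_threads, int n_tokens, uint32_t n_expert,
                                   bool has_blas, bool has_gpublas) {
        // With a CPU BLAS backend and a large batch, most of the work happens inside
        // the BLAS calls, so only a few threads are needed for the non-mul_mat ops.
        // MoE models (n_expert > 0) do not route their expert mat-muls through BLAS,
        // so the cap is skipped to keep all requested threads available.
        if (n_tokens >= 32 && n_expert == 0 && has_blas && !has_gpublas) {
            return std::min(4, n_threads);
        }
        return n_threads;
    }

With an MoE model (n_expert > 0) the sketch returns the caller's full thread count, which is the behaviour this commit introduces for CPU inference.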