	ggml : update softmax n_task calculation (#5126)
Updated the n_tasks calculation for GGML_OP_SOFT_MAX to use the maximum number of threads available instead of capping it at 4. This improves prompt-eval performance by around 5% for DOT kernels and by around 10% for MMLA kernels on AWS Graviton3.
1 changed file: ggml.c (+1, -1)
@@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
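For context, here is a minimal C sketch of why the cap mattered. It is not the actual ggml source: the interleaved row-splitting scheme and the helper names (soft_max_row, soft_max_worker) are illustrative assumptions. It shows a softmax parallelized over rows, where worker ith of n_tasks handles every n_tasks-th row; with the old MIN(MIN(4, n_threads), nrows), at most 4 workers ever got rows, leaving the remaining cores idle.

#include <math.h>
#include <stddef.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* softmax over a single contiguous row of length nc */
static void soft_max_row(float * row, size_t nc) {
    float vmax = row[0];
    for (size_t i = 1; i < nc; i++) {
        if (row[i] > vmax) vmax = row[i];
    }
    float sum = 0.0f;
    for (size_t i = 0; i < nc; i++) {
        row[i] = expf(row[i] - vmax);
        sum += row[i];
    }
    for (size_t i = 0; i < nc; i++) {
        row[i] /= sum;
    }
}

/* worker ith of n_tasks processes rows ith, ith + n_tasks, ith + 2*n_tasks, ...
 * (an assumed partitioning scheme for illustration); with
 * n_tasks = MIN(n_threads, nrows), every thread gets rows whenever
 * nrows >= n_threads, instead of only the first 4 */
static void soft_max_worker(float * data, size_t nrows, size_t nc,
                            int ith, int n_tasks) {
    for (size_t r = (size_t) ith; r < nrows; r += (size_t) n_tasks) {
        soft_max_row(data + r * nc, nc);
    }
}

int main(void) {
    const int n_threads = 16;  /* e.g. cores available to the compute graph */
    const int nrows     = 64;  /* rows in the softmax input tensor          */

    int n_tasks_old = MIN(MIN(4, n_threads), nrows);  /* before: 4  */
    int n_tasks_new = MIN(n_threads, nrows);          /* after:  16 */
    printf("old n_tasks = %d, new n_tasks = %d\n", n_tasks_old, n_tasks_new);

    float data[64 * 8];
    for (size_t i = 0; i < 64 * 8; i++) data[i] = (float) (i % 8);

    /* workers run serially here; in ggml each ith runs on its own thread */
    for (int ith = 0; ith < n_tasks_new; ith++) {
        soft_max_worker(data, 64, 8, ith, n_tasks_new);
    }
    printf("row 0: %f %f ...\n", (double) data[0], (double) data[1]);
    return 0;
}

Under these assumed numbers (16 threads, 64 rows), the change raises the softmax task count from 4 to 16, which is consistent with the Graviton3 speedups reported in the commit message, since prompt evaluation runs softmax over many rows at once.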
Author: snadampal