mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	ggml : update softmax n_task calculation (#5126)
Updated the n_tasks calculation to use the maximum number of threads available. This improved prompt eval performance by around 5% for DOT kernels and by around 10% for MMLA kernels on AWS Graviton3.
Changed files: ggml.c (2 changed lines: 1 addition, 1 deletion)
@@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
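For context, here is a minimal standalone sketch (hypothetical demo code, not part of the ggml API) of what the patched formula changes. Before this commit, the soft-max task count was hard-capped at 4, so a 16-thread run would still split the rows across only 4 tasks; the new formula scales up to all available threads, still bounded by the number of rows so small tensors do not spawn idle tasks.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    /* Example values; n_threads = 16 stands in for a many-core
     * machine such as an AWS Graviton3 instance. */
    const int n_threads = 16;
    const int n_rows    = 512;  /* rows in the soft-max input tensor */

    /* Before the patch: task count never exceeds 4. */
    int n_tasks_old = MIN(MIN(4, n_threads), n_rows);

    /* After the patch: task count scales with the thread count,
     * capped by the number of rows to process. */
    int n_tasks_new = MIN(n_threads, n_rows);

    printf("old n_tasks = %d, new n_tasks = %d\n", n_tasks_old, n_tasks_new);
    return 0;
}

With these example values the old formula yields 4 tasks and the new one yields 16, which matches the commit message's claim that using more threads for the soft-max rows improves prompt eval throughput.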
Author: snadampal