mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	ggml : update softmax n_task calculation (#5126)
Updated the n_tasks calculation to use the maximum number of threads available. This improved prompt eval performance by around 5% for DOT kernels and by around 10% for MMLA kernels on AWS Graviton3.
Changed files: ggml.c (2 changed lines: 1 addition, 1 deletion)
@@ -16597,7 +16597,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_SOFT_MAX:
             {
-                n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
+                n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
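For context, here is a minimal standalone sketch (hypothetical demo code, not part of the ggml API) of what the patched formula changes. Before this commit, the soft-max task count was hard-capped at 4, so a 16-thread run would still split the rows across only 4 tasks; the new formula scales up to all available threads, still bounded by the number of rows so small tensors do not spawn idle tasks.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    /* Example values; n_threads = 16 stands in for a many-core
     * machine such as an AWS Graviton3 instance. */
    const int n_threads = 16;
    const int n_rows    = 512;  /* rows in the soft-max input tensor */

    /* Before the patch: task count never exceeds 4. */
    int n_tasks_old = MIN(MIN(4, n_threads), n_rows);

    /* After the patch: task count scales with the thread count,
     * capped by the number of rows to process. */
    int n_tasks_new = MIN(n_threads, n_rows);

    printf("old n_tasks = %d, new n_tasks = %d\n", n_tasks_old, n_tasks_new);
    return 0;
}

With these example values the old formula yields 4 tasks and the new one yields 16, which matches the commit message's claim that using more threads for the soft-max rows improves prompt eval throughput.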
Author: snadampal