metal: somewhat faster f16 x f32 matrix multiply kernel (#2951)

* Somewhat faster f16 x f32 matrix multiply kernel
* Better use 32 thread groups for f16 x f32

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2025-10-27 08:21:30 +00:00 · 2023-09-01 11:15:57 +03:00
parent bce1fef328
commit e8d9158925
2 changed files with 29 additions and 11 deletions
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -840,7 +840,7 @@ void ggml_metal_graph_compute(
                                switch (src0t) {
                                    case GGML_TYPE_F16:
                                        {
-                                            nth0 = 64;
+                                            nth0 = 32;
                                            nth1 = 1;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
                                        } break;