mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	metal : print more GPU info + disable mul_mm for MTLGPUFamiliy < Apple7
This commit is contained in:
		
							
								
								
									
										102
									
								
								ggml-metal.m
									
									
									
									
									
								
							
							
						
						
									
										102
									
								
								ggml-metal.m
									
									
									
									
									
								
							| @@ -274,16 +274,18 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { | |||||||
|         GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); |         GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); |         GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); |         GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); |         if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); |             GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); | ||||||
|  |             GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); | ||||||
|  |         } | ||||||
|         GGML_METAL_ADD_KERNEL(rope_f32); |         GGML_METAL_ADD_KERNEL(rope_f32); | ||||||
|         GGML_METAL_ADD_KERNEL(rope_f16); |         GGML_METAL_ADD_KERNEL(rope_f16); | ||||||
|         GGML_METAL_ADD_KERNEL(alibi_f32); |         GGML_METAL_ADD_KERNEL(alibi_f32); | ||||||
| @@ -296,8 +298,22 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { | |||||||
| #undef GGML_METAL_ADD_KERNEL | #undef GGML_METAL_ADD_KERNEL | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); |  | ||||||
| #if TARGET_OS_OSX | #if TARGET_OS_OSX | ||||||
|  |     // print MTL GPU family: | ||||||
|  |     GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]); | ||||||
|  |     GGML_METAL_LOG_INFO("%s: GPU arch:   %s\n", __func__, [[ctx->device architecture].name UTF8String]); | ||||||
|  |  | ||||||
|  |     // determine max supported GPU family | ||||||
|  |     // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf | ||||||
|  |     // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf | ||||||
|  |     for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) { | ||||||
|  |         if ([ctx->device supportsFamily:i]) { | ||||||
|  |             GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i); | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); | ||||||
|     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); |     GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); | ||||||
|     if (ctx->device.maxTransferRate != 0) { |     if (ctx->device.maxTransferRate != 0) { | ||||||
|         GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); |         GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); | ||||||
| @@ -351,16 +367,18 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { | |||||||
|     GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); |     GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); |     GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); |     GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); |     if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); |         GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); | ||||||
|  |         GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); | ||||||
|  |     } | ||||||
|     GGML_METAL_DEL_KERNEL(rope_f32); |     GGML_METAL_DEL_KERNEL(rope_f32); | ||||||
|     GGML_METAL_DEL_KERNEL(rope_f16); |     GGML_METAL_DEL_KERNEL(rope_f16); | ||||||
|     GGML_METAL_DEL_KERNEL(alibi_f32); |     GGML_METAL_DEL_KERNEL(alibi_f32); | ||||||
| @@ -986,32 +1004,36 @@ void ggml_metal_graph_compute( | |||||||
|                         } break; |                         } break; | ||||||
|                     case GGML_OP_MUL_MAT: |                     case GGML_OP_MUL_MAT: | ||||||
|                         { |                         { | ||||||
|                             // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 |  | ||||||
|  |  | ||||||
|                             GGML_ASSERT(ne00 == ne10); |                             GGML_ASSERT(ne00 == ne10); | ||||||
|                             // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere |  | ||||||
|                             uint gqa = ne12/ne02; |  | ||||||
|                             GGML_ASSERT(ne03 == ne13); |                             GGML_ASSERT(ne03 == ne13); | ||||||
|  |  | ||||||
|  |                             const uint gqa = ne12/ne02; | ||||||
|  |  | ||||||
|                             // find the break-even point where the matrix-matrix kernel becomes more efficient compared |                             // find the break-even point where the matrix-matrix kernel becomes more efficient compared | ||||||
|                             // to the matrix-vector kernel. the numbers below are measured on M2 Ultra |                             // to the matrix-vector kernel | ||||||
|                             // not sure if this translates across all chips |  | ||||||
|                             int ne11_mm_min = 1; |                             int ne11_mm_min = 1; | ||||||
|  |  | ||||||
|                             switch (src0t) { | #if 0 | ||||||
|                                 case GGML_TYPE_F16:  ne11_mm_min = 2;  break; |                             // the numbers below are measured on M2 Ultra for 7B and 13B models | ||||||
|                                 case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break; |                             // these numbers do not translate to other devices or model sizes | ||||||
|                                 case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; |                             // TODO: need to find a better approach | ||||||
|                                 case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break; |                             if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { | ||||||
|                                 case GGML_TYPE_Q4_0: |                                 switch (src0t) { | ||||||
|                                 case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; |                                     case GGML_TYPE_F16:  ne11_mm_min = 2;  break; | ||||||
|                                 case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; |                                     case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break; | ||||||
|                                 case GGML_TYPE_Q5_0:                          // not tested yet |                                     case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; | ||||||
|                                 case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet |                                     case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break; | ||||||
|                                 case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break; |                                     case GGML_TYPE_Q4_0: | ||||||
|                                 case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break; |                                     case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; | ||||||
|                                 default:             ne11_mm_min = 1;  break; |                                     case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; | ||||||
|  |                                     case GGML_TYPE_Q5_0:                          // not tested yet | ||||||
|  |                                     case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet | ||||||
|  |                                     case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break; | ||||||
|  |                                     case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break; | ||||||
|  |                                     default:             ne11_mm_min = 1;  break; | ||||||
|  |                                 } | ||||||
|                             } |                             } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|                             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs |                             // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs | ||||||
|                             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel |                             // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel | ||||||
|   | |||||||
| @@ -2332,7 +2332,7 @@ kernel void kernel_get_rows( | |||||||
| } | } | ||||||
|  |  | ||||||
| #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A | #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A | ||||||
| #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A | #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B | ||||||
| #define BLOCK_SIZE_K 32 | #define BLOCK_SIZE_K 32 | ||||||
| #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A | #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A | ||||||
| #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B | #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B | ||||||
| @@ -2459,7 +2459,8 @@ kernel void kernel_mul_mm(device const  uchar * src0, | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         threadgroup_barrier(mem_flags::mem_threadgroup); |         threadgroup_barrier(mem_flags::mem_threadgroup); | ||||||
|         device float * C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; |  | ||||||
|  |         device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; | ||||||
|         if (sgitg == 0) { |         if (sgitg == 0) { | ||||||
|             for (int i = 0; i < n_rows; i++) { |             for (int i = 0; i < n_rows; i++) { | ||||||
|                 for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { |                 for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov