Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
		
		
		
	metal : detect tensor support
This commit is contained in:
		| @@ -35,7 +35,6 @@ struct ggml_metal { | ||||
|     // additional, inference-time compiled pipelines | ||||
|     ggml_metal_pipelines_t pipelines_ext; | ||||
|  | ||||
|     bool use_bfloat; | ||||
|     bool use_fusion; | ||||
|     bool use_concurrency; | ||||
|     bool use_graph_optimize; | ||||
| @@ -121,11 +120,10 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); | ||||
|     //const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); | ||||
|  | ||||
|     res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); | ||||
|  | ||||
|     res->use_bfloat      = props_dev->has_bfloat; | ||||
|     res->use_fusion      = getenv("GGML_METAL_FUSION_DISABLE") == nil; | ||||
|     res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil; | ||||
|  | ||||
| @@ -147,7 +145,6 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { | ||||
|  | ||||
|     memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt)); | ||||
|  | ||||
|     GGML_LOG_INFO("%s: use bfloat         = %s\n", __func__, res->use_bfloat         ? "true" : "false"); | ||||
|     GGML_LOG_INFO("%s: use fusion         = %s\n", __func__, res->use_fusion         ? "true" : "false"); | ||||
|     GGML_LOG_INFO("%s: use concurrency    = %s\n", __func__, res->use_concurrency    ? "true" : "false"); | ||||
|     GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false"); | ||||
|   | ||||
| @@ -193,6 +193,7 @@ struct ggml_metal_device_props { | ||||
|     bool has_simdgroup_mm; | ||||
|     bool has_unified_memory; | ||||
|     bool has_bfloat; | ||||
|     bool has_tensor; | ||||
|     bool use_residency_sets; | ||||
|     bool use_shared_buffers; | ||||
|  | ||||
|   | ||||
| @@ -21,8 +21,9 @@ | ||||
| #define GGML_METAL_HAS_RESIDENCY_SETS 1 | ||||
| #endif | ||||
|  | ||||
| // overload of MTLGPUFamilyMetal3 (not available in some environments) | ||||
| // overload of MTLGPUFamilyMetalX (not available in some environments) | ||||
| static const NSInteger MTLGPUFamilyMetal3_GGML = 5001; | ||||
| static const NSInteger MTLGPUFamilyMetal4_GGML = 5002; | ||||
|  | ||||
| // virtual address for GPU memory allocations | ||||
| static atomic_uintptr_t g_addr_device = 0x000000400ULL; | ||||
| @@ -261,6 +262,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) { | ||||
|                     [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"]; | ||||
|                 } | ||||
|  | ||||
|                 if (ggml_metal_device_get_props(dev)->has_tensor) { | ||||
|                     [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"]; | ||||
|                 } | ||||
|  | ||||
| #if GGML_METAL_EMBED_LIBRARY | ||||
|                 [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"]; | ||||
| #endif | ||||
| @@ -470,6 +475,8 @@ ggml_metal_device_t ggml_metal_device_init(void) { | ||||
|             dev->props.has_bfloat  = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; | ||||
|             dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6]; | ||||
|  | ||||
|             dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML]; | ||||
|  | ||||
|             dev->props.use_residency_sets = true; | ||||
| #if defined(GGML_METAL_HAS_RESIDENCY_SETS) | ||||
|             dev->props.use_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil; | ||||
| @@ -529,6 +536,7 @@ ggml_metal_device_t ggml_metal_device_init(void) { | ||||
|             GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm        ? "true" : "false"); | ||||
|             GGML_LOG_INFO("%s: has unified memory    = %s\n", __func__, dev->props.has_unified_memory      ? "true" : "false"); | ||||
|             GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, dev->props.has_bfloat              ? "true" : "false"); | ||||
|             GGML_LOG_INFO("%s: has tensor            = %s\n", __func__, dev->props.has_tensor              ? "true" : "false"); | ||||
|             GGML_LOG_INFO("%s: use residency sets    = %s\n", __func__, dev->props.use_residency_sets      ? "true" : "false"); | ||||
|             GGML_LOG_INFO("%s: use shared buffers    = %s\n", __func__, dev->props.use_shared_buffers      ? "true" : "false"); | ||||
|  | ||||
|   | ||||
| @@ -9,9 +9,7 @@ __embed_ggml-common.h__ | ||||
|  | ||||
| #include <metal_stdlib> | ||||
|  | ||||
| #define GGML_METAL_USE_METAL4 | ||||
|  | ||||
| #ifdef GGML_METAL_USE_METAL4 | ||||
| #ifdef GGML_METAL_HAS_TENSOR | ||||
| #include <metal_tensor> | ||||
|  | ||||
| #include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> | ||||
| @@ -8196,7 +8194,7 @@ kernel void kernel_mul_mm( | ||||
|         + args.nb11*(r1 + lr1) | ||||
|         + args.nb10*iy); | ||||
|  | ||||
| #ifndef GGML_METAL_USE_METAL4 | ||||
| #ifndef GGML_METAL_HAS_TENSOR | ||||
|     S0_8x8 ma[4]; | ||||
|     S1_8x8 mb[2]; | ||||
|  | ||||
| @@ -8217,7 +8215,7 @@ kernel void kernel_mul_mm( | ||||
| #endif | ||||
|  | ||||
|     for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) { | ||||
| #ifndef GGML_METAL_USE_METAL4 | ||||
| #ifndef GGML_METAL_HAS_TENSOR | ||||
|         // load data and store to threadgroup memory | ||||
|         if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) { | ||||
|             threadgroup_barrier(mem_flags::mem_threadgroup); | ||||
| @@ -8397,7 +8395,7 @@ kernel void kernel_mul_mm( | ||||
|  | ||||
|     if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) { | ||||
|         // if no bounds checks on the output are needed, we can directly write to device memory | ||||
| #ifdef GGML_METAL_USE_METAL4 | ||||
| #ifdef GGML_METAL_HAS_TENSOR | ||||
|         device float * C = (device float *) dst + | ||||
|             r0 + \ | ||||
|             r1 * args.ne0 + im*args.ne1*args.ne0; | ||||
| @@ -8419,7 +8417,7 @@ kernel void kernel_mul_mm( | ||||
|  | ||||
|         threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0; | ||||
|  | ||||
| #ifdef GGML_METAL_USE_METAL4 | ||||
| #ifdef GGML_METAL_HAS_TENSOR | ||||
|         auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1)); | ||||
|         cT.store(tC); | ||||
| #else | ||||
|   | ||||
Reference in New Issue · Block a user
Author: Georgi Gerganov