metal : detect tensor support

This commit is contained in:
Georgi Gerganov
2025-10-18 17:23:16 +03:00
parent 591b60340d
commit f8416cfa01
4 changed files with 16 additions and 12 deletions

View File

@@ -35,7 +35,6 @@ struct ggml_metal {
 // additional, inference-time compiled pipelines
 ggml_metal_pipelines_t pipelines_ext;
-bool use_bfloat;
 bool use_fusion;
 bool use_concurrency;
 bool use_graph_optimize;
@@ -121,11 +120,10 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
 }
 }
-const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
+//const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
 res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
-res->use_bfloat = props_dev->has_bfloat;
 res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
 res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
@@ -147,7 +145,6 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
 memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));
-GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false");
 GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false");
 GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false");
 GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");

View File

@@ -193,6 +193,7 @@ struct ggml_metal_device_props {
 bool has_simdgroup_mm;
 bool has_unified_memory;
 bool has_bfloat;
+bool has_tensor;
 bool use_residency_sets;
 bool use_shared_buffers;

View File

@@ -21,8 +21,9 @@
 #define GGML_METAL_HAS_RESIDENCY_SETS 1
 #endif
-// overload of MTLGPUFamilyMetal3 (not available in some environments)
+// overload of MTLGPUFamilyMetalX (not available in some environments)
 static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
+static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
 // virtual address for GPU memory allocations
 static atomic_uintptr_t g_addr_device = 0x000000400ULL;
@@ -261,6 +262,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
 [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
 }
+if (ggml_metal_device_get_props(dev)->has_tensor) {
+    [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
+}
 #if GGML_METAL_EMBED_LIBRARY
 [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
 #endif
@@ -470,6 +475,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6];
+dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML];
 dev->props.use_residency_sets = true;
 #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
 dev->props.use_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == nil;
@@ -529,6 +536,7 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false");
 GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false");
 GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false");
+GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false");
 GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false");
 GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? "true" : "false");

View File

@@ -9,9 +9,7 @@ __embed_ggml-common.h__
 #include <metal_stdlib>
-#define GGML_METAL_USE_METAL4
-#ifdef GGML_METAL_USE_METAL4
+#ifdef GGML_METAL_HAS_TENSOR
 #include <metal_tensor>
 #include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
@@ -8196,7 +8194,7 @@ kernel void kernel_mul_mm(
 + args.nb11*(r1 + lr1)
 + args.nb10*iy);
-#ifndef GGML_METAL_USE_METAL4
+#ifndef GGML_METAL_HAS_TENSOR
 S0_8x8 ma[4];
 S1_8x8 mb[2];
@@ -8217,7 +8215,7 @@ kernel void kernel_mul_mm(
 #endif
 for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
-#ifndef GGML_METAL_USE_METAL4
+#ifndef GGML_METAL_HAS_TENSOR
 // load data and store to threadgroup memory
 if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
 threadgroup_barrier(mem_flags::mem_threadgroup);
@@ -8397,7 +8395,7 @@ kernel void kernel_mul_mm(
 if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
 // if no bounds checks on the output are needed, we can directly write to device memory
-#ifdef GGML_METAL_USE_METAL4
+#ifdef GGML_METAL_HAS_TENSOR
 device float * C = (device float *) dst +
 r0 + \
 r1 * args.ne0 + im*args.ne1*args.ne0;
@@ -8419,7 +8417,7 @@ kernel void kernel_mul_mm(
 threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
-#ifdef GGML_METAL_USE_METAL4
+#ifdef GGML_METAL_HAS_TENSOR
 auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
 cT.store(tC);
 #else