mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	metal : first working version of the inference without prompt processing
Bonus: supports partial inference on the CPU
This commit is contained in:
		
							
								
								
									
										12
									
								
								ggml-metal.m
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								ggml-metal.m
									
									
									
									
									
								
							@@ -237,7 +237,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fprintf(stderr, "%s: error: buffer is nil\n", __func__);
 | 
			
		||||
    fprintf(stderr, "%s: error: buffer is nil for tensor '%s'\n", __func__, t->name);
 | 
			
		||||
 | 
			
		||||
    return nil;
 | 
			
		||||
}
 | 
			
		||||
@@ -877,15 +877,15 @@ void ggml_metal_graph_compute(
 | 
			
		||||
                                encoder = [command_buffer computeCommandEncoder];
 | 
			
		||||
                            }
 | 
			
		||||
 | 
			
		||||
                            const int n_dims = ((int32_t *) src1->data)[1];
 | 
			
		||||
                            const int mode   = ((int32_t *) src1->data)[2];
 | 
			
		||||
                            const int n_dims = ((int32_t *) dst->op_params)[1];
 | 
			
		||||
                            const int mode   = ((int32_t *) dst->op_params)[2];
 | 
			
		||||
 | 
			
		||||
                            const int n_past = ((int32_t *)(src1->data))[0];
 | 
			
		||||
                            const int n_past = ((int32_t *)(dst->op_params))[0];
 | 
			
		||||
 | 
			
		||||
                            float freq_base;
 | 
			
		||||
                            float freq_scale;
 | 
			
		||||
                            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
 | 
			
		||||
                            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
 | 
			
		||||
                            memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
 | 
			
		||||
                            memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
 | 
			
		||||
 | 
			
		||||
                            [encoder setComputePipelineState:ctx->pipeline_rope];
 | 
			
		||||
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
 | 
			
		||||
 
 | 
			
		||||
@@ -2845,10 +2845,15 @@ struct llama_context * llama_new_context_with_model(
 | 
			
		||||
 | 
			
		||||
        struct ggml_backend_buffer * buf_compute = ctx->buf_compute_metal->backend_buffer;
 | 
			
		||||
        struct ggml_backend_buffer * buf_kv      = ctx->kv_self.buf->backend_buffer;
 | 
			
		||||
        struct ggml_backend_buffer * buf_input   = ctx->buf_input->backend_buffer;
 | 
			
		||||
        struct ggml_backend_buffer * buf_output  = ctx->buf_output->backend_buffer;
 | 
			
		||||
 | 
			
		||||
        LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "eval", buf_compute->backend_data, buf_compute->backend_size, 0));
 | 
			
		||||
        LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "kv",   buf_kv->backend_data,      buf_kv->backend_size,      0));
 | 
			
		||||
 | 
			
		||||
        LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_input->backend_data,  buf_input->backend_size,  0));
 | 
			
		||||
        LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "inp", buf_output->backend_data, buf_output->backend_size, 0));
 | 
			
		||||
 | 
			
		||||
        //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
 | 
			
		||||
        //LLAMA_METAL_CHECK_BUF(ggml_backend_metal_map_buffer(ctx->model.backend_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 | 
			
		||||
#undef LLAMA_METAL_CHECK_BUF
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user