	build : on Mac OS enable Metal by default (#2901)
* build : on Mac OS enable Metal by default
* make : try to fix build on Linux
* make : move targets back to the top
* make : fix target clean
* llama : enable GPU inference by default with Metal
* llama : fix vocab_only logic when GPU is enabled
* common : better `n_gpu_layers` assignment
* readme : update Metal instructions
* make : fix merge conflict remnants
* gitignore : metal
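Taken together, the changes layer the GPU-offload default: `llama_context_default_params()` now enables one GPU layer on Metal builds, while `gpt_params.n_gpu_layers` starts at `-1` ("defer to the library default") and only overrides it when the user passes `--gpu-layers`/`-ngl`. Below is a minimal sketch of that resolution, assuming the llama.cpp API at this commit; the wrapper function and the trimmed params struct are illustrative, not part of the patch.

```cpp
// Sketch only: how the new defaults compose. llama_context_params and
// llama_context_default_params() come from llama.h; everything else here
// is an illustrative stand-in for the code touched by this commit.
#include <cstdint>
#include "llama.h"

struct cli_params {             // stand-in for the relevant slice of gpt_params
    int32_t n_gpu_layers = -1;  // -1 means: defer to the library default
};

llama_context_params resolve_context_params(const cli_params & params) {
    llama_context_params lparams = llama_context_default_params();
    // On a Metal build (GGML_USE_METAL defined), lparams.n_gpu_layers is already 1,
    // so GPU inference is on unless the user explicitly requested otherwise.
    if (params.n_gpu_layers != -1) {
        lparams.n_gpu_layers = params.n_gpu_layers; // e.g. `-ngl 0` forces CPU-only
    }
    return lparams;
}
```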
.gitignore (vendored): 29 lines changed
							| @@ -31,28 +31,29 @@ tmp/ | ||||
| models/* | ||||
| models-mnt | ||||
|  | ||||
| /main | ||||
| /quantize | ||||
| /quantize-stats | ||||
| /result | ||||
| /perplexity | ||||
| /embedding | ||||
| /train-text-from-scratch | ||||
| /convert-llama2c-to-ggml | ||||
| /simple | ||||
| /benchmark-matmult | ||||
| /vdot | ||||
| /server | ||||
| /Pipfile | ||||
| /baby-llama | ||||
| /beam-search | ||||
| /benchmark-matmult | ||||
| /convert-llama2c-to-ggml | ||||
| /embd-input-test | ||||
| /embedding | ||||
| /gguf | ||||
| /gguf-llama-simple | ||||
| /libllama.so | ||||
| /llama-bench | ||||
| /baby-llama | ||||
| /beam-search | ||||
| /main | ||||
| /metal | ||||
| /perplexity | ||||
| /quantize | ||||
| /quantize-stats | ||||
| /result | ||||
| /save-load-state | ||||
| /server | ||||
| /simple | ||||
| /speculative | ||||
| /train-text-from-scratch | ||||
| /vdot | ||||
| build-info.h | ||||
| arm_neon.h | ||||
| compile_commands.json | ||||
|   | ||||
| @@ -36,6 +36,12 @@ endif() | ||||
| # Option list | ||||
| # | ||||
|  | ||||
| if (APPLE) | ||||
|     set(LLAMA_METAL_DEFAULT ON) | ||||
| else() | ||||
|     set(LLAMA_METAL_DEFAULT OFF) | ||||
| endif() | ||||
|  | ||||
| # general | ||||
| option(LLAMA_STATIC                     "llama: static link libraries"                          OFF) | ||||
| option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF) | ||||
| @@ -76,7 +82,7 @@ option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some | ||||
| set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") | ||||
| option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF) | ||||
| option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF) | ||||
| option(LLAMA_METAL                           "llama: use Metal"                                 OFF) | ||||
| option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT}) | ||||
| option(LLAMA_MPI                             "llama: use MPI"                                   OFF) | ||||
| option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON) | ||||
| option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF) | ||||
| @@ -158,6 +164,31 @@ if (APPLE AND LLAMA_ACCELERATE) | ||||
|     endif() | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_METAL) | ||||
|     find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED) | ||||
|     find_library(METAL_FRAMEWORK            Metal                   REQUIRED) | ||||
|     find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED) | ||||
|  | ||||
|     message(STATUS "Metal framework found") | ||||
|  | ||||
|     set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) | ||||
|  | ||||
|     add_compile_definitions(GGML_USE_METAL) | ||||
|     #add_compile_definitions(GGML_METAL_NDEBUG) | ||||
|  | ||||
|     # get full path to the file | ||||
|     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") | ||||
|  | ||||
|     # copy ggml-metal.metal to bin directory | ||||
|     configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY) | ||||
|  | ||||
|     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} | ||||
|         ${FOUNDATION_LIBRARY} | ||||
|         ${METAL_FRAMEWORK} | ||||
|         ${METALKIT_FRAMEWORK} | ||||
|         ) | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_BLAS) | ||||
|     if (LLAMA_STATIC) | ||||
|         set(BLA_STATIC ON) | ||||
| @@ -293,29 +324,6 @@ if (LLAMA_CUBLAS) | ||||
|     endif() | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_METAL) | ||||
|     find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED) | ||||
|     find_library(METAL_FRAMEWORK            Metal                   REQUIRED) | ||||
|     find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED) | ||||
|  | ||||
|     set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) | ||||
|  | ||||
|     add_compile_definitions(GGML_USE_METAL) | ||||
|     #add_compile_definitions(GGML_METAL_NDEBUG) | ||||
|  | ||||
|     # get full path to the file | ||||
|     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") | ||||
|  | ||||
|     # copy ggml-metal.metal to bin directory | ||||
|     configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY) | ||||
|  | ||||
|     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} | ||||
|         ${FOUNDATION_LIBRARY} | ||||
|         ${METAL_FRAMEWORK} | ||||
|         ${METALKIT_FRAMEWORK} | ||||
|         ) | ||||
| endif() | ||||
|  | ||||
| if (LLAMA_MPI) | ||||
|     cmake_minimum_required(VERSION 3.10) | ||||
|     find_package(MPI) | ||||
|   | ||||
							
								
								
									
Makefile: 76 lines changed
							| @@ -7,6 +7,39 @@ TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-dou | ||||
| # Code coverage output files | ||||
| COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report | ||||
|  | ||||
| ifndef UNAME_S | ||||
| UNAME_S := $(shell uname -s) | ||||
| endif | ||||
|  | ||||
| ifndef UNAME_P | ||||
| UNAME_P := $(shell uname -p) | ||||
| endif | ||||
|  | ||||
| ifndef UNAME_M | ||||
| UNAME_M := $(shell uname -m) | ||||
| endif | ||||
|  | ||||
| # Mac OS + Arm can report x86_64 | ||||
| # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 | ||||
| ifeq ($(UNAME_S),Darwin) | ||||
| 	ifndef LLAMA_NO_METAL | ||||
| 		LLAMA_METAL := 1 | ||||
| 	endif | ||||
|  | ||||
| 	ifneq ($(UNAME_P),arm) | ||||
| 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) | ||||
| 		ifeq ($(SYSCTL_M),1) | ||||
| 			# UNAME_P := arm | ||||
| 			# UNAME_M := arm64 | ||||
| 			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) | ||||
| 		endif | ||||
| 	endif | ||||
| endif | ||||
|  | ||||
| ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' | ||||
| BUILD_TARGETS += metal | ||||
| endif | ||||
|  | ||||
| default: $(BUILD_TARGETS) | ||||
|  | ||||
| test: | ||||
| @@ -38,18 +71,6 @@ gcovr-report: coverage ## Generate gcovr report | ||||
| 	mkdir -p gcovr-report | ||||
| 	gcovr --root . --html --html-details --output gcovr-report/coverage.html | ||||
|  | ||||
| ifndef UNAME_S | ||||
| UNAME_S := $(shell uname -s) | ||||
| endif | ||||
|  | ||||
| ifndef UNAME_P | ||||
| UNAME_P := $(shell uname -p) | ||||
| endif | ||||
|  | ||||
| ifndef UNAME_M | ||||
| UNAME_M := $(shell uname -m) | ||||
| endif | ||||
|  | ||||
| ifdef RISCV_CROSS_COMPILE | ||||
| CC	:= riscv64-unknown-linux-gnu-gcc | ||||
| CXX	:= riscv64-unknown-linux-gnu-g++ | ||||
| @@ -58,19 +79,6 @@ endif | ||||
| CCV := $(shell $(CC) --version | head -n 1) | ||||
| CXXV := $(shell $(CXX) --version | head -n 1) | ||||
|  | ||||
| # Mac OS + Arm can report x86_64 | ||||
| # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 | ||||
| ifeq ($(UNAME_S),Darwin) | ||||
| 	ifneq ($(UNAME_P),arm) | ||||
| 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) | ||||
| 		ifeq ($(SYSCTL_M),1) | ||||
| 			# UNAME_P := arm | ||||
| 			# UNAME_M := arm64 | ||||
| 			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789) | ||||
| 		endif | ||||
| 	endif | ||||
| endif | ||||
|  | ||||
| # | ||||
| # Compile flags | ||||
| # | ||||
| @@ -231,14 +239,24 @@ endif | ||||
| endif | ||||
|  | ||||
| ifndef LLAMA_NO_ACCELERATE | ||||
| 	# Mac M1 - include Accelerate framework. | ||||
| 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). | ||||
| 	# Mac OS - include Accelerate framework. | ||||
| 	# `-framework Accelerate` works both with Apple Silicon and Mac Intel | ||||
| 	ifeq ($(UNAME_S),Darwin) | ||||
| 		MK_CPPFLAGS += -DGGML_USE_ACCELERATE | ||||
| 		MK_LDFLAGS  += -framework Accelerate | ||||
| 	endif | ||||
| endif # LLAMA_NO_ACCELERATE | ||||
|  | ||||
| ifdef LLAMA_METAL | ||||
| 	# By default - use GPU acceleration on Mac OS | ||||
| 	ifeq ($(UNAME_S),Darwin) | ||||
| 		CFLAGS   += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG | ||||
| 		CXXFLAGS += -DGGML_USE_METAL | ||||
| 		LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit | ||||
| 		OBJS     += ggml-metal.o | ||||
| 	endif | ||||
| endif # LLAMA_METAL | ||||
|  | ||||
| ifdef LLAMA_MPI | ||||
| 	MK_CPPFLAGS += -DGGML_USE_MPI | ||||
| 	MK_CFLAGS   += -Wno-cast-qual | ||||
| @@ -480,10 +498,6 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co | ||||
| speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o $(OBJS) | ||||
| 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) | ||||
|  | ||||
| ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' | ||||
| BUILD_TARGETS += metal | ||||
| endif | ||||
|  | ||||
| ifdef LLAMA_METAL | ||||
| metal: examples/metal/metal.cpp ggml.o $(OBJS) | ||||
| 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) | ||||
|   | ||||
							
								
								
									
README.md: 26 lines changed
							| @@ -280,29 +280,11 @@ In order to build llama.cpp you have three different options. | ||||
|  | ||||
| ### Metal Build | ||||
|  | ||||
| Using Metal allows the computation to be executed on the GPU for Apple devices: | ||||
| On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. | ||||
| To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option. | ||||
|  | ||||
| - Using `make`: | ||||
|  | ||||
|   ```bash | ||||
|   LLAMA_METAL=1 make | ||||
|   ``` | ||||
|  | ||||
| - Using `CMake`: | ||||
|  | ||||
|     ```bash | ||||
|     mkdir build-metal | ||||
|     cd build-metal | ||||
|     cmake -DLLAMA_METAL=ON .. | ||||
|     cmake --build . --config Release | ||||
|     ``` | ||||
|  | ||||
| When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument. | ||||
| Any value larger than 0 will offload the computation to the GPU. For example: | ||||
|  | ||||
| ```bash | ||||
| ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1 | ||||
| ``` | ||||
| When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line | ||||
| argument. | ||||
|  | ||||
| ### MPI Build | ||||
|  | ||||
|   | ||||
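For programs that link against the library rather than use the CLI, the same opt-out described in the README text above can be done in code by zeroing the layer count before creating the context. A minimal sketch, assuming the llama.cpp API at this commit (the helper name is illustrative):

```cpp
// Sketch: the in-code equivalent of passing `-ngl 0` on a Metal-enabled build.
#include "llama.h"

llama_context * make_cpu_only_context(llama_model * model) {
    llama_context_params p = llama_context_default_params();
    p.n_gpu_layers = 0; // Metal builds now default this to 1; 0 keeps inference on the CPU
    return llama_new_context_with_model(model, p);
}
```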
| @@ -717,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param | ||||
|  | ||||
|     lparams.n_ctx           = params.n_ctx; | ||||
|     lparams.n_batch         = params.n_batch; | ||||
|     lparams.n_gpu_layers    = params.n_gpu_layers; | ||||
|     if (params.n_gpu_layers != -1) { | ||||
|         lparams.n_gpu_layers = params.n_gpu_layers; | ||||
|     } | ||||
|     lparams.main_gpu        = params.main_gpu; | ||||
|     lparams.tensor_split    = params.tensor_split; | ||||
|     lparams.low_vram        = params.low_vram; | ||||
| @@ -1212,7 +1214,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l | ||||
|     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); | ||||
|     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false"); | ||||
|     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); | ||||
|     fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers); | ||||
|     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); | ||||
|     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); | ||||
|     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs); | ||||
|     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); | ||||
|   | ||||
| @@ -34,7 +34,7 @@ struct gpt_params { | ||||
|     int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt | ||||
|     int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding | ||||
|     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited) | ||||
|     int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM | ||||
|     int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default) | ||||
|     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors | ||||
|     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs | ||||
|     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens. | ||||
|   | ||||
| @@ -151,14 +151,6 @@ int main(int argc, char ** argv) { | ||||
|         LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); | ||||
|     } | ||||
|  | ||||
|     if (params.n_ctx > 2048) { | ||||
|         // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048 | ||||
|         LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); | ||||
|     } else if (params.n_ctx < 8) { | ||||
|         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); | ||||
|         params.n_ctx = 8; | ||||
|     } | ||||
|  | ||||
|     LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); | ||||
|  | ||||
|     if (params.seed == LLAMA_DEFAULT_SEED) { | ||||
| @@ -194,6 +186,13 @@ int main(int argc, char ** argv) { | ||||
|         return 1; | ||||
|     } | ||||
|  | ||||
|     if (params.n_ctx > llama_n_ctx(ctx)) { | ||||
|         LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx); | ||||
|     } else if (params.n_ctx < 8) { | ||||
|         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); | ||||
|         params.n_ctx = 8; | ||||
|     } | ||||
|  | ||||
|     // print system information | ||||
|     { | ||||
|         LOG_TEE("\n"); | ||||
|   | ||||
| @@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { | ||||
|         // Example, we have a context window of 512, we will compute perplexity for each of the | ||||
|         // last 256 tokens.  Then, we split the input up into context window size chunks to | ||||
|         // process the entire prompt. | ||||
|         const int first = std::min(512, params.n_ctx/2); | ||||
|         const int first = params.n_ctx/2; | ||||
|         process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, | ||||
|                        workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); | ||||
|         count += params.n_ctx - first - 1; | ||||
| @@ -668,11 +668,6 @@ int main(int argc, char ** argv) { | ||||
|         params.n_ctx += params.ppl_stride/2; | ||||
|     } | ||||
|  | ||||
|     if (params.n_ctx > 2048) { | ||||
|         fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" | ||||
|                 "expect poor results\n", __func__, params.n_ctx); | ||||
|     } | ||||
|  | ||||
|     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); | ||||
|  | ||||
|     if (params.seed == LLAMA_DEFAULT_SEED) { | ||||
| @@ -698,6 +693,11 @@ int main(int argc, char ** argv) { | ||||
|         return 1; | ||||
|     } | ||||
|  | ||||
|     if (params.n_ctx > llama_n_ctx(ctx)) { | ||||
|         fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);" | ||||
|                 "expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx); | ||||
|     } | ||||
|  | ||||
|     // print system information | ||||
|     { | ||||
|         fprintf(stderr, "\n"); | ||||
|   | ||||
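The two hunks above (the main and perplexity examples) move the context-size warning to after context creation, because the limit is now queried from the loaded model via `llama_n_ctx(ctx)` instead of being hard-coded to 2048. A condensed sketch of the shared pattern, with the surrounding setup omitted and the helper name illustrative:

```cpp
// Sketch: llama_n_ctx() needs a live context, so this check can only run
// after llama_new_context_with_model() has succeeded.
#include <cstdio>
#include "llama.h"

void warn_if_ctx_too_large(llama_context * ctx, int requested_n_ctx) {
    if (requested_n_ctx > llama_n_ctx(ctx)) {
        fprintf(stderr, "warning: model supports at most %d context tokens (%d requested)\n",
                llama_n_ctx(ctx), requested_n_ctx);
    }
}
```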
							
								
								
									
llama.cpp: 54 lines changed
							| @@ -5340,7 +5340,7 @@ struct llama_context_params llama_context_default_params() { | ||||
|         /*.seed                        =*/ LLAMA_DEFAULT_SEED, | ||||
|         /*.n_ctx                       =*/ 512, | ||||
|         /*.n_batch                     =*/ 512, | ||||
|         /*.gpu_layers                  =*/ 0, | ||||
|         /*.n_gpu_layers                =*/ 0, | ||||
|         /*.main_gpu                    =*/ 0, | ||||
|         /*.tensor_split                =*/ nullptr, | ||||
|         /*.rope_freq_base              =*/ 10000.0f, | ||||
| @@ -5357,6 +5357,10 @@ struct llama_context_params llama_context_default_params() { | ||||
|         /*.embedding                   =*/ false, | ||||
|     }; | ||||
|  | ||||
| #ifdef GGML_USE_METAL | ||||
|     result.n_gpu_layers = 1; | ||||
| #endif | ||||
|  | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| @@ -5549,43 +5553,43 @@ struct llama_context * llama_new_context_with_model( | ||||
|             } | ||||
| #endif | ||||
|         } | ||||
|     } | ||||
|  | ||||
| #ifdef GGML_USE_METAL | ||||
|     if (params.n_gpu_layers > 0) { | ||||
|         // this allocates all Metal resources and memory buffers | ||||
|         if (params.n_gpu_layers > 0) { | ||||
|             // this allocates all Metal resources and memory buffers | ||||
|  | ||||
|         void * data_ptr  = NULL; | ||||
|         size_t data_size = 0; | ||||
|             void * data_ptr  = NULL; | ||||
|             size_t data_size = 0; | ||||
|  | ||||
|         if (params.use_mmap) { | ||||
|             data_ptr  = ctx->model.mapping->addr; | ||||
|             data_size = ctx->model.mapping->size; | ||||
|         } else { | ||||
|             data_ptr  = ggml_get_mem_buffer(ctx->model.ctx); | ||||
|             data_size = ggml_get_mem_size  (ctx->model.ctx); | ||||
|         } | ||||
|             if (params.use_mmap) { | ||||
|                 data_ptr  = ctx->model.mapping->addr; | ||||
|                 data_size = ctx->model.mapping->size; | ||||
|             } else { | ||||
|                 data_ptr  = ggml_get_mem_buffer(ctx->model.ctx); | ||||
|                 data_size = ggml_get_mem_size  (ctx->model.ctx); | ||||
|             } | ||||
|  | ||||
|         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); | ||||
|             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); | ||||
|  | ||||
|         LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); | ||||
|             LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); | ||||
|  | ||||
| #define LLAMA_METAL_CHECK_BUF(result)                            \ | ||||
|     if (!(result)) {                                             \ | ||||
|         LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ | ||||
|         llama_free(ctx);                                         \ | ||||
|         return NULL;                                             \ | ||||
|     } | ||||
|             if (!(result)) {                                             \ | ||||
|                 LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \ | ||||
|                 llama_free(ctx);                                         \ | ||||
|                 return NULL;                                             \ | ||||
|             } | ||||
|  | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); | ||||
|  | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0)); | ||||
|  | ||||
|         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); | ||||
|             LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); | ||||
| #undef LLAMA_METAL_CHECK_BUF | ||||
|     } | ||||
|         } | ||||
| #endif | ||||
|     } | ||||
|  | ||||
| #ifdef GGML_USE_MPI | ||||
|     ctx->ctx_mpi = ggml_mpi_init(); | ||||
|   | ||||
Author: Georgi Gerganov