llama : enable GPU inference by default with Metal
@@ -702,7 +702,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
     lparams.n_ctx           = params.n_ctx;
     lparams.n_batch         = params.n_batch;
-    lparams.n_gpu_layers    = params.n_gpu_layers;
+    lparams.n_gpu_layers    = params.n_gpu_layers != -1 ? params.n_gpu_layers : lparams.n_gpu_layers;
     lparams.main_gpu        = params.main_gpu;
     lparams.tensor_split    = params.tensor_split;
     lparams.low_vram        = params.low_vram;
@@ -1064,7 +1064,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
@@ -33,7 +33,7 @@ struct gpt_params {
     int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
-    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
+    int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
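Taken together, the three hunks above make -1 a "no preference" sentinel: gpt_params.n_gpu_layers now starts at -1, and llama_context_params_from_gpt_params() copies it into the context parameters only when the user set an explicit value, otherwise the library default is kept (the YAML dump hunk just updates the "# default:" comment to match). A minimal standalone sketch of that precedence rule; resolve_gpu_layers and the hard-coded metal_default are illustrative names, not part of the patch:

    #include <cstdint>

    // Sketch only: mirrors the precedence rule introduced above.
    // -1 in the command-line params means "defer to the library default".
    static int32_t resolve_gpu_layers(int32_t cli_value, int32_t library_default) {
        return cli_value != -1 ? cli_value : library_default;
    }

    int main() {
        const int32_t metal_default = 1; // what llama_context_default_params() reports on a Metal build (see the last hunk)
        int32_t kept   = resolve_gpu_layers(-1, metal_default); // user passed nothing -> keeps 1
        int32_t forced = resolve_gpu_layers( 0, metal_default); // explicit 0 wins     -> 0
        return (kept == 1 && forced == 0) ? 0 : 1;
    }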
@@ -151,14 +151,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
     }
 
-    if (params.n_ctx > 2048) {
-        // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
-        LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
-    } else if (params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
-        params.n_ctx = 8;
-    }
-
     LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -194,6 +186,13 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_ctx > llama_n_ctx(ctx)) {
+        LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
     // print system information
     {
         LOG_TEE("\n");
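With these two hunks the context-size check in main moves below the point where the llama_context exists, so the warning threshold is whatever llama_n_ctx(ctx) reports for the created context rather than a hardcoded 2048, which is what the removed TODO about LLaMA v2's 4096-token context was asking for. A rough standalone sketch of the same pattern; clamp_n_ctx and the literal limits are illustrative only, not API from the patch:

    #include <cstdio>

    // Sketch: validate a requested context size against a limit queried at
    // runtime (in the patch this limit comes from llama_n_ctx(ctx)).
    static int clamp_n_ctx(int requested, int model_limit) {
        if (requested > model_limit) {
            fprintf(stderr, "warning: model only supports up to %d tokens (%d specified)\n",
                    model_limit, requested);
        } else if (requested < 8) {
            fprintf(stderr, "warning: minimum context size is 8, using minimum size\n");
            requested = 8;
        }
        return requested;
    }

    int main() {
        printf("%d\n", clamp_n_ctx(8192, 4096)); // warns but keeps 8192, as in the patch
        printf("%d\n", clamp_n_ctx(4,    4096)); // bumped up to the minimum of 8
        return 0;
    }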
@@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
         // Example, we have a context window of 512, we will compute perplexity for each of the
         // last 256 tokens.  Then, we split the input up into context window size chunks to
         // process the entire prompt.
-        const int first = std::min(512, params.n_ctx/2);
+        const int first = params.n_ctx/2;
         process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
                        workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
         count += params.n_ctx - first - 1;
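In this loop, first is the number of tokens at the start of each chunk that are skipped before scoring, and count grows by n_ctx - first - 1 per chunk. Dropping the std::min(512, ...) cap makes the skipped prefix scale with the context size; a quick worked comparison (4096 is just an example value, not from the patch):

    old: first = min(512, n_ctx/2)    n_ctx = 512 -> 256    n_ctx = 4096 -> 512
    new: first = n_ctx/2              n_ctx = 512 -> 256    n_ctx = 4096 -> 2048

so for contexts larger than 1024 tokens each chunk now scores only its last n_ctx/2 tokens, consistent with the 512/256 example in the comment above.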
@@ -668,11 +668,6 @@ int main(int argc, char ** argv) {
         params.n_ctx += params.ppl_stride/2;
     }
 
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
-
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -698,6 +693,11 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_ctx > llama_n_ctx(ctx)) {
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);"
+                "expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx);
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
@@ -5334,7 +5334,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
-        /*.gpu_layers                  =*/ 0,
+        /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.rope_freq_base              =*/ 10000.0f,
@@ -5351,6 +5351,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding                   =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
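The last hunk is where the default actually changes: llama_context_default_params() now returns n_gpu_layers = 1 whenever the library is built with GGML_USE_METAL (the hunk before it only renames the /*.gpu_layers =*/ comment to match the field name). Combined with the -1 sentinel handling in llama_context_params_from_gpt_params() above, Metal builds offload to the GPU unless the caller or user explicitly asks for 0 layers. A hedged usage sketch of how a caller opts back out, assuming the llama.h entry points of this era; the model path is the placeholder default from the YAML dump above and error handling is minimal:

    #include "llama.h"

    int main() {
        llama_backend_init(false /* numa */);

        struct llama_context_params cparams = llama_context_default_params();
        // On a Metal build this now comes back as 1 (GPU inference on by default);
        // set it to 0 explicitly to force CPU-only inference.
        cparams.n_gpu_layers = 0;

        struct llama_model * model = llama_load_model_from_file("models/7B/ggml-model.bin", cparams);
        if (model == NULL) {
            llama_backend_free();
            return 1;
        }

        struct llama_context * ctx = llama_new_context_with_model(model, cparams);
        // ... evaluate tokens here ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }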