Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	llama.cpp : split llama_context_params into model and context params (#3301)
* llama.cpp : split llama_context_params into model and context params
  ggml-ci
* fix metal build
* fix freq_base/scale default to model value
* llama-bench : keep the same model between tests when possible
* move n_threads to llama_context_params, add n_threads_batch
* fix mpi build
* remove kv_size(), cuda scratch fixes
* remove low-vram option
* add n_threads_batch to system info, refactor to get_system_info()
* add documentation about --threads-batch to the READMEs
* llama-bench fix
* main : fix rope freq/scale warning
* llama.cpp : add llama_get_model
  common : add llama_tokenize from model
* remove duplicated ctx/model functions
  ggml-ci
* cuda : print total VRAM used
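For orientation, here is a minimal sketch of how a caller configures things after this split: model-level options live in llama_model_params and per-context options (including the new n_threads / n_threads_batch) live in llama_context_params. This is not taken from the commit itself; it assumes the llama.h entry points of that era (llama_model_default_params, llama_context_default_params, llama_load_model_from_file, llama_new_context_with_model, and llama_backend_init with its then-current bool numa argument), and "model.gguf" is a placeholder path.

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init(false); // numa = false; signature as of this era of the API

        // model-level settings now live in llama_model_params
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0; // e.g. CPU-only

        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
        if (model == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // per-context settings, including the new thread counts
        llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx           = 2048;
        cparams.n_threads       = 4; // threads for single-token generation
        cparams.n_threads_batch = 8; // threads for batch / prompt processing

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        if (ctx == NULL) {
            fprintf(stderr, "failed to create context\n");
            llama_free_model(model);
            return 1;
        }

        // the model behind a context is retrieved with llama_get_model()
        const int n_embd  = llama_n_embd(llama_get_model(ctx));
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
        fprintf(stderr, "n_embd = %d, n_vocab = %d\n", n_embd, n_vocab);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }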
@@ -48,8 +48,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
     // print system information
     {
         fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
     }
     struct MyModel * ret = new MyModel();
     ret->ctx = ctx;

@@ -71,7 +70,7 @@ bool eval_float(void * model, float * input, int N){
     MyModel * mymodel = (MyModel*)model;
     llama_context * ctx = mymodel->ctx;
     gpt_params params = mymodel->params;
-    int n_emb = llama_n_embd(ctx);
+    int n_emb = llama_n_embd(llama_get_model(ctx));
     int n_past = mymodel->n_past;
     int n_batch = N; // params.n_batch;

@@ -81,7 +80,7 @@ bool eval_float(void * model, float * input, int N){
             n_eval = n_batch;
         }
         llama_batch batch = {  int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
-        if (llama_decode(ctx, batch, params.n_threads)) {
+        if (llama_decode(ctx, batch)) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return false;
         }

@@ -102,7 +101,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
         if (n_eval > params.n_batch) {
             n_eval = params.n_batch;
         }
-        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) {
+        if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return false;
         }

@@ -133,7 +132,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
 
     // out of user input, sample next token
     const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
     const float   top_p           = params.top_p;
     const float   tfs_z           = params.tfs_z;
     const float   typical_p       = params.typical_p;

@@ -149,7 +148,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     llama_token id = 0;
     {
         auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab(ctx);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
 
         // Apply params.logit_bias map
         for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {

@@ -8,7 +8,7 @@ int main(int argc, char** argv) {
     auto mymodel = create_mymodel(argc, argv);
     int N = 10;
     int max_tgt_len = 500;
-    int n_embd = llama_n_embd(mymodel->ctx);
+    int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
 
     // add random float embd to test evaluation
     float * data = new float[N*n_embd];
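The call sites above show the other half of the change: llama_decode() no longer takes an n_threads argument (the thread counts are fixed per context at creation time), and per-model quantities such as n_embd and n_vocab are queried through llama_get_model(). As a consolidated view of the updated eval_tokens pattern from the diff, here is a minimal sketch assuming `ctx` was created with llama_new_context_with_model() and `tokens` holds an already tokenized prompt; n_batch is a caller-chosen chunk size.

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    // Feed a tokenized prompt to the context in chunks of n_batch tokens.
    static bool eval_tokens(llama_context * ctx, std::vector<llama_token> & tokens, int n_batch) {
        int n_past = 0;
        for (int i = 0; i < (int) tokens.size(); i += n_batch) {
            int n_eval = (int) tokens.size() - i;
            if (n_eval > n_batch) {
                n_eval = n_batch;
            }
            // n_threads is no longer passed here; it is part of llama_context_params
            if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return false;
            }
            n_past += n_eval;
        }
        return true;
    }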