mpt : implement backwards compatibility with duped output tensor (#6139)
Changed file: llama.cpp (13 lines changed)
--- a/llama.cpp
+++ b/llama.cpp
@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output"},
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
             { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
@@ -4300,9 +4301,9 @@ static bool llm_load_tensors(
                     {
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                         model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab});
-                        } else {
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, false);
+                        if (!model.output) {
                             model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                             ml.n_created--; // artificial tensor
                             ml.size_data += ggml_nbytes(model.output);
@@ -4507,11 +4508,13 @@ static bool llm_load_tensors(
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                         model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
 
-                        // same as tok_embd, duplicated to allow offloading
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, false);
+                        if (!model.output) {
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
                         ggml_context * ctx_layer = ctx_for_layer(i);
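What the change does: `create_tensor` takes a trailing `required` flag, and passing `false` makes it return a null tensor instead of erroring out when the name is missing from the GGUF file. Newer MPT conversions write a dedicated `output.weight` (hence the new `LLM_TENSOR_OUTPUT` entry in the name map), while older conversions duplicated `token_embd.weight` for the output head; the loader now tries the real output tensor first and only falls back to the token embeddings, decrementing `n_created` because the fallback is not a distinct tensor in the file. Below is a minimal, self-contained sketch of that fallback pattern; `MockLoader`, `Tensor`, and the counter handling are hypothetical stand-ins for illustration, not the real llama.cpp loader API.

// Sketch only: MockLoader/Tensor are illustrative stand-ins, not llama.cpp API.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>
#include <utility>

struct Tensor {
    std::string name;
    size_t      nbytes = 0;
};

struct MockLoader {
    std::map<std::string, Tensor> file_tensors; // tensors actually in the GGUF
    int    n_created = 0; // compared against file_tensors.size() after loading
    size_t size_data = 0; // bytes the loader expects to read/upload

    explicit MockLoader(std::map<std::string, Tensor> t) : file_tensors(std::move(t)) {
        for (const auto & kv : file_tensors) {
            size_data += kv.second.nbytes; // each file tensor is read once
        }
    }

    // Analogue of the `required = false` overload in the diff: returns nullptr
    // instead of throwing when an optional tensor is absent.
    Tensor * create_tensor(const std::string & name, bool required = true) {
        auto it = file_tensors.find(name);
        if (it == file_tensors.end()) {
            if (required) {
                throw std::runtime_error("missing tensor: " + name);
            }
            return nullptr;
        }
        n_created++;
        return &it->second;
    }
};

int main() {
    // Old-style MPT conversion: the file has no separate "output.weight".
    MockLoader ml({{"token_embd.weight", {"token_embd.weight", 1024}}});

    Tensor * tok_embd = ml.create_tensor("token_embd.weight"); // embeddings proper

    // Try the dedicated output tensor first; fall back to the embeddings.
    Tensor * output = ml.create_tensor("output.weight", /*required =*/ false);
    if (!output) {
        output = ml.create_tensor("token_embd.weight"); // reuse the shared weights
        ml.n_created--;                 // artificial tensor: same file entry counted twice
        ml.size_data += output->nbytes; // the shared data gets uploaded a second time
    }

    std::printf("embd=%s output=%s n_created=%d size_data=%zu\n",
                tok_embd->name.c_str(), output->name.c_str(),
                ml.n_created, ml.size_data);
    return 0;
}

With a new-style file that does contain `output.weight`, the optional lookup succeeds and the fallback branch never runs, which is how a single code path stays backwards compatible with both conversion styles.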
Author: Jared Van Bortel