Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
mpt : implement backwards compatibility with duped output tensor (#6139)
Changed file: llama.cpp (17 lines changed: 10 additions, 7 deletions)
@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output"},
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
             { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
@@ -4300,9 +4301,9 @@ static bool llm_load_tensors(
                     {
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                         model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                        if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,     "weight"), {n_embd, n_vocab});
-                        } else {
+
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, false);
+                        if (!model.output) {
                             model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                             ml.n_created--; // artificial tensor
                             ml.size_data += ggml_nbytes(model.output);
@@ -4507,10 +4508,12 @@ static bool llm_load_tensors(
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                         model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
 
-                        // same as tok_embd, duplicated to allow offloading
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, false);
+                        if (!model.output) {
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
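In short: the first hunk registers an "output" name in the architecture's tensor-name map so that tn(LLM_TENSOR_OUTPUT, "weight") resolves to "output.weight". The two loader hunks then switch to a single pattern: create the dedicated output tensor as optional (the trailing false argument to create_tensor), and only if it is absent fall back to duplicating token_embd, decrementing n_created (the duplicate is not a new file entry) and adding its byte size to size_data (its data is still loaded a second time).

Below is a minimal, self-contained sketch of that fallback pattern. The model_loader and tensor types, the hard-coded tensor names, and the bookkeeping fields here are stand-ins chosen for illustration, not llama.cpp's real loader API; only the control flow mirrors the patch.

// sketch.cpp: illustrates the optional-tensor fallback, not the real API.
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

struct tensor {
    std::string name;
    size_t      nbytes;
};

struct model_loader {
    std::map<std::string, tensor> gguf; // tensors present in the GGUF file
    int    n_created = 0;               // cross-checked against the file's tensor count
    size_t size_data = 0;               // bytes of tensor data scheduled for loading

    // Stand-in for ml.create_tensor(..., required): with required == false it
    // returns nullptr for a missing tensor instead of aborting the load.
    tensor * create_tensor(const std::string & name, bool required = true) {
        auto it = gguf.find(name);
        if (it == gguf.end()) {
            if (!required) {
                return nullptr;
            }
            fprintf(stderr, "missing required tensor: %s\n", name.c_str());
            std::exit(1);
        }
        n_created++;
        return &it->second;
    }
};

static void load_output(model_loader & ml) {
    // Prefer the dedicated output tensor written by newer conversions...
    tensor * output = ml.create_tensor("output.weight", /*required=*/false);
    if (!output) {
        // ...and fall back to the duplicated token embedding otherwise.
        output = ml.create_tensor("token_embd.weight");
        ml.n_created--;                 // artificial tensor: same file entry used twice
        ml.size_data += output->nbytes; // but its data is still loaded again
    }
    printf("output <- %s\n", output->name.c_str());
}

int main() {
    model_loader old_style; // older conversion: token_embd only, no output tensor
    old_style.gguf["token_embd.weight"] = { "token_embd.weight", 1024 };
    load_output(old_style); // prints: output <- token_embd.weight

    model_loader new_style; // newer conversion with a dedicated output tensor
    new_style.gguf["token_embd.weight"] = { "token_embd.weight", 1024 };
    new_style.gguf["output.weight"]     = { "output.weight",     1024 };
    load_output(new_style); // prints: output <- output.weight

    return 0;
}

Running load_output against both stub files shows why the pattern is backwards compatible: a GGUF without output.weight silently reuses token_embd, while a file that has the tensor uses it directly, with no gguf_find_tensor pre-check needed.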
Author: Jared Van Bortel