Mirror of https://github.com/ggml-org/llama.cpp.git
hparams : add n_embd_inp() to support extended embed (#16928)
* add n_embd_full to support extended embed
* don't change output
* rename to n_embd_inp
* restore n_embd where applicable
@@ -276,8 +276,8 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_IM2COL:
             {
-                const int n_embd = hparams.n_embd;
-                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                const int n_embd_inp = hparams.n_embd_inp();
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
                 op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
             } break;
         case GGML_OP_SCALE:
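The accessor itself is not defined in this hunk. As a rough sketch of the idea, assuming n_embd_inp() simply widens the main embedding by the stacked deepstack features (the struct name and field values below are illustrative; the real definition presumably lives in src/llama-hparams.*):

#include <cstdint>

// Illustrative sketch only -- not the actual llama.cpp implementation.
struct hparams_sketch {
    uint32_t n_embd             = 2048; // main embedding width (example value)
    uint32_t n_deepstack_layers = 3;    // 0 for models without deepstack features

    // input embedding width: the main embd plus the stacked deepstack embds
    uint32_t n_embd_inp() const {
        return n_embd * (n_deepstack_layers + 1); // 2048 * 4 = 8192 here
    }
};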
@@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 64: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_QWEN3MOE:
             {
@@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 94: type = LLM_TYPE_235B_A22B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
-                // since vision model stacks deepstack features along feature dim
-                // we also create a fake "n_embd" for text model to be the main embd + deepstack embds
-                hparams.n_embd *= hparams.n_deepstack_layers + 1;
             } break;
         case LLM_ARCH_PHI2:
             {
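To make the effect of these two hunks concrete (the numbers are illustrative, not taken from a real model): with n_embd = 2048 and n_deepstack_layers = 3, the old code inflated hparams.n_embd to 2048 * (3 + 1) = 8192 at load time, so every consumer saw the widened value; after this change n_embd stays 2048, and the widened 8192 is reported only where the extended input embedding is actually needed, via n_embd_inp().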
@@ -3341,10 +3335,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_QWEN3:
             case LLM_ARCH_QWEN3VL:
                 {
-                    // for model loading, the weights only have the main embd
-                    // so we need to divide by the number of deepstack layers + 1
-                    // n_embd is const int so we declare a new variable
-                    int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
@@ -3380,10 +3370,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_QWEN3MOE:
             case LLM_ARCH_QWEN3VLMOE:
                 {
-                    // for model loading, the weights only have the main embd
-                    // so we need to divide by the number of deepstack layers + 1
-                    // n_embd is const int so we declare a new variable
-                    int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1);
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
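With the inflated n_embd gone, the divide-back workaround in both Qwen3 branches above is no longer needed. After the deletions the remaining context line presumably picks up the const n_embd that the removed comment referred to (the local initialized from hparams.n_embd earlier in load_tensors), so the branches reduce to the usual pattern; a sketch of the resulting shape, not a verbatim excerpt:

            case LLM_ARCH_QWEN3:
            case LLM_ARCH_QWEN3VL:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

                    // output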
@@ -6535,6 +6521,7 @@ void llama_model::print_info() const {
     if (!hparams.vocab_only) {
         LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
         LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
         LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
         LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
         LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
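With illustrative values (not from a real run), a deepstack model would now report both widths, e.g.:

    print_info: n_embd = 2048
    print_info: n_embd_inp = 8192

while for most architectures the two lines print the same number.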
@@ -7380,6 +7367,10 @@ int32_t llama_model_n_embd(const llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int32_t llama_model_n_embd_inp(const llama_model * model) {
+    return model->hparams.n_embd_inp();
+}
+
 int32_t llama_model_n_layer(const llama_model * model) {
     return model->hparams.n_layer;
 }
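A minimal usage sketch of the new public entry point (the model path is a placeholder, and the assumption that n_embd_inp() falls back to n_embd for models without extended embeddings is mine, not stated in this diff):

#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // main embedding width vs. extended input width (they differ only for
    // architectures that stack extra features, e.g. Qwen3VL deepstack)
    const int32_t n_embd     = llama_model_n_embd(model);
    const int32_t n_embd_inp = llama_model_n_embd_inp(model);

    printf("n_embd = %d, n_embd_inp = %d\n", n_embd, n_embd_inp);

    llama_model_free(model);
    llama_backend_free();
    return 0;
}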