test-model-random : better default tensor initialization distribution
@@ -65,6 +65,7 @@ struct random_tensor {
         for (int64_t d : shape) {
             prod *= d;
         }
+        GGML_ASSERT(prod != 0);
         return ggml_row_size(type, prod);
     }
 
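The assertion guards n_bytes() against degenerate shapes: the running product over the dimensions is the element count, and a zero dimension would silently yield a zero-byte tensor. A minimal standalone sketch of the same check (not part of the commit; ggml_row_size is replaced by a plain element count here):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // element count of a tensor shape; a zero dimension makes it 0
    int64_t n_elements(const std::vector<int64_t> & shape) {
        int64_t prod = 1;
        for (int64_t d : shape) {
            prod *= d;
        }
        assert(prod != 0); // catch malformed shapes early, as in the diff
        return prod;
    }

    int main() {
        return n_elements({ 4, 8 }) == 32 ? 0 : 1;
    }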
@@ -266,8 +267,20 @@ struct model_variant {
         tensors(other.tensors),
         metadata(other.metadata) {}
 
+    void add_tensor(const std::string & name, const std::vector<int64_t> & shape, float gain = 1.0f) {
+        // ref: https://github.com/pytorch/pytorch/blob/134179474539648ba7dee1317959529fbd0e7f89/torch/nn/init.py#L515-L516
+        const auto init_kaiming_uniform = [gain](uint32_t fan_in) {
+            const float std = gain / std::sqrt(fan_in);
+            const float bound = std::sqrt(3.0f) * std;
+
+            return std::uniform_real_distribution<float>(-bound, bound);
+        };
+
+        tensors.push_back(random_tensor(name, shape, init_kaiming_uniform(shape[0])));
+    }
+
     void add_tensor(const std::string & name, const std::vector<int64_t> & shape,
-                    const std::function<float(std::mt19937 &)> & distribution = std::normal_distribution<float>()) {
+                    const std::function<float(std::mt19937 &)> & distribution) {
         tensors.push_back(random_tensor(name, shape, distribution));
     }
 
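For context: Kaiming-uniform initialization draws weights from U(-bound, bound), where bound = sqrt(3) * std and, in the PyTorch lines referenced above, std = gain / math.sqrt(fan). A standalone sketch of that sampling (seed and sizes are illustrative, not from the commit):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <random>

    int main() {
        const float    gain   = 1.0f;
        const uint32_t fan_in = 256; // hypothetical first dimension of a weight tensor

        // std = gain / sqrt(fan_in), bound = sqrt(3) * std, as in torch.nn.init
        const float std_  = gain / std::sqrt((float) fan_in);
        const float bound = std::sqrt(3.0f) * std_;

        std::mt19937 rng(42);
        std::uniform_real_distribution<float> dist(-bound, bound);

        for (int i = 0; i < 4; ++i) {
            printf("%f\n", dist(rng)); // each call yields one weight in [-bound, bound)
        }
        return 0;
    }

Returning the distribution object works here because std::uniform_real_distribution<float> is itself callable with a std::mt19937 &, so it converts directly to the std::function<float(std::mt19937 &)> parameter of the other add_tensor overload.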
@@ -299,7 +312,7 @@ struct model_variant {
 
         size_t total_size = 0;
         for (const auto & t : tensors) {
-            total_size += t.n_bytes() + ggml_tensor_overhead();
+            total_size += GGML_PAD(t.n_bytes() + ggml_tensor_overhead(), GGML_MEM_ALIGN);
         }
 
         ggml_init_params init_params = {
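The padded sum accounts for the allocator rounding each tensor's data up to the memory alignment, so the context buffer no longer comes up short. A standalone sketch of the round-up arithmetic (the alignment and overhead values are assumptions; see ggml.h for the actual GGML_PAD and GGML_MEM_ALIGN definitions):

    #include <cstddef>
    #include <cstdio>

    // round x up to a multiple of n (n must be a power of two)
    static size_t pad(size_t x, size_t n) {
        return (x + n - 1) & ~(n - 1);
    }

    int main() {
        const size_t align    = 16;  // assumed alignment, in the spirit of GGML_MEM_ALIGN
        const size_t overhead = 368; // hypothetical ggml_tensor_overhead() result

        const size_t raw = 1000 + overhead;
        // summing raw sizes undercounts what an aligned allocator consumes per tensor
        printf("raw = %zu, padded = %zu\n", raw, pad(raw, align));
        return 0;
    }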
@@ -356,6 +369,11 @@ struct model_variant {
             m.add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, vocab_types);
         };
 
+        // don't actually use bias
+        const auto init_bias = [](std::mt19937 &) {
+            return 0.0f;
+        };
+
         // TODO: fill the variants
         // TODO: how to make the variants more modular?
         switch (arch) {
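Note that the distribution parameter is a std::function<float(std::mt19937 &)>, so the zero initializer has to accept the generator even though it ignores it. A standalone sketch of how such a constant "distribution" plugs in (names are illustrative):

    #include <cstdio>
    #include <functional>
    #include <random>

    int main() {
        // a constant initializer fits the same callable shape as a real distribution
        const std::function<float(std::mt19937 &)> init_bias = [](std::mt19937 &) {
            return 0.0f;
        };

        std::mt19937 rng(1234);
        printf("%f\n", init_bias(rng)); // always 0.0
        return 0;
    }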
@@ -591,12 +609,12 @@ struct model_variant {
                     cur.add_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, 2 * d_inner });
 
                     cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { d_conv, d_inner });
-                    cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), { d_inner });
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), { d_inner }, init_bias);
 
                     cur.add_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), { d_inner, dt_rank + 2 * d_state });
 
                     cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), { dt_rank, d_inner });
-                    cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { d_inner });
+                    cur.add_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { d_inner }, init_bias);
 
                     // no "weight" suffix for these
                     cur.add_tensor(tn(LLM_TENSOR_SSM_A, i), { d_state, d_inner }, init_A_S4D);
@@ -674,19 +692,19 @@ struct model_variant {
 
                 // Block 0, LN0
                 cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
-                cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
+                cur.add_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, init_bias);
 
                 // output
                 cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                cur.add_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, init_bias);
                 cur.add_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                    cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, init_bias);
 
                     cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
-                    cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, init_bias);
 
                     cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd});
                     cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay});
@@ -721,7 +739,7 @@ struct model_variant {
                     cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
 
                     cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
-                    cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
+                    cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, init_bias);
                     cur.add_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});
 
                     cur.add_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
@@ -1036,7 +1054,7 @@ int main(int argc, char ** argv) {
             for (llama_seq_id seq_id = 0; seq_id < n_seq_max; ++seq_id) {
                 float err = ref_outputs[seq_id].validate_batch(ctx, batch, seq_id);
                 if (!isfinite(err) || err > 1.0f / 1024.0f) {
-                    fprintf(stderr, "Error for seq_id %i is %f\n", seq_id, err);
+                    fprintf(stderr, "Error for seq_id %i is %f at n_past=%i\n", seq_id, err, seq_id_n_past[seq_id]);
                     valid[seq_id] = false;
                 }
             }
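The error is compared against 1/1024 (about 0.000977), and the added n_past field makes failures easier to localize: a per-sequence counter of how many tokens have already been decoded pinpoints where in the stream the mismatch appeared. A minimal sketch of such bookkeeping (the batch layout is simplified; seq_id_n_past is assumed to be maintained alongside decoding):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int32_t n_seq_max = 4;
        std::vector<int32_t> seq_id_n_past(n_seq_max, 0);

        // pretend one batch decoded tokens for sequences 0, 1, 1, 3
        const int32_t batch_seq_ids[] = { 0, 1, 1, 3 };
        for (int32_t s : batch_seq_ids) {
            seq_id_n_past[s] += 1; // one more position consumed by this sequence
        }

        for (int32_t s = 0; s < n_seq_max; ++s) {
            printf("seq_id %d: n_past = %d\n", s, seq_id_n_past[s]);
        }
        return 0;
    }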