Merge branch 'master' into mpi

llama.cpp (85 changed lines: 51 additions, 34 deletions)

@@ -198,8 +198,8 @@ struct llama_layer {
 };
 
 struct llama_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct ggml_tensor * k = NULL;
+    struct ggml_tensor * v = NULL;
 
     struct ggml_context * ctx = NULL;
 
@@ -488,9 +488,7 @@ struct llama_file_loader {
             std::string word = file.read_string(len);
 
             float score = 0.0f;
-            if (file_version >= LLAMA_FILE_VERSION_GGMF_V1) {
-                file.read_raw(&score, sizeof(score));
-            }
+            file.read_raw(&score, sizeof(score));
 
             vocab.token_to_id[word] = i;
 
@@ -1174,6 +1172,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1182,6 +1181,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+    const int max_backend_supported_layers = hparams.n_layer + 3;
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1198,14 +1201,18 @@ static void llama_model_load_internal(
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
-    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     }
 
     // populate `tensors_by_name`
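
Note on the llama_model_load_internal hunks above: the "offloaded X/Y layers" message previously used a hard-coded hparams.n_layer + 3 as its denominator, which is only right for cuBLAS (which can additionally offload the non-repeating tensors and the two K/V cache tensors). The hunks introduce a per-backend max_backend_supported_layers so that a CLBlast build reports n_layer + 1 instead. A self-contained toy sketch of that reporting logic follows; the layer counts and the use_cublas flag are invented for illustration and the low_vram special case is ignored:

// Toy illustration only: n_layer, n_gpu_layers and use_cublas are made-up values.
#include <algorithm>
#include <cstdio>

int main() {
    const int  n_layer      = 32;    // assumed model depth
    const int  n_gpu_layers = 100;   // user asked for "offload everything"
    const bool use_cublas   = false; // pretend this build only has CLBlast

    // cuBLAS can also take the non-repeating tensors and the K/V cache (+3);
    // CLBlast tops out at the repeating layers plus the output layer (+1).
    const int max_backend_supported_layers = use_cublas ? n_layer + 3 : n_layer + 1;
    const int max_offloadable_layers       = max_backend_supported_layers; // low_vram ignored here

    fprintf(stderr, "offloaded %d/%d layers to GPU\n",
            std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
    return 0;
}
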
@@ -1938,10 +1945,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -1970,9 +1977,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();
 
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -2024,11 +2030,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
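
Note on the top_p, tail_free and typical hunks above: each one moves the start of its sample timer to after the call to llama_sample_softmax, presumably so that work the softmax helper already times on its own is not added to ctx->t_sample_us a second time. A self-contained toy sketch of that double-counting hazard follows; the fake clock, nested_step() and the microsecond figures are all invented for illustration:

// Toy illustration only: the clock, nested_step() and the timings are invented.
#include <cstdint>
#include <cstdio>

static int64_t fake_clock_us = 0;   // stands in for ggml_time_us()
static int64_t t_sample_us   = 0;   // stands in for ctx->t_sample_us

static int64_t time_us() { return fake_clock_us; }

// Like llama_sample_softmax, this helper accounts for its own elapsed time.
static void nested_step() {
    const int64_t t0 = time_us();
    fake_clock_us += 500;                 // pretend the step took 500 us
    t_sample_us   += time_us() - t0;
}

int main() {
    // Pattern from the hunks above: run the self-timing helper first, then start
    // the caller's timer, so the helper's 500 us is only counted once.
    nested_step();
    const int64_t t_start_sample_us = time_us();
    fake_clock_us += 200;                 // pretend the rest of the sampler took 200 us
    t_sample_us   += time_us() - t_start_sample_us;

    fprintf(stderr, "accumulated sample time: %lld us\n", (long long) t_sample_us); // prints 700
    return 0;
}
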
@@ -2197,13 +2203,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }
 
 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
@@ -2218,13 +2222,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }
 
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
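
Note: with the two mirostat hunks above, llama_sample_token_mirostat_v2 no longer asserts on a null context and only touches the timing counters when ctx is non-null. A hedged usage sketch follows, based on the signature shown in the hunk header; the tau/eta values, the initial mu and the way candidates is filled are illustrative assumptions, not taken from this diff:

// Usage sketch only: tau, eta, the initial mu and the candidate setup are
// illustrative assumptions; the sampler signature is the one in the hunk header.
#include <vector>
#include "llama.h"

llama_token sample_with_mirostat_v2(llama_context * ctx, float & mu) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    std::vector<llama_token_data> data;
    data.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        data.push_back(llama_token_data{id, logits[id], 0.0f});
    }
    llama_token_data_array candidates = { data.data(), data.size(), false };

    // tau is the target surprise, eta the learning rate; mu is commonly started
    // near 2 * tau and carried across calls so the controller can converge.
    return llama_sample_token_mirostat_v2(ctx, &candidates, /*tau=*/5.0f, /*eta=*/0.1f, &mu);
}
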
@@ -2292,10 +2297,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     }
     float * f32_output = (float *) output.addr;
 
-    quantize_fns_t qtype;
+    ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_quantize_fn(tensor.type);
-        if (qtype.dequantize_row_q == NULL) {
+        qtype = ggml_internal_get_type_traits(tensor.type);
+        if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
         }
     } else if (tensor.type != GGML_TYPE_F16) {
@@ -2306,7 +2311,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
     if (tensor.type == GGML_TYPE_F16) {
         ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
     } else if (ggml_is_quantized(tensor.type)) {
-        qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        qtype.to_float(tensor.data, f32_output, nelements);
     } else {
         LLAMA_ASSERT(false); // unreachable
     }
@@ -2331,7 +2336,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
         } else {
-            qtype.dequantize_row_q(inbuf, outbuf, nels);
+            qtype.to_float(inbuf, outbuf, nels);
         }
     };
     workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
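
Note on the llama_convert_tensor_internal hunks above: dequantization now goes through ggml's generic type traits, so ggml_internal_get_type_traits(type).to_float replaces the older quantize_fns_t / dequantize_row_q pair. A hedged sketch of that call pattern on a hypothetical buffer follows; only the trait lookup and the to_float / ggml_fp16_to_fp32_row calls mirror the diff:

// Sketch only: dequantize_block and its buffer handling are hypothetical;
// the trait lookup and the conversion calls mirror the hunks above.
#include <stdexcept>
#include <vector>
#include "ggml.h"

std::vector<float> dequantize_block(void * data, ggml_type type, int nelements) {
    std::vector<float> out(nelements);
    if (type == GGML_TYPE_F16) {
        ggml_fp16_to_fp32_row((ggml_fp16_t *) data, out.data(), nelements);
    } else if (ggml_is_quantized(type)) {
        const ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
        if (traits.to_float == NULL) {
            throw std::runtime_error("no dequantization available for this type");
        }
        traits.to_float(data, out.data(), nelements);
    } else {
        // F32 and other cases omitted for brevity
    }
    return out;
}
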
@@ -3548,23 +3553,35 @@ llama_token llama_token_nl() {
     return 13;
 }
 
+struct llama_timings llama_get_timings(struct llama_context * ctx) {
+    struct llama_timings result = {
+        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
+        /*.t_end_ms =*/ 1.00 * ggml_time_ms(),
+        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
+        /*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
+        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
+        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
+
+        /*.n_sample =*/ std::max(1, ctx->n_sample),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
+        /*.n_eval =*/ std::max(1, ctx->n_eval),
+    };
+
+    return result;
+}
 
 void llama_print_timings(struct llama_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
-
-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_eval = std::max(1, ctx->n_eval);
-    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+    const llama_timings timings = llama_get_timings(ctx);
 
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+    fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
     fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
+            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
+            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
-    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
+            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
+    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
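
Note: the last hunk adds llama_get_timings, which packages the context's counters into a llama_timings struct (times in milliseconds, counts clamped to at least 1) and becomes the single source for llama_print_timings. A hedged sketch of a caller using it follows; the report_speed wrapper is hypothetical, while the fields are the ones initialized in the hunk:

// Sketch only: report_speed is a hypothetical caller; the llama_timings fields
// used here are the ones shown in the hunk above.
#include <cstdio>
#include "llama.h"

void report_speed(llama_context * ctx) {
    const llama_timings t = llama_get_timings(ctx);

    // n_eval is clamped to at least 1 by llama_get_timings, so the per-token
    // division below cannot divide by zero.
    fprintf(stderr, "eval: %d tokens, %.2f ms per token, %.2f tokens per second\n",
            t.n_eval, t.t_eval_ms / t.n_eval, 1e3 / t.t_eval_ms * t.n_eval);
    fprintf(stderr, "total wall time: %.2f ms\n", t.t_end_ms - t.t_start_ms);
}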