mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-02 09:12:03 +00:00
Merge branch 'master' into compilade/refactor-kv-cache
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -37,8 +38,7 @@ struct ngram_container {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ int main(int argc, char ** argv) {
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
|
||||
|
||||
// verification n-grams
|
||||
std::vector<ngram_data> ngrams_cur(G);
|
||||
@@ -159,9 +159,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// sample first token
|
||||
{
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
|
||||
id = gpt_sampler_sample(smpl, ctx, 0);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
|
||||
{
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
@@ -284,9 +284,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// sample the next token
|
||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
|
||||
id = gpt_sampler_sample(smpl, ctx, i_batch);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
|
||||
// print
|
||||
{
|
||||
@@ -361,7 +361,7 @@ int main(int argc, char ** argv) {
|
||||
if (v == 0) {
|
||||
// sample from the last level
|
||||
for (int i = 0; i < W; i++) {
|
||||
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||
tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < W; i++) {
|
||||
@@ -469,10 +469,12 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
|
||||
llama_print_timings(ctx);
|
||||
LOG_TEE("\n");
|
||||
gpt_perf_print(ctx, smpl);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
|
||||
llama_kv_cache_view_free(&kvc_view);
|
||||
llama_sampling_free(ctx_sampling);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user