diff --git a/.clang-format b/.clang-format
index 45232b80ed..47d96b6b40 100644
--- a/.clang-format
+++ b/.clang-format
@@ -22,8 +22,8 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
-BinPackArguments: true
-BinPackParameters: true # OnePerLine
+BinPackArguments: false
+BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
@@ -70,15 +70,18 @@ ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
- - Regex: '^<.*\.h>'
+ - Regex: '".*"'
Priority: 1
SortPriority: 0
- - Regex: '^<.*'
+ - Regex: '^<.*\.h>'
Priority: 2
SortPriority: 0
- - Regex: '.*'
+ - Regex: '^<.*'
Priority: 3
SortPriority: 0
+ - Regex: '.*'
+ Priority: 4
+ SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
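(Note on the `IncludeCategories` reordering above: with `IncludeBlocks: Regroup`, clang-format now puts quoted includes first, then system headers ending in `.h`, then the remaining angle-bracket includes, with the new fourth catch-all picking up anything else, such as macro includes. A hypothetical C++ file illustrates the resulting grouping; the header names are made up:)

```cpp
// How clang-format would regroup these includes under the new categories:

#include "common.h"   // priority 1: quoted includes, matching '".*"'
#include "llama.h"

#include <string.h>   // priority 2: angle includes ending in .h, '^<.*\.h>'

#include <string>     // priority 3: remaining angle includes, '^<.*'
#include <vector>
```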
diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile
index 87ce2393f6..b0c86dccd5 100644
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,10 +1,10 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc4.2.0
# Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile
index 1c00f1b9c2..cf19e6e028 100644
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,8 +1,8 @@
ARG UBUNTU_VERSION=24.04
# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.3
-ARG AMDGPU_VERSION=6.3
+ARG ROCM_VERSION=6.4
+ARG AMDGPU_VERSION=6.4
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 5bd988b7f7..c6d51fb0c2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -515,7 +515,7 @@ jobs:
ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
- container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+ container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
steps:
- name: Clone
diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml
index 276a217d45..19e7854745 100644
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -17,7 +17,7 @@ jobs:
steps:
- uses: actions/stale@v5
with:
- exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
+ exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
diff --git a/.gitignore b/.gitignore
index f8ceb1560a..f48ce4cacd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -82,6 +82,7 @@ models/*
models-mnt
!models/.editorconfig
!models/ggml-vocab-*.gguf*
+!models/templates
# Zig
zig-out/
diff --git a/CODEOWNERS b/CODEOWNERS
index 3186f8eb1c..4c0dd4b725 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -9,3 +9,4 @@
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
+/ggml/src/ggml-vulkan/ @0cc4m
diff --git a/README.md b/README.md
index edde61238c..9b2e0f851c 100644
--- a/README.md
+++ b/README.md
@@ -270,7 +270,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
-
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
## Obtaining and quantizing models
@@ -436,7 +435,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
## [`llama-perplexity`](tools/perplexity)
-#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
+#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
-
Measure the perplexity over a text file
@@ -459,8 +458,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
-[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
-[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
+[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
## [`llama-bench`](tools/llama-bench)
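(For reference, the perplexity reported by the tool is the usual exponentiated mean negative log-likelihood of the text under the model; the exact context-window handling is described in the linked README:)

$$
\mathrm{PPL}(x_1,\dots,x_N) \;=\; \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_\theta\!\left(x_i \mid x_{<i}\right)\right)
$$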
diff --git a/ci/README.md b/ci/README.md
index 6e297f1a82..8eebe988d5 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -54,7 +54,7 @@ docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
- mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+ mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
```
Inside the container, execute the following commands:
diff --git a/common/arg.cpp b/common/arg.cpp
index c1151f51da..7744fd6c48 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
string_process_escapes(seq_breaker);
}
+ for (auto & pair : params.speculative.replacements) {
+ string_process_escapes(pair.first);
+ string_process_escapes(pair.second);
+ }
}
if (!params.kv_overrides.empty()) {
@@ -1612,7 +1616,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2655,6 +2659,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.i_chunk = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--show-statistics"},
+ string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+ [](common_params & params) {
+ params.show_statistics = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--parse-special"},
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -3242,6 +3253,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+ add_opt(common_arg(
+ {"--spec-replace"}, "TARGET", "DRAFT",
+ "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+ [](common_params & params, const std::string & tgt, const std::string & dft) {
+ params.speculative.replacements.push_back({ tgt, dft });
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
@@ -3431,28 +3449,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
- // diffusion parameters
add_opt(common_arg(
{ "--diffusion-steps" }, "N",
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
[](common_params & params, int value) { params.diffusion.steps = value; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-eps" }, "F",
- string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
- [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-algorithm" }, "N",
- string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
- params.diffusion.algorithm),
- [](common_params & params, int value) { params.diffusion.algorithm = value; }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
- add_opt(common_arg(
- { "--diffusion-alg-temp" }, "F",
- string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
- [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
- ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "--diffusion-visual" },
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
@@ -3460,5 +3461,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) { params.diffusion.visual_mode = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-eps" }, "F",
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-algorithm" }, "N",
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
+ params.diffusion.algorithm),
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-alg-temp" }, "F",
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+ add_opt(common_arg(
+ { "--diffusion-block-length" }, "N",
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-cfg-scale" }, "F",
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-add-gumbel-noise" }, "F",
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+
return ctx_arg;
}
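(The new `--spec-replace` option above only collects TARGET/DRAFT string pairs, with escape sequences processed so values like "\n" work; the diff does not show how they are applied. Below is a minimal sketch of how such a replacement table could rewrite text for a draft model with an incompatible vocabulary — the helper name and flow are illustrative, not the actual code in common/speculative.cpp:)

```cpp
#include <string>
#include <utility>
#include <vector>

// Hypothetical helper: rewrite `text` for the draft model by applying each
// TARGET -> DRAFT pair collected from repeated --spec-replace flags.
static std::string apply_replacements(
        std::string text,
        const std::vector<std::pair<std::string, std::string>> & replacements) {
    for (const auto & [tgt, dft] : replacements) {
        if (tgt.empty()) {
            continue; // an empty target would loop forever
        }
        size_t pos = 0;
        while ((pos = text.find(tgt, pos)) != std::string::npos) {
            text.replace(pos, tgt.size(), dft);
            pos += dft.size(); // continue searching after the inserted text
        }
    }
    return text;
}
```

(On the command line this would be passed once per pair, e.g. `--spec-replace "<|im_start|>" "<s>"`; the token strings here are placeholders.)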
diff --git a/common/chat.cpp b/common/chat.cpp
index 114dbfccdb..0c777d7a78 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1944,6 +1944,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
}
}
auto msg = builder.result();
- LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
+ }
return msg;
}
diff --git a/common/common.h b/common/common.h
index 11427c51f6..f5acf37ff9 100644
--- a/common/common.h
+++ b/common/common.h
@@ -201,6 +201,7 @@ struct common_params_speculative {
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
float p_split = 0.1f; // speculative decoding split probability
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
+ std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -220,11 +221,17 @@ struct common_params_vocoder {
};
struct common_params_diffusion {
- int32_t steps = 64; // number of diffusion steps
- float eps = 1e-3f; // epsilon for timesteps
- int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
- float alg_temp = 0.0f; // algorithm temperature
- bool visual_mode = false; // show progressive diffusion on screen
+ int32_t steps = 128;
+ bool visual_mode = false;
+
+ float eps = 0; // epsilon for timesteps
+ int32_t block_length = 32; // block length for generation
+
+ int32_t algorithm = 4; // default algorithm: low-confidence
+ float alg_temp = 0.0f; // algorithm temperature
+
+ float cfg_scale = 0; // classifier-free guidance scale
+ bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};
enum common_reasoning_format {
@@ -432,9 +439,10 @@ struct common_params {
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
- bool process_output = false; // collect data for the output tensor
- bool compute_ppl = true; // whether to compute perplexity
- bool parse_special = false; // whether to parse special tokens during imatrix tokenization
+ bool process_output = false; // collect data for the output tensor
+ bool compute_ppl = true; // whether to compute perplexity
+ bool show_statistics = false; // show imatrix statistics per tensor
+ bool parse_special = false; // whether to parse special tokens during imatrix tokenization
// cvector-generator params
int n_pca_batch = 100;
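(The new `add_gumbel_noise` field suggests the standard Gumbel-max trick: perturb the logits with Gumbel noise scaled by the temperature so that a plain argmax over the noisy logits is equivalent to sampling from the tempered softmax. A sketch of that technique under this assumption — the diff itself does not show the sampling code:)

```cpp
#include <cmath>
#include <random>
#include <vector>

// Gumbel-max trick: with g = -log(-log(u)), u ~ U(0,1), taking
// argmax_i(logits[i]/temp + g_i) draws a sample from softmax(logits/temp).
static void add_gumbel_noise(std::vector<float> & logits, float temp, std::mt19937 & rng) {
    if (temp <= 0.0f) {
        return; // temp <= 0: keep logits deterministic (greedy argmax)
    }
    std::uniform_real_distribution<float> dist(1e-20f, 1.0f);
    for (float & l : logits) {
        const float u = dist(rng);
        l = l / temp - std::log(-std::log(u));
    }
}
```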
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 843bd1ddbd..262b2c23e7 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -1,30 +1,39 @@
#include "speculative.h"
+#include "ggml.h"
+#include "llama.h"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include <cstring>
#include <algorithm>
+#include <map>