vulkan: Fuse rope+set_rows (#16769)
This pattern appears in a lot of models: the rope operation is applied right before storing into the KV cache (usually on the K tensor).

Add a path to some of the rope shaders that computes the destination address based on the set_rows tensor, and compile variants of the shaders with a D_TYPE of f16 (the usual KV cache type). Add a src3 operand to ggml_vk_op_f32, since rope sometimes uses three srcs and needs a fourth for the row indices. Add fused_ops_write_mask to indicate which intermediate tensors need to write their results to memory; skipping the write of the roped K value allows more nodes to run concurrently.

Add logic to ggml_vk_graph_optimize to make ROPE+VIEW+SET_ROWS consecutive, since they rarely start out that way in the graph.

Add new backend tests.
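For context, the subgraph shape that the fusion targets looks like the following minimal sketch, which mirrors the new test's build_graph rather than any specific model. The sizes (head_dim 128, 32 KV heads, 4 tokens, 1024 cache slots), the function name, and the kv_cache_k parameter are all illustrative, and ctx is assumed to be an already-initialized ggml_context:

    #include "ggml.h"

    // Sketch of the ROPE -> VIEW -> SET_ROWS pattern that gets fused.
    // kv_cache_k is assumed to be an f16 cache of shape [128*32, 1024],
    // one row per cache slot (illustrative sizes).
    static ggml_tensor * store_roped_k(ggml_context * ctx, ggml_tensor * kv_cache_k) {
        ggml_tensor * k    = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 32, 4); // K for 4 new tokens
        ggml_tensor * pos  = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);          // position of each token
        ggml_tensor * idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 4);          // cache slot of each token

        ggml_tensor * roped = ggml_rope(ctx, k, pos, 128, GGML_ROPE_TYPE_NEOX);       // GGML_OP_ROPE
        ggml_tensor * rows  = ggml_view_2d(ctx, roped, 128 * 32, 4, roped->nb[2], 0); // GGML_OP_VIEW: one row per token
        return ggml_set_rows(ctx, kv_cache_k, rows, idxs);                            // GGML_OP_SET_ROWS
    }

With the fusion, the rope shader computes each destination address from idxs and writes directly into kv_cache_k (converting to f16 via the new D_TYPE variants), and fused_ops_write_mask lets it skip the intermediate write of roped.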
@@ -2125,6 +2125,34 @@ struct test_get_rows_back : public test_case {
     }
 };

+static void init_set_rows_row_ids(ggml_tensor * t, int num_rows) {
+    std::random_device rd;
+    std::default_random_engine rng(rd());
+    for (int i2 = 0; i2 < t->ne[2]; i2++) {
+        for (int i1 = 0; i1 < t->ne[1]; i1++) {
+            // generate a shuffled subset of row indices
+            std::vector<int64_t> data(num_rows);
+            for (int i = 0; i < num_rows; i++) {
+                data[i] = i;
+            }
+            std::shuffle(data.begin(), data.end(), rng);
+            data.resize(t->ne[0]);
+
+            const size_t offs = i1*t->nb[1] + i2*t->nb[2];
+            if (t->type == GGML_TYPE_I32) {
+                // TODO: Make a template or something
+                std::vector<int32_t> data_i32(t->ne[0]);
+                for (int i = 0; i < t->ne[0]; i++) {
+                    data_i32[i] = static_cast<int32_t>(data[i]);
+                }
+                ggml_backend_tensor_set(t, data_i32.data(), offs, t->ne[0]*sizeof(int32_t));
+            } else {
+                ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t));
+            }
+        }
+    }
+}
+
 // GGML_OP_SET_ROWS
 struct test_set_rows : public test_case {
     const ggml_type type;
@@ -2168,37 +2196,13 @@ struct test_set_rows : public test_case {
     }

     void initialize_tensors(ggml_context * ctx) override {
-        std::random_device rd;
-        std::default_random_engine rng(rd());
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
                 if (ggml_is_view_op(t->op)) {
                     continue;
                 }

-                for (int i2 = 0; i2 < t->ne[2]; i2++) {
-                    for (int i1 = 0; i1 < t->ne[1]; i1++) {
-                        // generate a shuffled subset of row indices
-                        std::vector<int64_t> data(ne[1]);
-                        for (int i = 0; i < ne[1]; i++) {
-                            data[i] = i;
-                        }
-                        std::shuffle(data.begin(), data.end(), rng);
-                        data.resize(t->ne[0]);
-
-                        const size_t offs = i1*t->nb[1] + i2*t->nb[2];
-                        if (t->type == GGML_TYPE_I32) {
-                            // TODO: Make a template or something
-                            std::vector<int32_t> data_i32(t->ne[0]);
-                            for (int i = 0; i < t->ne[0]; i++) {
-                                data_i32[i] = static_cast<int32_t>(data[i]);
-                            }
-                            ggml_backend_tensor_set(t, data_i32.data(), offs, t->ne[0]*sizeof(int32_t));
-                        } else {
-                            ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t));
-                        }
-                    }
-                }
+                init_set_rows_row_ids(t, ne[1]);
             } else {
                 init_tensor_uniform(t);
             }
@@ -2227,6 +2231,67 @@ struct test_set_rows : public test_case {
     }
 };

+// GGML_OP_ROPE + GGML_OP_VIEW + GGML_OP_SET_ROWS
+struct test_rope_set_rows : public test_case {
+    const ggml_type type;
+    const ggml_type type_idx;
+    const std::array<int64_t, 4> ne;
+    int mode;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, type_idx, ne, mode);
+    }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "ROPE_SET_ROWS";
+    }
+
+    bool run_whole_graph() override { return true; }
+
+    test_rope_set_rows(ggml_type type,
+            ggml_type type_idx,
+            std::array<int64_t, 4> ne,
+            int mode)
+        : type(type), type_idx(type_idx), ne(ne), mode(mode) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1);
+        ggml_set_name(src, "src");
+
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+
+        ggml_tensor * rope = ggml_rope(ctx, src, pos, ne[0], mode);
+
+        ggml_tensor * view = ggml_view_2d(ctx, rope, ne[0] * ne[1], ne[2], rope->nb[2], 0);
+
+        ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne[0] * ne[1], ne[2] * ne[3], 1, 1);
+        ggml_set_name(dst, "dst");
+
+        ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne[2], 1, 1);
+        ggml_set_name(row_idxs, "row_idxs");
+
+        ggml_tensor * out = ggml_set_rows(ctx, dst, view, row_idxs);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
+                if (ggml_is_view_op(t->op)) {
+                    continue;
+                }
+
+                init_set_rows_row_ids(t, ne[2]);
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
 // GGML_OP_ARGMAX
 struct test_argmax : public test_case {
     const ggml_type type;
@@ -6163,6 +6228,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }

+    for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX }) {
+        for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+            test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 1, 100 }, mode));
+            test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 512, 1 }, mode));
+        }
+    }
+
     for (ggml_type type_input : {GGML_TYPE_F32}) {
         for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
             for (int k0 : {1, 3}) {
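The new cases report their op description as ROPE_SET_ROWS, so (assuming the usual test-backend-ops op filter flag) they can be run in isolation with something like:

    test-backend-ops test -o ROPE_SET_ROWS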