graph : restore ubatch in build_cb
ggml-ci
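This restores the ubatch parameter that had been dropped from the graph-build callback, so the offload heuristic in llama_context::build_cb can once again look at ubatch.n_tokens. Condensed before/after of the interface, taken from the hunks below (only these signatures are from the diff itself):

// before: the callback had no view of the current micro-batch
virtual void build_cb(ggml_tensor * cur, const char * name, int il) = 0;

// after: the ubatch is threaded through every implementation and call site
virtual void build_cb(ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, int il) = 0;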
@@ -196,6 +196,7 @@ bool llama_context::apply_adapter_cvec(
 void llama_context::build_cb(
         ggml_tensor * cur,
         const char * name,
+        const llama_ubatch & ubatch,
         int il) {
     if (il >= 0) {
         ggml_format_name(cur, "%s-%d", name, il);
@@ -213,10 +214,7 @@ void llama_context::build_cb(
     // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
     // FIXME: fix in ggml_backend_sched
     const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
-    // TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify
-    // not sure if this is still needed, but it can be brought back if needed
-    //if (ubatch.n_tokens < 32 || full_offload) {
-    if (full_offload) {
+    if (ubatch.n_tokens < 32 || full_offload) {
         if (il != -1 && strcmp(name, "norm") == 0) {
             const auto & dev_layer = model.dev_layer(il);
             for (auto & backend : backends) {
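The restored condition decides when the "norm" output of layer il gets pinned to the backend that holds that layer's weights, instead of being assigned automatically by ggml_backend_sched (the transfer problem the FIXME comment refers to). The body of the for-loop is truncated in the hunk above; a hedged sketch of what such pinning looks like with the public ggml-backend API (the helper name and the exact loop body are assumptions, not the committed code):

#include "ggml-backend.h"

#include <vector>

// sketch: pin `cur` to the backend whose device matches the layer's device,
// provided that backend can actually run the op
static void pin_to_layer_backend(
        ggml_backend_sched_t sched,
        const std::vector<ggml_backend_t> & backends,
        ggml_backend_dev_t dev_layer,
        ggml_tensor * cur) {
    for (ggml_backend_t backend : backends) {
        if (ggml_backend_get_device(backend) == dev_layer &&
            ggml_backend_supports_op(backend, cur)) {
            ggml_backend_sched_set_tensor_backend(sched, cur, backend);
            break;
        }
    }
}

The guard (ubatch.n_tokens < 32 || full_offload) limits the pinning to small decode batches and fully offloaded models, presumably the cases where the extra cross-backend transfer hurts the most.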
@@ -85,6 +85,7 @@ struct llama_context : public llama_graph_i {
     virtual void build_cb(
             ggml_tensor * cur,
             const char * name,
+            const llama_ubatch & ubatch,
             int il);
 
     // TODO: add encode/decode graphs
@@ -14,6 +14,7 @@ public:
     virtual void build_cb(
             ggml_tensor * cur,
             const char * name,
+            const llama_ubatch & ubatch,
             int il) = 0;
 
     // apply control vector for layer il
@@ -248,6 +248,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
             return cur_buft;
         }
     }
+
     return nullptr;
 }
 
@@ -3888,7 +3889,7 @@ struct llm_build_context {
 
     // TODO: tmp
     void cb(struct ggml_tensor * cur, const char * name, int il) {
-        lgf.build_cb(cur, name, il);
+        lgf.build_cb(cur, name, ubatch, il);
     }
 
     // TODO: tmp
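Because llm_build_context already has the ubatch in scope, the per-layer builders keep calling the 3-argument cb() and the shim above forwards the extra parameter. A minimal usage sketch (the builder body is illustrative, assuming a const llama_ubatch & ubatch member on llm_build_context; ctx0 and hparams follow llama.cpp naming conventions):

// inside some llm_build_context build function (illustrative):
cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps);
cb(cur, "norm", il); // unchanged call site; cb() now also passes ubatch on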