Mirror of https://github.com/ggml-org/llama.cpp.git — synced 2025-10-31 08:51:55 +00:00.
			
		
		
		
	llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading
This commit is contained in:
		
							
								
								
									
										115
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										115
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -3548,11 +3548,11 @@ static struct ggml_cgraph * llm_build_llama( | ||||
|                     model.layers[il].ffn_gate, NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -3714,11 +3714,11 @@ static struct ggml_cgraph * llm_build_baichaun( | ||||
|                     model.layers[il].ffn_gate, NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -3884,14 +3884,14 @@ static struct ggml_cgraph * llm_build_falcon( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, attn_out); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpL_inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -3988,6 +3988,7 @@ static struct ggml_cgraph * llm_build_starcoder( | ||||
|     cb(KQ_mask, "KQ_mask", -1); | ||||
|  | ||||
|     pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); | ||||
|     cb(pos, "pos_embd", -1); | ||||
|  | ||||
|     inpL = ggml_add(ctx0, embd, pos); | ||||
|     cb(inpL, "inpL", -1); | ||||
| @@ -4027,7 +4028,7 @@ static struct ggml_cgraph * llm_build_starcoder( | ||||
|  | ||||
|         // Add the input | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_result_wo", il); | ||||
|         cb(cur, "inpL_kqv_out", il); | ||||
|  | ||||
|         struct ggml_tensor * inpFF = cur; | ||||
|  | ||||
| @@ -4044,11 +4045,11 @@ static struct ggml_cgraph * llm_build_starcoder( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         inpL = ggml_add(ctx0, cur, inpFF); | ||||
|  | ||||
|         cb(inpL, "inpL_inpFF_ffn_out", il); | ||||
|     } | ||||
|  | ||||
|     cur = llm_build_norm(ctx0, inpL, | ||||
| @@ -4294,11 +4295,11 @@ static struct ggml_cgraph * llm_build_persimmon( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, | ||||
|                     LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         inpL = cur; | ||||
|     } | ||||
| @@ -4432,11 +4433,11 @@ static struct ggml_cgraph * llm_build_refact( | ||||
|                     model.layers[il].ffn_gate, NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -4569,7 +4570,7 @@ static struct ggml_cgraph * llm_build_bloom( | ||||
|  | ||||
|         // Add the input | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_result_wo", il); | ||||
|         cb(cur, "inpL_kqv_out", il); | ||||
|  | ||||
|         struct ggml_tensor * inpFF = cur; | ||||
|  | ||||
| @@ -4586,11 +4587,11 @@ static struct ggml_cgraph * llm_build_bloom( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         inpL = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(inpL, "inpFF_+_result_w2", il); | ||||
|         cb(inpL, "inpFF_ffn_out", il); | ||||
|     } | ||||
|  | ||||
|     cur = llm_build_norm(ctx0, inpL, | ||||
| @@ -4717,7 +4718,7 @@ static struct ggml_cgraph * llm_build_mpt( | ||||
|  | ||||
|         // Add the input | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_result_wo", il); | ||||
|         cb(cur, "inpL_kqv_out", il); | ||||
|  | ||||
|         struct ggml_tensor * attn_out = cur; | ||||
|  | ||||
| @@ -4734,11 +4735,11 @@ static struct ggml_cgraph * llm_build_mpt( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, attn_out); | ||||
|         cb(cur, "inpL_+_inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpL_inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -4777,6 +4778,7 @@ enum llm_offload_func_e { | ||||
|     OFFLOAD_FUNC_OUT, | ||||
| }; | ||||
|  | ||||
| // TODO: will be removed with backend v2 | ||||
| struct llm_offload_trie { | ||||
|     struct node { | ||||
|         ~node() { | ||||
| @@ -4850,10 +4852,12 @@ struct llm_offload_trie { | ||||
|     node * root = nullptr; | ||||
| }; | ||||
|  | ||||
| // TODO: will be removed with backend v2 | ||||
| static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = { | ||||
|   //{ "inp_tokens",                 OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel | ||||
|   //{ "inp_embd",                   OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel | ||||
|     { "inp_pos",                    OFFLOAD_FUNC_NR  }, | ||||
|     { "pos_embd",                   OFFLOAD_FUNC_NR  }, | ||||
|  | ||||
|     { "KQ_mask",                    OFFLOAD_FUNC_NR  }, | ||||
|     { "K_shift",                    OFFLOAD_FUNC_NR  }, | ||||
| @@ -4902,7 +4906,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map | ||||
|     { "kqv_wo",                     OFFLOAD_FUNC_V   }, | ||||
|     { "kqv_out",                    OFFLOAD_FUNC_V   }, | ||||
|  | ||||
|     { "inpL_+_result_wo",           OFFLOAD_FUNC     }, | ||||
|     { "inpL_kqv_out",               OFFLOAD_FUNC     }, | ||||
|     { "inpFF",                      OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "ffn_norm",                   OFFLOAD_FUNC     }, | ||||
| @@ -4914,15 +4918,15 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map | ||||
|     { "ffn_gate_par",               OFFLOAD_FUNC     }, | ||||
|     { "ffn_down",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_down_b",                 OFFLOAD_FUNC     }, | ||||
|     { "ffn_result",                 OFFLOAD_FUNC     }, | ||||
|     { "ffn_out",                    OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "ffn_silu",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_gelu",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_relu",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_sqr(relu)",              OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "inpFF_+_result_w2",          OFFLOAD_FUNC     }, | ||||
|     { "inpL_+_inpFF_+_result_w2",   OFFLOAD_FUNC     }, | ||||
|     { "inpFF_ffn_out",              OFFLOAD_FUNC     }, | ||||
|     { "inpL_inpFF_ffn_out",         OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "result_norm",                OFFLOAD_FUNC_EMB }, | ||||
|     { "result_output",              OFFLOAD_FUNC_OUT }, | ||||
| @@ -4946,6 +4950,14 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|     bool alloc_inp_KQ_mask  = false; | ||||
|     bool alloc_inp_K_shift  = false; | ||||
|  | ||||
| #ifdef GGML_USE_CUBLAS | ||||
|     const bool do_offload = true; | ||||
| #else | ||||
|     const bool do_offload = true; // TODO: set to false after finishing refactoring | ||||
| #endif | ||||
|  | ||||
|     int n_non_view = 0; // number of non-view tensors that have been processed by the callback | ||||
|  | ||||
|     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) | ||||
|     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { | ||||
|         if (il >= 0) { | ||||
| @@ -5053,23 +5065,23 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|             alloc_inp_K_shift = true; | ||||
|         } | ||||
|  | ||||
|         // | ||||
|         // offload layers | ||||
|         // | ||||
|         // TODO: this code will be obsoleted with backend v2 | ||||
|  | ||||
| #ifdef GGML_USE_CUBLAS | ||||
|         const bool do_offload = true; | ||||
| #else | ||||
|         const bool do_offload = true; // TODO: set to false after finishing refactoring | ||||
| #endif | ||||
|  | ||||
|         if (!do_offload) { | ||||
|         // view tensors are not processed further | ||||
|         if (cur->view_src != nullptr) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         // view tensors are not offloaded | ||||
|         if (cur->view_src != nullptr) { | ||||
|         if (cur->op != GGML_OP_NONE) { | ||||
|             n_non_view++; | ||||
|         } | ||||
|  | ||||
|         // | ||||
|         // offload layers | ||||
|         // | ||||
|         // TODO: will be removed with backend v2 | ||||
|  | ||||
| //#define LLAMA_OFFLOAD_DEBUG | ||||
|  | ||||
|         if (!do_offload) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
| @@ -5103,11 +5115,13 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|         llm_offload_func_e func_e = k_offload_func_trie.find(name); | ||||
|  | ||||
|         if (func_e == OFFLOAD_FUNC_NOP) { | ||||
| #ifdef LLAMA_OFFLOAD_DEBUG | ||||
|             // if a tensor hasn't been offloaded, we warn the user | ||||
|             if (worst_case) { | ||||
|                 LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, | ||||
|                         cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); | ||||
|             } | ||||
| #endif | ||||
|  | ||||
|             return; | ||||
|         } | ||||
| @@ -5170,9 +5184,11 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|         // apply offload function to the tensor | ||||
|         func(cur); | ||||
|  | ||||
| #ifdef LLAMA_OFFLOAD_DEBUG | ||||
|         if (worst_case) { | ||||
|             LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); | ||||
|         } | ||||
| #endif | ||||
|     }; | ||||
|  | ||||
|     struct ggml_cgraph * result = NULL; | ||||
| @@ -5214,6 +5230,29 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|             GGML_ASSERT(false); | ||||
|     } | ||||
|  | ||||
|     if (worst_case) { | ||||
|         int n_non_view_total = 0; | ||||
|  | ||||
|         for (int i = 0; i < result->n_nodes; ++i) { | ||||
|             if (result->nodes[i]->view_src == nullptr) { | ||||
|                 n_non_view_total++; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); | ||||
|  | ||||
| #ifdef LLAMA_OFFLOAD_DEBUG | ||||
|         if (n_non_view != n_non_view_total) { | ||||
|             LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); | ||||
|             LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n",     __func__); | ||||
|             LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n",    __func__); | ||||
|             LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n",                     __func__); | ||||
|             LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n",            __func__); | ||||
|             LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); | ||||
|         } | ||||
| #endif | ||||
|     } | ||||
|  | ||||
|     return result; | ||||
| } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov