ggml: add names to tensors (#1268)

* ggml: add names to tensors
* minor improvements to dot file formatting
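For context, the ggml_set_name() calls that appear throughout the diff below come from the small API this PR adds to ggml. A minimal sketch of the intended usage, assuming the ggml API of this era (ggml_set_name(), ggml_build_forward(), ggml_graph_dump_dot()); the tensors and names here are illustrative only:

// Sketch (not part of this commit): name tensors, then dump the graph to a
// Graphviz dot file, where the names label the nodes instead of anonymous ids.
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,   // 16 MB arena for tensors + graph
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_name(a, "a");
    ggml_set_name(b, "b");

    struct ggml_tensor * c = ggml_mul(ctx, a, b);
    ggml_set_name(c, "a*b");    // shows up on the corresponding dot node

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_dump_dot(&gf, NULL, "graph.dot");  // render with: dot -Tpng graph.dot

    ggml_free(ctx);
    return 0;
}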
llama.cpp | 24 ++++++++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
@@ -659,6 +659,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(lt.ne.size() == 1);
             tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
         }
+        ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -798,6 +799,8 @@ static bool kv_cache_init(
 
     cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
     cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    ggml_set_name(cache.k, "cache_k");
+    ggml_set_name(cache.v, "cache_v");
 
     return true;
 }
@@ -1084,6 +1087,7 @@ static bool llama_eval_internal(
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
@@ -1110,6 +1114,8 @@ static bool llama_eval_internal(
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            ggml_set_name(Qcur, "Qcur");
+            ggml_set_name(Kcur, "Kcur");
 
             // store key and value to memory
             {
@@ -1130,6 +1136,7 @@ static bool llama_eval_internal(
                 ggml_permute(ctx0,
                         Qcur,
                         0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
 
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
@@ -1137,21 +1144,26 @@ static bool llama_eval_internal(
                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
+            ggml_set_name(K, "K");
 
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
+            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
+            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
             // split cached V into n_head heads
             struct ggml_tensor * V =
@@ -1160,9 +1172,11 @@ static bool llama_eval_internal(
                         n_ctx*ggml_element_size(kv_self.v),
                         n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
+            ggml_set_name(V, "V");
 
 #if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
             // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1173,11 +1187,13 @@ static bool llama_eval_internal(
 
             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
 
             // cur = KQV_merged.contiguous().view(n_embd, N)
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
 
             // projection (no bias)
             cur = ggml_mul_mat(ctx0,
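A small follow-on sketch (not part of the diff): once tensors carry names, a debugging pass over the built graph can report them. This assumes the era's public ggml_cgraph fields (n_nodes/nodes) and a ggml_get_name() accessor; the helper name is hypothetical.

#include <stdio.h>
#include "ggml.h"

// Print each node of an already-built graph with its (possibly empty) name,
// e.g. "Qcur", "KQ_scaled", "KQV_merged_contiguous" for the graph above.
static void dump_node_names(const struct ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; i++) {
        const struct ggml_tensor * t = gf->nodes[i];
        printf("node %3d: op=%-2d name=%s\n", i, (int) t->op, ggml_get_name(t));
    }
}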
Author: slaren