mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	rebase to master (except ggml-cuda)
This commit is contained in:
		| @@ -289,7 +289,7 @@ void ggml_graph_splits_add_n_va(struct ggml_graph_splits * splits, struct ggml_t | |||||||
|  |  | ||||||
|     if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) { |     if ((*inputs[0])->backend == ggml_get_ctx_backend(ctx)) { | ||||||
|         if (splits->n_splits > 0) { |         if (splits->n_splits > 0) { | ||||||
|             char name[GGML_MAX_NAME - 1]; // silence -Wformat-truncation |             char name[GGML_MAX_NAME]; | ||||||
|             vsnprintf(name, sizeof(name), fmt, args); |             vsnprintf(name, sizeof(name), fmt, args); | ||||||
|             char new_name[GGML_MAX_NAME]; |             char new_name[GGML_MAX_NAME]; | ||||||
|             snprintf(new_name, sizeof(new_name), "%s,%s", splits->splits[splits->n_splits - 1].name, name); |             snprintf(new_name, sizeof(new_name), "%s,%s", splits->splits[splits->n_splits - 1].name, name); | ||||||
|   | |||||||
							
								
								
									
										30
									
								
								ggml-cuda.cu
									
									
									
									
									
								
							
							
						
						
									
										30
									
								
								ggml-cuda.cu
									
									
									
									
									
								
							| @@ -1475,8 +1475,8 @@ static void ggml_cuda_mul_mat(ggml_cuda_context * ctx, ggml_tensor * src0, ggml_ | |||||||
| } | } | ||||||
|  |  | ||||||
| static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cudaStream_t stream) { | static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cudaStream_t stream) { | ||||||
|     ggml_tensor * src0 = node->src0; |     ggml_tensor * src0 = node->src[0]; | ||||||
|     ggml_tensor * src1 = node->src1; |     ggml_tensor * src1 = node->src[1]; | ||||||
|     ggml_tensor * dst  = node; |     ggml_tensor * dst  = node; | ||||||
|  |  | ||||||
| #if 0 | #if 0 | ||||||
| @@ -1551,8 +1551,6 @@ static void ggml_cuda_exec_node(ggml_cuda_context * ctx, ggml_tensor * node, cud | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| static const int GGML_MAX_PARENTS = 2 + GGML_MAX_OPT; |  | ||||||
|  |  | ||||||
| static bool ggml_is_noop(ggml_tensor * t) { | static bool ggml_is_noop(ggml_tensor * t) { | ||||||
|     return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || |     return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || | ||||||
|            t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE; |            t->op == GGML_OP_PERMUTE || t->op == GGML_OP_NONE; | ||||||
| @@ -1581,26 +1579,20 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph * | |||||||
|         ggml_tensor * node = gf->nodes[i]; |         ggml_tensor * node = gf->nodes[i]; | ||||||
|         const bool is_noop = ggml_is_noop(node); |         const bool is_noop = ggml_is_noop(node); | ||||||
|  |  | ||||||
|         // build a list of parents |  | ||||||
|         ggml_tensor * parents[GGML_MAX_PARENTS] = { node->src0, node->src1 }; |  | ||||||
|         for (int j = 0; j < GGML_MAX_OPT; j++) { |  | ||||||
|             parents[j + 2] = node->opt[j]; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // assign an stream for the node |         // assign an stream for the node | ||||||
|         cudaStream_t stream = nullptr; |         cudaStream_t stream = nullptr; | ||||||
|  |  | ||||||
|         // take a stream from a parent |         // take a stream from a parent | ||||||
|         for (int j = 0; j < GGML_MAX_PARENTS; j++) { |         for (int j = 0; j < GGML_MAX_SRC; j++) { | ||||||
|             if (parents[j] && stream_map.count(parents[j]) && stream_map[parents[j]] != nullptr) { |             if (node->src[j] && stream_map.count(node->src[j]) && stream_map[node->src[j]] != nullptr) { | ||||||
|                 stream = stream_map[parents[j]]; |                 stream = stream_map[node->src[j]]; | ||||||
|                 stream_map.erase(parents[j]); |                 stream_map.erase(node->src[j]); | ||||||
|  |  | ||||||
|                 if (is_noop) { |                 if (is_noop) { | ||||||
|                     // if this is a noop, we can use the parent's event |                     // if this is a noop, we can use the parent's event | ||||||
|                     stream_map[node] = stream; |                     stream_map[node] = stream; | ||||||
|                     if (event_map.count(parents[j]) > 0) { |                     if (event_map.count(node->src[j]) > 0) { | ||||||
|                         event_map[node] = event_map[parents[j]]; |                         event_map[node] = event_map[node->src[j]]; | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 break; |                 break; | ||||||
| @@ -1624,9 +1616,9 @@ static void ggml_cuda_graph_exec_parallel(ggml_cuda_context * ctx, ggml_cgraph * | |||||||
|  |  | ||||||
|         // wait on parent streams |         // wait on parent streams | ||||||
|         bool waited = false; |         bool waited = false; | ||||||
|         for (int j = 0; j < GGML_MAX_PARENTS; j++) { |         for (int j = 0; j < GGML_MAX_SRC; j++) { | ||||||
|             if (parents[j] && event_map.count(parents[j]) > 0) { |             if (node->src[j] && event_map.count(node->src[j]) > 0) { | ||||||
|                 CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[parents[j]], 0)); |                 CUDA_CHECK(cudaStreamWaitEvent(stream, event_map[node->src[j]], 0)); | ||||||
|                 waited = true; |                 waited = true; | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -6855,7 +6855,9 @@ struct ggml_tensor * ggml_rope_impl( | |||||||
|     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); |     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); | ||||||
|  |  | ||||||
|     // TODO: just use a struct |     // TODO: just use a struct | ||||||
|     int32_t params[] = { n_past, n_dims, mode, n_ctx, *(int32_t*)&freq_base, *(int32_t*)&freq_scale}; |     int32_t params[6] = { n_past, n_dims, mode, n_ctx }; | ||||||
|  |     memcpy(params + 4, &freq_base, sizeof(float)); | ||||||
|  |     memcpy(params + 5, &freq_scale, sizeof(float)); | ||||||
|     assert(GGML_MAX_OP_PARAMS >= sizeof(params)); |     assert(GGML_MAX_OP_PARAMS >= sizeof(params)); | ||||||
|     memcpy(result->params, &params, sizeof(params)); |     memcpy(result->params, &params, sizeof(params)); | ||||||
|  |  | ||||||
| @@ -7127,13 +7129,11 @@ struct ggml_tensor* ggml_pool_1d( | |||||||
|     }; |     }; | ||||||
|     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); |     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); | ||||||
|  |  | ||||||
|     ggml_scratch_save(ctx); |  | ||||||
|     struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); |     struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); | ||||||
|     ((int32_t*)c->data)[0] = op; |     ((int32_t*)c->data)[0] = op; | ||||||
|     ((int32_t*)c->data)[1] = k0; |     ((int32_t*)c->data)[1] = k0; | ||||||
|     ((int32_t*)c->data)[2] = s0; |     ((int32_t*)c->data)[2] = s0; | ||||||
|     ((int32_t*)c->data)[3] = p0; |     ((int32_t*)c->data)[3] = p0; | ||||||
|     ggml_scratch_load(ctx); |  | ||||||
|  |  | ||||||
|     result->op = GGML_OP_POOL_1D; |     result->op = GGML_OP_POOL_1D; | ||||||
|     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; |     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; | ||||||
| @@ -7170,7 +7170,6 @@ struct ggml_tensor* ggml_pool_2d( | |||||||
|     }; |     }; | ||||||
|     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); |     struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); | ||||||
|  |  | ||||||
|     ggml_scratch_save(ctx); |  | ||||||
|     struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); |     struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); | ||||||
|     ((int32_t*)c->data)[0] = op; |     ((int32_t*)c->data)[0] = op; | ||||||
|     ((int32_t*)c->data)[1] = k0; |     ((int32_t*)c->data)[1] = k0; | ||||||
| @@ -7179,7 +7178,6 @@ struct ggml_tensor* ggml_pool_2d( | |||||||
|     ((int32_t*)c->data)[4] = s1; |     ((int32_t*)c->data)[4] = s1; | ||||||
|     ((int32_t*)c->data)[5] = p0; |     ((int32_t*)c->data)[5] = p0; | ||||||
|     ((int32_t*)c->data)[6] = p1; |     ((int32_t*)c->data)[6] = p1; | ||||||
|     ggml_scratch_load(ctx); |  | ||||||
|  |  | ||||||
|     result->op = GGML_OP_POOL_2D; |     result->op = GGML_OP_POOL_2D; | ||||||
|     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; |     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; | ||||||
| @@ -15823,7 +15821,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (node->op == GGML_OP_NONE && node->src0 == NULL && node->src1 == NULL && node->grad == NULL) { |     // TODO: add ggml_dependency instead of checking for NULL | ||||||
|  |     if (node->op == GGML_OP_NONE && node->src[0] == NULL && node->src[1] == NULL && node->grad == NULL) { | ||||||
|         // reached a leaf node, not part of the gradient graph (e.g. a constant) |         // reached a leaf node, not part of the gradient graph (e.g. a constant) | ||||||
|         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); |         GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								ggml.h
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								ggml.h
									
									
									
									
									
								
							| @@ -199,7 +199,7 @@ | |||||||
| #define GGML_MAX_CONTEXTS      64 | #define GGML_MAX_CONTEXTS      64 | ||||||
| #define GGML_MAX_SRC           6 | #define GGML_MAX_SRC           6 | ||||||
| #define GGML_MAX_NAME          48 | #define GGML_MAX_NAME          48 | ||||||
| #define GGML_MAX_OP_PARAMS     16 | #define GGML_MAX_OP_PARAMS     32 | ||||||
| #define GGML_DEFAULT_N_THREADS 4 | #define GGML_DEFAULT_N_THREADS 4 | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1168,7 +1168,7 @@ static ggml_graph_splits llama_build_graph( | |||||||
|  |  | ||||||
|     struct ggml_graph_splits splits = ggml_graph_split_init(); |     struct ggml_graph_splits splits = ggml_graph_split_init(); | ||||||
|  |  | ||||||
|     // initalize contexts for every backend |     // initialize contexts for every backend | ||||||
|  |  | ||||||
|     struct ggml_context * ctx_cpu = nullptr; |     struct ggml_context * ctx_cpu = nullptr; | ||||||
|     // TODO: don't create context if there are no CPU layers |     // TODO: don't create context if there are no CPU layers | ||||||
| @@ -1295,8 +1295,8 @@ static ggml_graph_splits llama_build_graph( | |||||||
|                 // TODO: replace with ggml_dependency / ggml_depends_on |                 // TODO: replace with ggml_dependency / ggml_depends_on | ||||||
|                 k = ggml_view_tensor(ctx_kv, kv_self.k); |                 k = ggml_view_tensor(ctx_kv, kv_self.k); | ||||||
|                 v = ggml_view_tensor(ctx_kv, kv_self.v); |                 v = ggml_view_tensor(ctx_kv, kv_self.v); | ||||||
|                 k->src0 = k_cpy; |                 k->src[0] = k_cpy; | ||||||
|                 v->src0 = v_cpy; |                 v->src[0] = v_cpy; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             struct ggml_tensor * Q = |             struct ggml_tensor * Q = | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 slaren
					slaren