ggml : move more prints to the ggml log system (#9839)

* ggml : move more prints to the ggml log system
* show BLAS OpenMP warnings in all builds using debug print
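The prints that previously went straight to `stderr` now go through the ggml log macros (`GGML_LOG_DEBUG`, `GGML_LOG_ERROR`, ...), so an application can capture or filter them with a log callback instead of scraping `stderr`. A minimal sketch of how a caller might surface the debug-level output, assuming the `ggml_log_set` / `ggml_log_callback` API declared in `ggml.h`:

```c
#include <stdio.h>

#include "ggml.h"

// Forward every ggml log message verbatim to stderr.
// The level argument can be used to filter, e.g. to drop
// GGML_LOG_LEVEL_DEBUG messages in production builds.
static void app_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
}

int main(void) {
    // Install the callback before initializing backends so that all later
    // GGML_LOG_* output is routed through it.
    ggml_log_set(app_log_callback, NULL);
    // ... create contexts, allocate buffers, run graphs ...
    return 0;
}
```

Note that most of the converted call sites are still wrapped in `#ifndef NDEBUG`, so they only produce output in debug builds; the OpenBLAS/BLIS OpenMP warnings, by contrast, are now compiled unconditionally and emitted at debug level.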
@@ -14,7 +14,7 @@

 //#define GGML_ALLOCATOR_DEBUG

-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #define AT_PRINTF(...)


@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     size = GGML_PAD(size, talloc->alignment);

     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
-        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         GGML_ABORT("not enough space in the buffer");
     }
@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
             // this should never happen
-            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
             GGML_ABORT("not enough space in the buffer");
         }
@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
                 }
             }
         }
-        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                     alloc->allocated_tensors[i].offset,
                     alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                     ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
-        fprintf(stderr, "\n");
+        GGML_LOG_DEBUG("\n");
     }
 #endif

@@ -768,13 +768,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
         if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
-                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
             ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +825,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
 static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
     if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
 #endif
         return true;
     }

     if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+        GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
 #endif
         return true;
     }
@@ -843,7 +843,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph

         if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+            GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
 #endif
             return true;
         }
@@ -855,7 +855,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             }
             if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
-                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+                GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
                 return true;
             }
@@ -869,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     if (ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+            GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
             if (!ggml_gallocr_reserve(galloc, graph)) {
                 return false;
             }
         } else {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+            GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 #endif
             return false;
         }
@@ -940,7 +940,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+        GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
             ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +990,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
         }

         if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+            GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                     __func__, t->name,
                     ggml_backend_buft_name(buft),
                     this_size, max_size);
@@ -1022,7 +1022,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

     if (n_buffers == 0) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+        GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
         return NULL;
     }

@@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
     } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+        GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
 #endif
         size_t nbytes = ggml_nbytes(src);
         void * data = malloc(nbytes);
@@ -571,7 +571,7 @@ struct ggml_backend_registry {

     void register_backend(ggml_backend_reg_t reg) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
+        GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
 #endif
         backends.push_back(reg);
@@ -582,7 +582,7 @@ struct ggml_backend_registry {

     void register_device(ggml_backend_dev_t device) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
 #endif
         devices.push_back(device);
     }
@@ -773,7 +773,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
     size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
     if (data == NULL) {
-        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
     }

@@ -836,7 +836,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
     if (result != 0) {
-        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
         return NULL;
     }

@@ -1459,7 +1459,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
     }

 #ifndef NDEBUG
-    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+    GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
         __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
 #endif

@@ -1548,13 +1548,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
-                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+                GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }
-            fprintf(stderr, "\n");
+            GGML_LOG_DEBUG("\n");
             cur_split++;
         }
         struct ggml_tensor * node = graph->nodes[i];
@@ -1562,7 +1562,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
             continue;
         }
         ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+        GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
             fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -1570,10 +1570,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
                 continue;
             }
             ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+            GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
                 fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
         }
-        fprintf(stderr, "\n");
+        GGML_LOG_DEBUG("\n");
     }
 }

@@ -2087,11 +2087,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+        GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
         ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
         if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
-            fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+            GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
             return false;
         }
     }
@@ -2485,7 +2485,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     struct ggml_context * ctx_unallocated = ggml_init(params);

     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
-        fprintf(stderr, "failed to allocate context for graph copy\n");
+        GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
@@ -2508,7 +2508,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
     // allocate nodes
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
-        fprintf(stderr, "failed to allocate buffer for graph copy\n");
+        GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
         ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);

@@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) {
         /* .context   = */ ctx,
     };

-#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
     if (openblas_get_parallel() != OPENBLAS_OPENMP) {
-        fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
     }
 #endif

-#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
-    fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+    GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
 #endif

     return backend;

@@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -2406,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_

     if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
-        GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif
         return false;
     }
@@ -2524,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2575,14 +2575,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
                 use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
             }

             if (node->op == GGML_OP_MUL_MAT_ID) {
                 use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
             }

@@ -2591,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 // Changes in batch size or context size can cause changes to the grid size of some kernels.
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
             }

@@ -2603,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 if (!ptr) {
                     use_cuda_graph = false;
 #ifndef NDEBUG
-                    GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+                    GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
                 } else {
                     if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2627,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2685,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
                 use_cuda_graph = false;
                 cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
             } else {
                 graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2854,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         // clear the error
         cudaGetLastError();

-        GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                            size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }

Author: Diego Devesa