mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00)

	llama : fix data units
ggml-ci
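The change replaces the binary 1024-based divisions (which actually compute MiB/GiB) with decimal 1e6/1e9 divisions across the CUDA and Metal backends and llama.cpp itself, so the printed "MB"/"GB" labels match the SI units they claim. A minimal, hypothetical C sketch of the difference (illustration only, not part of the patch):

#include <stdio.h>
#include <stddef.h>

// Hypothetical helper for illustration only: report the same byte count with
// the old binary divisor (mebibytes) and the new decimal divisor (megabytes).
static void report_size(size_t n_bytes) {
    printf("binary : %8.2f MiB\n", n_bytes / 1024.0 / 1024.0); // what the old code computed
    printf("decimal: %8.2f MB\n",  n_bytes / 1e6);             // what the new code prints
}

int main(void) {
    report_size(536870912); // 512.00 MiB vs 536.87 MB
    return 0;
}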
@@ -5841,7 +5841,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     }
 #ifdef DEBUG_CUDA_MALLOC
     fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+            (uint32_t)(max_size/1e6), (uint32_t)(tot_size/1e6), (uint32_t)(size/1e6));
 #endif
     void * ptr;
     size_t look_ahead_size = (size_t) (1.05 * size);
@@ -5979,7 +5979,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // This can fixed the OOM error in WSL.
         cudaGetLastError();
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            size/1024.0/1024.0, cudaGetErrorString(err));
+            size/1e6, cudaGetErrorString(err));
         return nullptr;
     }
 

ggml-metal.m (18 changed lines)

@@ -346,9 +346,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     }
 
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
     } else {
         GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
     }
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
                 return false;
             }
 
-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);
 
             ++ctx->n_buffers;
         } else {
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
                 ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
                 if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
                     return false;
                 }
 
-                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
                 if (i + size_step < size) {
                     GGML_METAL_LOG_INFO("\n");
                 }
@@ -580,8 +580,8 @@ bool ggml_metal_add_buffer(
 
 #if TARGET_OS_OSX
         GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-                ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-                ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+                ctx->device.currentAllocatedSize / 1e6,
+                ctx->device.recommendedMaxWorkingSetSize / 1e6);
 
         if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
             GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
@@ -589,7 +589,7 @@ bool ggml_metal_add_buffer(
             GGML_METAL_LOG_INFO("\n");
         }
 #else
-        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
 #endif
     }
 

llama.cpp (40 changed lines)

@@ -1083,9 +1083,9 @@ enum e_model {
     MODEL_70B,
 };
 
-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kB = 1000;
+static const size_t MB = 1000*kB;
+static const size_t GB = 1000*MB;
 
 struct llama_hparams {
     bool     vocab_only;
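For reference, a small standalone sketch (plain C, with names mirroring the constants in the hunk above) that makes the numeric effect of the 1024 → 1000 switch concrete:

#include <stdio.h>

// Sketch only: the new decimal (SI) constants from the hunk above, next to
// the previous binary values, to show how much the reported unit changes.
int main(void) {
    const unsigned long long kB = 1000, MB = 1000*kB, GB = 1000*MB;       // new definitions
    const unsigned long long KiB = 1024, MiB = 1024*KiB, GiB = 1024*MiB;  // old values (binary)
    printf("GB  = %llu bytes\n", GB);   // 1000000000
    printf("GiB = %llu bytes\n", GiB);  // 1073741824, about 7.4% more than a GB
    return 0;
}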
@@ -1481,7 +1481,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1e6);
         }
     }
 #endif
@@ -2520,9 +2520,9 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
     if (ml.n_bytes < GB) {
-        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size       = %.2f MB (%.2f BPW) \n", __func__, ml.n_bytes/1e6, ml.n_bytes*8.0/ml.n_elements);
     } else {
-        LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+        LLAMA_LOG_INFO("%s: model size       = %.2f GB (%.2f BPW) \n", __func__, ml.n_bytes/1e9, ml.n_bytes*8.0/ml.n_elements);
     }
 
     // general kv
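The size report in llm_load_print_meta now branches on the decimal GB constant defined earlier. A hedged sketch of that pattern (stand-alone C; n_bytes and n_elements are stand-ins for ml.n_bytes and ml.n_elements):

#include <stdio.h>

// Sketch of the post-commit reporting pattern: sizes under one decimal GB are
// printed in MB, anything larger in GB; BPW is bits per weight.
static void print_model_size(double n_bytes, double n_elements) {
    const double GB = 1e9;
    if (n_bytes < GB) {
        printf("model size = %.2f MB (%.2f BPW)\n", n_bytes/1e6, n_bytes*8.0/n_elements);
    } else {
        printf("model size = %.2f GB (%.2f BPW)\n", n_bytes/1e9, n_bytes*8.0/n_elements);
    }
}

int main(void) {
    print_model_size(7.0e9, 3.5e9); // e.g. ~7 GB of weights at ~16 bits per weight
    return 0;
}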
@@ -2558,7 +2558,7 @@ static void llm_load_tensors(
 
     ml.calc_sizes(ctx_size, mmapped_size);
 
-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1e6);
 
     // create the ggml context
     {
@@ -3207,7 +3207,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory
 
-        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required  = %7.2f MB\n", __func__, mem_required / 1e6);
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3226,7 +3226,7 @@ static void llm_load_tensors(
 #endif // GGML_USE_CUBLAS
 
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1e6);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -7878,7 +7878,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_type = tensor->type;
             new_data = tensor->data;
             new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1e6);
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
@@ -7938,7 +7938,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1e6, new_size/1e6);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -7976,8 +7976,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     gguf_free(ctx_out);
 
-    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
-    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1e6);
+    LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1e6);
 
     // print histogram for all tensors
     {
@@ -8478,7 +8478,7 @@ struct llama_context * llama_new_context_with_model(
 
         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1e6);
         }
 
         // resized during inference
@@ -8523,7 +8523,7 @@ struct llama_context * llama_new_context_with_model(
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1e6);
 
             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8537,7 +8537,7 @@ struct llama_context * llama_new_context_with_model(
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1e6);
 
             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8558,9 +8558,9 @@ struct llama_context * llama_new_context_with_model(
             size_t total_vram_size = model_vram_size + ctx_vram_size;
 
             LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
-                    total_vram_size / 1024.0 / 1024.0,
-                    model_vram_size / 1024.0 / 1024.0,
-                    ctx_vram_size / 1024.0 / 1024.0);
+                    total_vram_size / 1e6,
+                    model_vram_size / 1e6,
+                    ctx_vram_size   / 1e6);
 #endif
         }
 
@@ -8581,7 +8581,7 @@ struct llama_context * llama_new_context_with_model(
 
             const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1e6);
 
 #define LLAMA_METAL_CHECK_BUF(result)                            \
             if (!(result)) {                                             \