	initial implementation of delayed graph allocation
		@@ -1752,6 +1752,8 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend * backend, const ggm
 
     //ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
+    //printf("get tensor %s %p\n", tensor->name, tensor->data);
+
     CUDA_CHECK(cudaMemcpyAsync(data, (const char*)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStream_main));
 
     UNUSED(backend);
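For context, below is a minimal, self-contained sketch of the copy-and-synchronize pattern that this get_tensor_async path relies on. It is illustrative only, not code from this commit: the CUDA_CHECK macro is a local stand-in for the one in ggml-cuda, and a locally created stream plays the role of g_cudaStream_main. The point is that cudaMemcpyAsync with cudaMemcpyDeviceToHost merely enqueues the copy on the stream; the host buffer is not safe to read until the stream has been synchronized.

// Minimal sketch (illustrative, not from this commit): asynchronous
// device-to-host copy on a CUDA stream, synchronized before the host
// buffer is read. CUDA_CHECK below is a local stand-in for the macro
// in ggml-cuda; "stream" stands in for g_cudaStream_main.
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
            exit(1);                                                    \
        }                                                               \
    } while (0)

int main(void) {
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    const size_t size = 16 * sizeof(float);
    float * host = (float *) malloc(size);
    void  * dev  = NULL;
    CUDA_CHECK(cudaMalloc(&dev, size));
    CUDA_CHECK(cudaMemsetAsync(dev, 0, size, stream));

    // Same pattern as get_tensor_async: enqueue the D2H copy on the stream...
    CUDA_CHECK(cudaMemcpyAsync(host, dev, size, cudaMemcpyDeviceToHost, stream));
    // ...and synchronize before the host reads the destination buffer.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    printf("first element after copy: %f\n", host[0]);

    CUDA_CHECK(cudaFree(dev));
    free(host);
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}

In the backend itself the synchronization is not performed inside the copy call; it is presumably left to the backend's synchronize hook so that multiple transfers can overlap. The sketch collapses the two steps for clarity.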