Mirror of https://github.com/ggml-org/llama.cpp.git
	cuBLAS: fall back to pageable memory if pinned alloc fails (#1233)
* cuBLAS: fall back to pageable memory if pinned alloc fails
* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
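The gist of the change, as a standalone sketch rather than the exact llama.cpp code: try to get pinned host memory first, skip that attempt entirely when GGML_CUDA_NO_PINNED is set, and otherwise fall back to ordinary pageable memory, remembering which allocator was used so the matching free can be called later. The host_buffer/host_alloc/host_free names below are illustrative only; the real implementation lives in ggml_cuda_host_malloc() and llama_ctx_buffer in the diffs that follow.

// Illustrative sketch only, not the llama.cpp implementation.
#include <cuda_runtime.h>
#include <cstdlib>

struct host_buffer {
    void * ptr  = nullptr;
    bool pinned = false;   // true if allocated with cudaMallocHost
};

static host_buffer host_alloc(size_t size) {
    host_buffer buf;
    // Only try pinned memory when the opt-out variable is not set.
    if (getenv("GGML_CUDA_NO_PINNED") == nullptr &&
        cudaMallocHost(&buf.ptr, size) == cudaSuccess) {
        buf.pinned = true;
        return buf;
    }
    // Fall back to pageable memory: host-to-device copies still work,
    // they just cannot use the pinned-memory fast path.
    buf.ptr    = malloc(size);
    buf.pinned = false;
    return buf;
}

static void host_free(host_buffer & buf) {
    if (buf.ptr == nullptr) {
        return;
    }
    if (buf.pinned) {
        cudaFreeHost(buf.ptr);   // must match cudaMallocHost
    } else {
        free(buf.ptr);           // must match malloc
    }
    buf.ptr = nullptr;
}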
ggml-cuda.cu (14 changed lines)
@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
 }
 
 void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
     return ptr;
 }
 
							
								
								
									
llama-util.h (44 changed lines)
@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
+
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
+    }
+
+    ~llama_ctx_buffer() {
+        free();
     }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
llama.cpp (3 changed lines)
@@ -727,8 +727,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
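A note on why this hunk swaps resize() for the sized constructor: once llama_buffer's copy and move operations are deleted (see the llama-util.h diff above), std::vector::resize() no longer compiles, because it requires a MoveInsertable element type, whereas the sized constructor only needs default construction. A minimal illustration with a hypothetical stand-in type (not the real llama_buffer):

#include <vector>

// Stand-in for a buffer type that owns memory and forbids copy/move.
struct buffer {
    buffer() = default;
    buffer(const buffer &) = delete;
    buffer(buffer &&) = delete;
    buffer & operator=(const buffer &) = delete;
    buffer & operator=(buffer &&) = delete;
};

int main() {
    std::vector<buffer> bufs(4);   // OK: elements are default-constructed in place
    // bufs.resize(8);             // error: resize() requires a MoveInsertable type
    return 0;
}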
slaren