mirror of https://github.com/ggml-org/llama.cpp.git
	cuda : improve cuda pool efficiency using virtual memory (#4606)
* cuda : improve cuda pool efficiency using virtual memory
* fix mixtral
* fix cmake build
* check for vmm support, disable for hip
  ggml-ci
* fix hip build
* clarify granularity
* move all caps to g_device_caps
* refactor error checking
* add cuda_pool_alloc, refactor most pool allocations
  ggml-ci
* fix hip build
* CUBLAS_TF32_TENSOR_OP_MATH is not a macro
* more hip crap
* llama : fix msvc warnings
* ggml : fix msvc warnings
* minor
* minor
* cuda : fallback to CPU on host buffer alloc fail
* Update ggml-cuda.cu
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Update ggml-cuda.cu
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* ensure allocations are always aligned
* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
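The core of the change, per the title, is a device memory pool built on the CUDA driver's virtual memory management API: a large virtual address range is reserved up front, and physical memory is created and mapped into it only as the pool grows, so the pool can expand in place without freeing and copying existing allocations. The sketch below illustrates that pattern under simplifying assumptions (single device, asserts instead of real error handling); pool_init, pool_grow, and the globals are illustrative names, not the actual ggml-cuda symbols.

// A minimal sketch (not the actual ggml-cuda code) of a growable device
// pool backed by the CUDA virtual memory API. Assumes the driver API is
// initialized (cuInit, context created) and a single device is used.
#include <cuda.h>
#include <cassert>

#define CU_CHECK(x) do { CUresult err_ = (x); assert(err_ == CUDA_SUCCESS); } while (0)

static CUdeviceptr pool_base   = 0; // reserved virtual address range
static size_t pool_mapped      = 0; // bytes currently backed by physical memory
static size_t pool_granularity = 0; // minimum mapping granularity for the device

void pool_init(CUdevice dev, size_t max_size) {
    CUmemAllocationProp prop = {};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = dev;
    CU_CHECK(cuMemGetAllocationGranularity(&pool_granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    // Reserve address space only -- no physical memory is committed yet.
    CU_CHECK(cuMemAddressReserve(&pool_base, max_size, 0, 0, 0));
}

// Commit more physical memory and map it at the end of the reserved range.
// Pointers previously handed out from the pool remain valid, which is what a
// cudaMalloc-based pool cannot offer without over-allocating or copying.
void pool_grow(CUdevice dev, size_t extra) {
    extra = (extra + pool_granularity - 1) / pool_granularity * pool_granularity;
    CUmemAllocationProp prop = {};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = dev;
    CUmemGenericAllocationHandle handle;
    CU_CHECK(cuMemCreate(&handle, extra, &prop, 0));
    CU_CHECK(cuMemMap(pool_base + pool_mapped, extra, 0, handle, 0));
    CUmemAccessDesc access = {};
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id   = dev;
    access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CU_CHECK(cuMemSetAccess(pool_base + pool_mapped, extra, &access, 1));
    pool_mapped += extra;
}

The commit message's "check for vmm support, disable for hip" reflects that these cuMem* entry points have to be probed at runtime and were not used on the HIP build.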
@@ -1281,7 +1281,7 @@ struct llama_hparams {
         if (this->rope_finetuned  != other.rope_finetuned)  return true;
         if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
 
-        const float EPSILON = 1e-9;
+        const float EPSILON = 1e-9f;
 
         if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
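The hunk above is one of the "llama : fix msvc warnings" items from the commit message: 1e-9 is a double literal, so initializing a float from it narrows double to float, which MSVC reports as warning C4305. The f suffix makes the constant a float and removes the implicit conversion:

const float a = 1e-9;  // double constant narrowed to float (MSVC C4305)
const float b = 1e-9f; // float constant, no narrowing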
@@ -10300,7 +10300,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                 std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
                 if (length < (int) result.length()) {
-                    return -result.length();
+                    return -(int) result.length();
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
@@ -10330,7 +10330,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
                 std::string result = model->vocab.id_to_token[token].text;
                 result = llama_decode_text(result);
                 if (length < (int) result.length()) {
-                    return -result.length();
+                    return -(int) result.length();
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
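The remaining hunks fix the other MSVC warning: result.length() is a size_t, and unary minus applied to an unsigned type still yields an unsigned value (MSVC warning C4146); the intended negative number only appears after the implicit conversion back to int. Casting to int before negating makes the arithmetic signed and explicit. As the hunks show, the negative return is part of the function's contract: when the buffer is too small, the negated required size comes back, so a caller can size the buffer in a single retry. A hedged usage sketch of that convention, assuming the llama.cpp API of this era:

#include <string>
#include <vector>
#include "llama.h"

// Convert a token to its text piece, growing the buffer on demand. A
// negative return from llama_token_to_piece means "buffer too small";
// its magnitude is the required size.
static std::string token_to_piece(const llama_model * model, llama_token token) {
    std::vector<char> buf(8);
    int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
    if (n < 0) {
        buf.resize(-n); // -n is the size the first call asked for
        n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
    }
    return std::string(buf.data(), n);
}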
Author: slaren