Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-04 09:32:00 +00:00)
			
		
		
		
	Vulkan: RTE rounding for cpy to quant (#12480)
* Vulkan: RTE rounding for cpy to quant

  Co-Authored-By: Jeff Bolz <jbolz@nvidia.com>

* remove trailing whitespace
* avoid duplicating pipeline_cpy_f32_quant
* fix copypasting issue
* remove duplicated code

---------

Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
This commit is contained in:
		@@ -1,5 +1,10 @@
 | 
			
		||||
#version 450
 | 
			
		||||
 | 
			
		||||
#if RTE16
 | 
			
		||||
#extension GL_EXT_spirv_intrinsics : enable
 | 
			
		||||
spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits
 | 
			
		||||
#endif // RTE16
 | 
			
		||||
 | 
			
		||||
#include "types.comp"
 | 
			
		||||
#include "generic_unary_head.comp"
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -445,6 +445,7 @@ void process_shaders() {
 | 
			
		||||
 | 
			
		||||
    for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
 | 
			
		||||
        string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 | 
			
		||||
        string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
 | 
			
		||||
        string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user