mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	Vulkan: VK_KHR_cooperative_matrix support to speed up prompt processing (#10597)
* Vulkan: Implement VK_KHR_cooperative_matrix support in the matrix-matrix multiplication shader
* Improve performance with better q4_k and q5_k dequant and store unrolling
* Add Vulkan MUL_MAT and MUL_MAT_ID accumulator precision selection
* Rework mulmat shader selection and compilation logic, avoid compiling shaders that won't get used by device
* Vulkan: Implement accumulator switch for specific mul mat mat shaders
* Vulkan: Unroll more loops for more mul mat mat performance
* Vulkan: Add VK_AMD_shader_core_properties2 support to read Compute Unit count for split_k logic
* Disable coopmat support on AMD proprietary driver
* Remove redundant checks
* Add environment variable GGML_VK_DISABLE_COOPMAT to disable VK_KHR_cooperative_matrix support
* Fix rebase typo
* Fix coopmat2 MUL_MAT_ID pipeline selection
This commit is contained in:
		| @@ -60,6 +60,7 @@ const std::vector<std::string> type_names = { | ||||
|     "iq4_nl" | ||||
| }; | ||||
|  | ||||
| namespace { | ||||
| void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) { | ||||
| #ifdef _WIN32 | ||||
|     HANDLE stdout_read, stdout_write; | ||||
| @@ -198,8 +199,8 @@ static uint32_t compile_count = 0; | ||||
| static std::mutex compile_count_mutex; | ||||
| static std::condition_variable compile_count_cond; | ||||
|  | ||||
| void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { | ||||
|     std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); | ||||
| void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) { | ||||
|     std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat ? "_coopmat" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); | ||||
|     std::string out_fname = join_paths(output_dir, name + ".spv"); | ||||
|     std::string in_path = join_paths(input_dir, in_fname); | ||||
|  | ||||
| @@ -258,7 +259,7 @@ std::map<std::string, std::string> merge_maps(const std::map<std::string, std::s | ||||
| } | ||||
|  | ||||
| static std::vector<std::future<void>> compiles; | ||||
| void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { | ||||
| void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) { | ||||
|     { | ||||
|         // wait until fewer than N compiles are in progress. | ||||
|         // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors. | ||||
| @@ -269,10 +270,10 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const | ||||
|         } | ||||
|         compile_count++; | ||||
|     } | ||||
|     compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat2, f16acc)); | ||||
|     compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat, coopmat2, f16acc)); | ||||
| } | ||||
|  | ||||
| void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { | ||||
| void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool f16acc) { | ||||
|     std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; | ||||
|     std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; | ||||
|     std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; | ||||
| @@ -291,14 +292,20 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { | ||||
|  | ||||
|     base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; | ||||
|  | ||||
|     if (coopmat) { | ||||
|         base_dict["COOPMAT"] = "1"; | ||||
|     } | ||||
|  | ||||
|     base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; | ||||
|  | ||||
|     std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp"; | ||||
|  | ||||
|     // Shaders with f16 B_TYPE | ||||
|     string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat2, f16acc); | ||||
|     string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); | ||||
|     string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); | ||||
|     string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); | ||||
|  | ||||
|     string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); | ||||
|     string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat2, f16acc); | ||||
|     string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); | ||||
|     string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); | ||||
|  | ||||
|     for (const auto& tname : type_names) { | ||||
|         std::string data_a_key = "DATA_A_" + to_uppercase(tname); | ||||
| @@ -307,12 +314,12 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { | ||||
|         // For aligned matmul loads | ||||
|         std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2"; | ||||
|  | ||||
|         string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); | ||||
|         string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); | ||||
|         string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc); | ||||
|         string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); | ||||
|  | ||||
|         if (tname != "f16" && tname != "f32") { | ||||
|             string_to_spv(shader_name + "_" + tname + "_f16", source_name,          merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                           {"B_TYPE", "float16_t"},        {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); | ||||
|             string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name,  merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); | ||||
|             string_to_spv(shader_name + "_" + tname + "_f16", source_name,          merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                           {"B_TYPE", "float16_t"},        {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc); | ||||
|             string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name,  merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -322,25 +329,24 @@ void process_shaders() { | ||||
|     std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}}; | ||||
|  | ||||
|     // matmul | ||||
|     for (const auto& fp16 : {false, true}) { | ||||
|         for (const auto& matmul_id : {false, true}) { | ||||
|             for (const auto& coopmat2 : {false, true}) { | ||||
|                 for (const auto& f16acc : {false, true}) { | ||||
| #if !defined(VK_NV_cooperative_matrix2) | ||||
|                     if (coopmat2) { | ||||
|                         continue; | ||||
|                     } | ||||
|     for (const auto& matmul_id : {false, true}) { | ||||
|         // No coopmats | ||||
|         // fp32 | ||||
|         matmul_shaders(false, matmul_id, false, false, false); | ||||
|  | ||||
|         // fp16, fp32acc and fp16acc | ||||
|         matmul_shaders(true, matmul_id, false, false, false); | ||||
|         matmul_shaders(true, matmul_id, false, false, true); | ||||
|  | ||||
|         // Coopmat, fp32acc and fp16acc | ||||
|         matmul_shaders(true, matmul_id, true, false, false); | ||||
|         matmul_shaders(true, matmul_id, true, false, true); | ||||
|  | ||||
| #if defined(VK_NV_cooperative_matrix2) | ||||
|         // Coopmat2, fp32acc and fp16acc | ||||
|         matmul_shaders(true, matmul_id, false, true, false); | ||||
|         matmul_shaders(true, matmul_id, false, true, true); | ||||
| #endif | ||||
|                     if (coopmat2 && !fp16) { | ||||
|                         continue; | ||||
|                     } | ||||
|                     if (!coopmat2 && f16acc) { | ||||
|                         continue; | ||||
|                     } | ||||
|                     matmul_shaders(fp16, matmul_id, coopmat2, f16acc); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
| #if defined(VK_NV_cooperative_matrix2) | ||||
| @@ -355,11 +361,11 @@ void process_shaders() { | ||||
|  | ||||
|             if (tname == "f16") { | ||||
|                 string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", | ||||
|                     merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, true, f16acc); | ||||
|                     merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, true, f16acc); | ||||
|             } else { | ||||
|                 std::string data_a_key = "DATA_A_" + to_uppercase(tname); | ||||
|                 string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", | ||||
|                     merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, true, f16acc); | ||||
|                     merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| @@ -524,6 +530,7 @@ void write_output_files() { | ||||
|     fclose(hdr); | ||||
|     fclose(src); | ||||
| } | ||||
| } | ||||
|  | ||||
| int main(int argc, char** argv) { | ||||
|     std::map<std::string, std::string> args; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 0cc4m
					0cc4m