mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
vulkan: Use larger loads in scalar/coopmat1 matmul (#15729)
I think glslang will translate an access like x[i][1].z to OpAccessChain ... x, i, 1, 2 OpLoad float16_t ... rather than loading all of x[i] in a single OpLoad. Change the code to explicitly load the vector/matrix.
This commit is contained in:
@@ -315,21 +315,23 @@ void main() {
|
|||||||
#if LOAD_VEC_A == 8
|
#if LOAD_VEC_A == 8
|
||||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||||
buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx][0].x);
|
A_TYPE32 aa = A_TYPE32(data_a[idx]);
|
||||||
buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx][0].y);
|
buf_a[buf_idx ] = FLOAT_TYPE(aa[0].x);
|
||||||
buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx][0].z);
|
buf_a[buf_idx + 1] = FLOAT_TYPE(aa[0].y);
|
||||||
buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx][0].w);
|
buf_a[buf_idx + 2] = FLOAT_TYPE(aa[0].z);
|
||||||
buf_a[buf_idx + 4] = FLOAT_TYPE(data_a[idx][1].x);
|
buf_a[buf_idx + 3] = FLOAT_TYPE(aa[0].w);
|
||||||
buf_a[buf_idx + 5] = FLOAT_TYPE(data_a[idx][1].y);
|
buf_a[buf_idx + 4] = FLOAT_TYPE(aa[1].x);
|
||||||
buf_a[buf_idx + 6] = FLOAT_TYPE(data_a[idx][1].z);
|
buf_a[buf_idx + 5] = FLOAT_TYPE(aa[1].y);
|
||||||
buf_a[buf_idx + 7] = FLOAT_TYPE(data_a[idx][1].w);
|
buf_a[buf_idx + 6] = FLOAT_TYPE(aa[1].z);
|
||||||
|
buf_a[buf_idx + 7] = FLOAT_TYPE(aa[1].w);
|
||||||
#elif LOAD_VEC_A == 4
|
#elif LOAD_VEC_A == 4
|
||||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||||
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
||||||
buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx].x);
|
A_TYPE32 aa = A_TYPE32(data_a[idx]);
|
||||||
buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx].y);
|
buf_a[buf_idx ] = FLOAT_TYPE(aa.x);
|
||||||
buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx].z);
|
buf_a[buf_idx + 1] = FLOAT_TYPE(aa.y);
|
||||||
buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx].w);
|
buf_a[buf_idx + 2] = FLOAT_TYPE(aa.z);
|
||||||
|
buf_a[buf_idx + 3] = FLOAT_TYPE(aa.w);
|
||||||
#else
|
#else
|
||||||
if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
|
if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
|
||||||
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
|
buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
|
||||||
@@ -808,14 +810,19 @@ void main() {
|
|||||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||||
#endif
|
#endif
|
||||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||||
buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx][0].x);
|
#if defined(DATA_B_BF16)
|
||||||
buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx][0].y);
|
B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
|
||||||
buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx][0].z);
|
#else
|
||||||
buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx][0].w);
|
B_TYPE32 bb = B_TYPE32(data_b[idx]);
|
||||||
buf_b[buf_idx + 4] = FLOAT_TYPE(data_b[idx][1].x);
|
#endif
|
||||||
buf_b[buf_idx + 5] = FLOAT_TYPE(data_b[idx][1].y);
|
buf_b[buf_idx + 0] = FLOAT_TYPE(bb[0].x);
|
||||||
buf_b[buf_idx + 6] = FLOAT_TYPE(data_b[idx][1].z);
|
buf_b[buf_idx + 1] = FLOAT_TYPE(bb[0].y);
|
||||||
buf_b[buf_idx + 7] = FLOAT_TYPE(data_b[idx][1].w);
|
buf_b[buf_idx + 2] = FLOAT_TYPE(bb[0].z);
|
||||||
|
buf_b[buf_idx + 3] = FLOAT_TYPE(bb[0].w);
|
||||||
|
buf_b[buf_idx + 4] = FLOAT_TYPE(bb[1].x);
|
||||||
|
buf_b[buf_idx + 5] = FLOAT_TYPE(bb[1].y);
|
||||||
|
buf_b[buf_idx + 6] = FLOAT_TYPE(bb[1].z);
|
||||||
|
buf_b[buf_idx + 7] = FLOAT_TYPE(bb[1].w);
|
||||||
#elif LOAD_VEC_B == 4
|
#elif LOAD_VEC_B == 4
|
||||||
#ifdef MUL_MAT_ID
|
#ifdef MUL_MAT_ID
|
||||||
const u16vec2 row_idx = row_ids[loadc_b + l];
|
const u16vec2 row_idx = row_ids[loadc_b + l];
|
||||||
@@ -824,10 +831,15 @@ void main() {
|
|||||||
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
|
||||||
#endif
|
#endif
|
||||||
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
|
||||||
buf_b[buf_idx + 0] = TO_FLOAT_TYPE(data_b[idx].x);
|
#if defined(DATA_B_BF16)
|
||||||
buf_b[buf_idx + 1] = TO_FLOAT_TYPE(data_b[idx].y);
|
B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
|
||||||
buf_b[buf_idx + 2] = TO_FLOAT_TYPE(data_b[idx].z);
|
#else
|
||||||
buf_b[buf_idx + 3] = TO_FLOAT_TYPE(data_b[idx].w);
|
B_TYPE32 bb = B_TYPE32(data_b[idx]);
|
||||||
|
#endif
|
||||||
|
buf_b[buf_idx + 0] = FLOAT_TYPE(bb.x);
|
||||||
|
buf_b[buf_idx + 1] = FLOAT_TYPE(bb.y);
|
||||||
|
buf_b[buf_idx + 2] = FLOAT_TYPE(bb.z);
|
||||||
|
buf_b[buf_idx + 3] = FLOAT_TYPE(bb.w);
|
||||||
#elif !MUL_MAT_ID
|
#elif !MUL_MAT_ID
|
||||||
if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) {
|
if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) {
|
||||||
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]);
|
buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]);
|
||||||
|
|||||||
@@ -13,10 +13,13 @@
|
|||||||
|
|
||||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||||
#define A_TYPE float
|
#define A_TYPE float
|
||||||
|
#define A_TYPE32 float
|
||||||
#elif LOAD_VEC_A == 4
|
#elif LOAD_VEC_A == 4
|
||||||
#define A_TYPE vec4
|
#define A_TYPE vec4
|
||||||
|
#define A_TYPE32 vec4
|
||||||
#elif LOAD_VEC_A == 8
|
#elif LOAD_VEC_A == 8
|
||||||
#define A_TYPE mat2x4
|
#define A_TYPE mat2x4
|
||||||
|
#define A_TYPE32 mat2x4
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -26,10 +29,13 @@
|
|||||||
|
|
||||||
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||||
#define A_TYPE float16_t
|
#define A_TYPE float16_t
|
||||||
|
#define A_TYPE32 float
|
||||||
#elif LOAD_VEC_A == 4
|
#elif LOAD_VEC_A == 4
|
||||||
#define A_TYPE f16vec4
|
#define A_TYPE f16vec4
|
||||||
|
#define A_TYPE32 vec4
|
||||||
#elif LOAD_VEC_A == 8
|
#elif LOAD_VEC_A == 8
|
||||||
#define A_TYPE f16mat2x4
|
#define A_TYPE f16mat2x4
|
||||||
|
#define A_TYPE32 mat2x4
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -1424,6 +1430,11 @@ float bf16_to_fp32(uint32_t u)
|
|||||||
return uintBitsToFloat(u << 16);
|
return uintBitsToFloat(u << 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vec4 bf16_to_fp32(uvec4 u)
|
||||||
|
{
|
||||||
|
return vec4(bf16_to_fp32(u.x), bf16_to_fp32(u.y), bf16_to_fp32(u.z), bf16_to_fp32(u.w));
|
||||||
|
}
|
||||||
|
|
||||||
float e8m0_to_fp32(uint8_t x) {
|
float e8m0_to_fp32(uint8_t x) {
|
||||||
uint32_t bits;
|
uint32_t bits;
|
||||||
|
|
||||||
|
|||||||
@@ -364,11 +364,11 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Shaders with f16 B_TYPE
|
// Shaders with f16 B_TYPE
|
||||||
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
|
||||||
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
|
|
||||||
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
|
|
||||||
// bf16
|
// bf16
|
||||||
{
|
{
|
||||||
@@ -384,8 +384,8 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
|||||||
if (!(coopmat || coopmat2))
|
if (!(coopmat || coopmat2))
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"}, {"B_TYPE32", "vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
string_to_spv(shader_name + "_bf16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_bf16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -408,13 +408,13 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
|||||||
|
|
||||||
// don't generate f32 variants for coopmat2
|
// don't generate f32 variants for coopmat2
|
||||||
if (!coopmat2) {
|
if (!coopmat2) {
|
||||||
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tname != "f16" && tname != "f32") {
|
if (tname != "f16" && tname != "f32") {
|
||||||
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
|
||||||
|
|||||||
Reference in New Issue
Block a user