mirror of https://github.com/ggml-org/llama.cpp.git
Vulkan: Add Integer Dot Product mul_mat_vec shader for legacy quants (#14903)
* vulkan: Add Integer Dot Product mul_mat_vec shader for legacy quants
* vulkan: use subgroup operations for quantize_q8_1 shader
* vulkan: add q8_1_x4 type with 128-bit alignment, use in mul_mat_vecq shader
* vulkan: use q8_1_x4 blocks in mul_mmq shader
* vulkan: do 8 calculations per invocation instead of 32 in mul_mat_vecq, similar to mul_mat_vec
* vulkan: tune mul_mat_vecq performance for Intel
* vulkan: fix quantizing issue when tensor is not divisible by 128
* vulkan: adapt integer dot mmv to mmv small m optimization (#15355)
* vulkan: allow all subgroup modes for mmv and mmvq
* vulkan: use prealloc intermediate reuse for mmvq path
* vulkan: tune mmvq for Intel, AMD GCN and Nvidia RTX 3090
* vulkan: adapt mmv quantize_y path to conditional sync logic
* vulkan: disable q8_0 mmvq on Nvidia
* vulkan: enable q8_0 on Nvidia pre-turing
* fix prealloc sync condition
* fix llvmpipe subgroup 8 issue
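For context on the q8_1_x4 item above: a q8_1 block holds 32 int8 quants plus an f16 scale/sum pair, and fusing four such blocks gives the shader 128-bit-aligned loads. The actual definition lives in types.comp and is not part of the excerpt below; this is only a sketch of the idea, with the struct name and field order assumed rather than copied from the commit:

    // Hedged sketch (GLSL): four q8_1 blocks fused into one 128-bit-aligned struct.
    // Grouping the (d, m) pairs up front keeps the int8 payload on a 16-byte
    // boundary, so the quantized activations can be read with aligned vector loads.
    struct block_q8_1_x4 {
        f16vec2 ds[4];      // (scale d, sum m) for each of the four sub-blocks
        int32_t qs[32];     // 4 sub-blocks x 32 int8 quants, packed 4 per int32
    };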
@@ -1,7 +1,8 @@
 #extension GL_EXT_control_flow_attributes : enable
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_shader_8bit_storage : require
 
-#if USE_SUBGROUP_ADD
+#if USE_SUBGROUP_ADD || USE_SUBGROUP_ADD_NO_SHMEM
+#extension GL_KHR_shader_subgroup_basic : require
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #endif
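The widened #if above lets either subgroup mode pull in the KHR subgroup extensions: GL_KHR_shader_subgroup_basic supplies built-ins such as gl_SubgroupSize and gl_SubgroupInvocationID, while GL_KHR_shader_subgroup_arithmetic supplies subgroupAdd, which the new no-shared-memory reduction further down relies on. A minimal illustrative fragment (not part of the patch):

    #extension GL_KHR_shader_subgroup_basic : require      // gl_SubgroupSize, gl_SubgroupInvocationID, ...
    #extension GL_KHR_shader_subgroup_arithmetic : require // subgroupAdd, subgroupMin, ...

    float subgroup_total(float partial) {
        // Every invocation receives the sum over the whole subgroup; no shared
        // memory and no barrier() are needed, which is exactly what the
        // USE_SUBGROUP_ADD_NO_SHMEM path exploits.
        return subgroupAdd(partial);
    }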
@@ -12,10 +13,19 @@
 
 #include "types.comp"
 
+#ifndef MMQ
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+#else
+layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
+#endif
+
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+#ifdef B_TYPE_VEC2
 layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
+#endif
+#ifdef B_TYPE_VEC4
 layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
+#endif
 
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 #ifdef MUL_MAT_ID
@@ -92,6 +102,23 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 layout (constant_id = 1) const uint NUM_ROWS = 1;
 layout (constant_id = 2) const uint NUM_COLS = 1;
 
+#ifdef USE_SUBGROUP_ADD_NO_SHMEM
+void reduce_result(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+            temp[j][n] = subgroupAdd(temp[j][n]);
+        }
+    }
+
+    if (tid == 0) {
+        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]);
+            }
+        }
+    }
+}
+#else
 shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];
 
 void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
@@ -152,3 +179,4 @@ void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offs
     }
 #endif
 }
+#endif
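The #endif added at the end of the file closes the new #ifdef USE_SUBGROUP_ADD_NO_SHMEM / #else pair, so the pre-existing shared-memory reduce_result (presumably in mul_mat_vec_base.comp; its body is elided by the hunks above) becomes the fallback for devices where the subgroup path is disabled. For readers without the file at hand, a typical workgroup tree reduction over tmpsh looks like the sketch below; the helper name and exact loop structure are assumptions, not code from the commit:

    // Hedged sketch of the shared-memory fallback: each invocation parks its
    // partial sums in tmpsh, then half the threads repeatedly fold the upper
    // half onto the lower half until thread 0 holds the workgroup total.
    void reduce_result_shmem_sketch(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                tmpsh[j][n][tid] = temp[j][n];
            }
        }
        barrier();
        for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
            if (tid < s) {
                [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
                    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                        tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
                    }
                }
            }
            barrier();
        }
        if (tid == 0) {
            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
                [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                    data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
                }
            }
        }
    }

Compared with this fallback, the subgroupAdd variant above needs no shared memory and no barrier() calls, which is why the commit message tunes and selectively enables it per vendor (and works around a subgroup-size-8 issue on llvmpipe).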