#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require

#if USE_SUBGROUP_ADD || USE_SUBGROUP_ADD_NO_SHMEM
#extension GL_KHR_shader_subgroup_basic : require
#extension GL_KHR_shader_subgroup_arithmetic : require
#endif

#ifdef MUL_MAT_ID
#define EXPERT_COUNT 8
#endif

#include "types.glsl"

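// Buffer bindings:
//   binding 0: matrix A (a packed 16-bit view when MMQ is defined)
//   binding 1: matrix/vector B, plus optional vec2/vec4 aliases of the same buffer
//   binding 2: destination D
//   binding 3: expert ids (MUL_MAT_ID only)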
#ifndef MMQ
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
#else
layout (binding = 0) readonly buffer A {A_TYPE_PACKED16 data_a[];};
#endif

layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
#ifdef B_TYPE_VEC2
layout (binding = 1) readonly buffer BV2 {B_TYPE_VEC2 data_b_v2[];};
#endif
#ifdef B_TYPE_VEC4
layout (binding = 1) readonly buffer BV4 {B_TYPE_VEC4 data_b_v4[];};
#endif

layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
#endif

#include "dequant_funcs.glsl"
|
|
|
|
layout (push_constant) uniform parameter
|
|
{
|
|
uint ncols;
|
|
uint stride_a;
|
|
uint stride_b;
|
|
uint stride_d;
|
|
|
|
uint batch_stride_a;
|
|
uint batch_stride_b;
|
|
uint batch_stride_d;
|
|
|
|
#ifdef MUL_MAT_ID
|
|
uint nei0;
|
|
uint ne11;
|
|
#else
|
|
uint ne02;
|
|
uint ne12;
|
|
uint broadcast2;
|
|
uint broadcast3;
|
|
#endif
|
|
} p;
|
|
|
|
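// Compute the base offsets into data_a, data_b and data_d for this invocation.
// Without MUL_MAT_ID, gl_GlobalInvocationID.y is the batch index and
// broadcast2/broadcast3 map it back onto A's (possibly broadcast) batch dims.
// With MUL_MAT_ID, gl_GlobalInvocationID.y selects an expert row via data_ids.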
void get_offsets(out uint a_offset, out uint b_offset, out uint d_offset) {
#ifdef MUL_MAT_ID
    const uint expert_idx = gl_GlobalInvocationID.y;
#else
    const uint batch_idx = gl_GlobalInvocationID.y;
#endif

#ifndef MUL_MAT_ID
    uint batch_idx_a = 0;
    if (batch_idx != 0) {
        const uint i13 = batch_idx / p.ne12;
        const uint i12 = batch_idx % p.ne12;

        const uint i03 = i13 / p.broadcast3;
        const uint i02 = i12 / p.broadcast2;

        batch_idx_a = i03 * p.ne02 + i02;
    }
#else
    const uint expert_id = data_ids[expert_idx];
#endif

    a_offset =
#ifdef MUL_MAT_ID
            expert_id * p.batch_stride_a;
#else
            batch_idx_a * p.batch_stride_a;
#endif
    b_offset =
#ifdef MUL_MAT_ID
            (expert_idx % p.ne11) * p.stride_b;
#else
            batch_idx * p.batch_stride_b;
#endif
    d_offset =
#ifdef MUL_MAT_ID
            expert_idx * p.stride_d;
#else
            batch_idx * p.batch_stride_d;
#endif
}

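// Specialization constants: BLOCK_SIZE is the workgroup size used for the
// reduction; NUM_ROWS and NUM_COLS are how many result rows and columns each
// workgroup accumulates.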
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
layout (constant_id = 1) const uint NUM_ROWS = 1;
layout (constant_id = 2) const uint NUM_COLS = 1;

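// reduce_result() sums each thread's partial dot products and writes the final
// values to data_d. Three variants:
//   - USE_SUBGROUP_ADD_NO_SHMEM: a single subgroupAdd, no shared memory
//   - USE_SUBGROUP_ADD: subgroupAdd within each subgroup, then shared memory
//     to combine the per-subgroup sums
//   - otherwise: a plain shared-memory tree reduction over BLOCK_SIZE threads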
#ifdef USE_SUBGROUP_ADD_NO_SHMEM
void reduce_result(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
            temp[j][n] = subgroupAdd(temp[j][n]);
        }
    }

    if (tid == 0) {
        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]);
            }
        }
    }
}
#else
shared FLOAT_TYPE tmpsh[NUM_COLS][NUM_ROWS][BLOCK_SIZE];

void reduce_result(FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const in uint32_t d_offset, const in uint32_t first_row, const in uint32_t num_rows, const in uint32_t tid) {
    // subgroupAdd is probably faster on devices that support it,
    // particularly when the workgroup has more than one subgroup
#if USE_SUBGROUP_ADD
    // sum up partial sums within a subgroup
    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
            temp[j][n] = subgroupAdd(temp[j][n]);
        }
    }

    // Go through shared memory to sum partials across subgroups
    if (gl_SubgroupInvocationID == 0) {
        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                tmpsh[j][n][gl_SubgroupID] = temp[j][n];
            }
        }
    }
    barrier();
    if (tid == 0) {
        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                temp[j][n] = FLOAT_TYPE(0);
                [[unroll]] for (uint s = 0; s < gl_NumSubgroups; ++s) {
                    temp[j][n] += tmpsh[j][n][s];
                }
                data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(temp[j][n]);
            }
        }
    }
#else
    // sum up partial sums and write back result
    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
            tmpsh[j][n][tid] = temp[j][n];
        }
    }
    barrier();
    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
        if (tid < s) {
            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
                [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                    tmpsh[j][n][tid] += tmpsh[j][n][tid + s];
                }
            }
        }
        barrier();
    }
    if (tid == 0) {
        [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                data_d[j*p.batch_stride_d + d_offset + first_row + n] = D_TYPE(tmpsh[j][n][0]);
            }
        }
    }
#endif
}
#endif
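
// Illustrative sketch (not part of this file; names and mappings below are
// assumptions): a consuming compute shader would typically accumulate its
// per-thread partial sums into temp[NUM_COLS][NUM_ROWS] and then hand them to
// the helpers above, roughly like:
//
//   void main() {
//       const uint tid = gl_LocalInvocationID.x;            // assumes BLOCK_SIZE threads in x
//       const uint first_row = gl_WorkGroupID.x * NUM_ROWS; // hypothetical row mapping
//
//       uint a_offset, b_offset, d_offset;
//       get_offsets(a_offset, b_offset, d_offset);
//
//       FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
//       // ... initialize temp to 0, loop over p.ncols, dequantize data_a and
//       // multiply-accumulate against data_b into temp ...
//
//       reduce_result(temp, d_offset, first_row, NUM_ROWS, tid);
//   }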