mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-11 10:36:54 +00:00
vulkan: 64-bit im2col (#16135)
* vulkan: 64-bit im2col Add variants of the im2col shaders that use buffer_device_address/buffer_reference, and use 64-bit address calculations. This is needed for large convolutions used in stable-diffusion.cpp. * fix validation error for large im2col
This commit is contained in:
@@ -5,8 +5,11 @@
|
||||
|
||||
#include "rte.comp"
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
BDA_STORAGE_T dst_addr;
|
||||
uint batch_offset; uint offset_delta;
|
||||
uint IC;
|
||||
uint IW; uint IH;
|
||||
@@ -19,8 +22,6 @@ layout (push_constant) uniform parameter
|
||||
int d0; int d1;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout(constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||
|
||||
const uint NUM_ITER = 512 / BLOCK_SIZE;
|
||||
@@ -30,6 +31,10 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#if BDA
|
||||
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
const uint gidx = gl_GlobalInvocationID.x;
|
||||
|
||||
@@ -38,7 +43,7 @@ void main() {
|
||||
const uint ic = gl_GlobalInvocationID.z % p.IC;
|
||||
|
||||
const uint src_base = ic * p.offset_delta + batch * p.batch_offset;
|
||||
const uint dst_base = ((batch * p.OH + oh) * p.OW) * p.CHW + ic * (p.KW * p.KH);
|
||||
const BDA_OFFSET_T dst_base = ((BDA_OFFSET_T(batch) * p.OH + oh) * p.OW) * p.CHW + BDA_OFFSET_T(ic) * (p.KW * p.KH);
|
||||
const int oh_s1 = int(oh) * p.s1;
|
||||
const uint ksize = p.OW * p.KH;
|
||||
|
||||
@@ -50,7 +55,7 @@ void main() {
|
||||
uint current_ix = rem % p.OW;
|
||||
|
||||
A_TYPE values[NUM_ITER];
|
||||
uint offset_dst[NUM_ITER];
|
||||
BDA_OFFSET_T offset_dst[NUM_ITER];
|
||||
[[unroll]] for (uint idx = 0; idx < NUM_ITER; ++idx) {
|
||||
values[idx] = A_TYPE(0);
|
||||
}
|
||||
@@ -66,7 +71,7 @@ void main() {
|
||||
const uint iiw = current_ix * p.s0 + current_kx * p.d0 - p.p0;
|
||||
const uint iih = oh_s1 + current_ky * p.d1 - p.p1;
|
||||
|
||||
offset_dst[idx] = dst_base + current_ix * p.CHW + current_ky * p.KW + current_kx;
|
||||
offset_dst[idx] = dst_base + BDA_OFFSET_T(current_ix) * p.CHW + current_ky * p.KW + current_kx;
|
||||
|
||||
if ((iih < p.IH) && (iiw < p.IW)) {
|
||||
values[idx] = data_a[src_base + iih * p.IW + iiw];
|
||||
@@ -89,7 +94,11 @@ void main() {
|
||||
continue;
|
||||
}
|
||||
|
||||
#if BDA
|
||||
D_ptr dst_addr = D_ptr(p.dst_addr + D_SIZE * offset_dst[idx]);
|
||||
dst_addr.d = D_TYPE(values[idx]);
|
||||
#else
|
||||
data_d[offset_dst[idx]] = D_TYPE(values[idx]);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -6,8 +6,11 @@
|
||||
|
||||
#include "rte.comp"
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
layout (push_constant) uniform parameter
|
||||
{
|
||||
BDA_STORAGE_T dst_addr;
|
||||
uint32_t nb10;
|
||||
uint32_t nb11;
|
||||
uint32_t nb12;
|
||||
@@ -38,8 +41,6 @@ layout (push_constant) uniform parameter
|
||||
uint32_t misalign_offsets;
|
||||
} p;
|
||||
|
||||
#include "types.comp"
|
||||
|
||||
uint get_aoffset() { return p.misalign_offsets >> 16; }
|
||||
uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
|
||||
|
||||
@@ -50,6 +51,10 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
#if BDA
|
||||
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
const uint32_t i = gl_GlobalInvocationID.x;
|
||||
|
||||
@@ -100,13 +105,22 @@ void main() {
|
||||
const uint32_t iih = ioh * s1 + ikh * d1 - p1;
|
||||
const uint32_t iid = iod * s2 + ikd * d2 - p2;
|
||||
|
||||
const uint32_t offset_dst = in_*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
|
||||
const BDA_OFFSET_T offset_dst = BDA_OFFSET_T(in_)*OD_OH_OW_IC_KD_KH_KW + BDA_OFFSET_T(iod)*OH_OW_IC_KD_KH_KW + BDA_OFFSET_T(ioh)*OW_IC_KD_KH_KW + BDA_OFFSET_T(iow)*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
|
||||
|
||||
const uint32_t offset_src = (in_*IC + iic)*nb13 + iid*nb12 + iih*nb11 + iiw*nb10;
|
||||
#if BDA
|
||||
D_ptr dst_addr = D_ptr(p.dst_addr + D_SIZE * offset_dst);
|
||||
if (iih >= IH || iiw >= IW || iid >= ID) {
|
||||
dst_addr.d = D_TYPE(0.0f);
|
||||
} else {
|
||||
dst_addr.d = D_TYPE(data_a[offset_src + get_aoffset()]);
|
||||
}
|
||||
#else
|
||||
if (iih >= IH || iiw >= IW || iid >= ID) {
|
||||
data_d[offset_dst + get_doffset()] = D_TYPE(0.0f);
|
||||
} else {
|
||||
const uint32_t offset_src = (in_*IC + iic)*nb13 + iid*nb12 + iih*nb11 + iiw*nb10;
|
||||
data_d[offset_dst + get_doffset()] = D_TYPE(data_a[offset_src + get_aoffset()]);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1447,4 +1447,19 @@ float e8m0_to_fp32(uint8_t x) {
|
||||
return uintBitsToFloat(bits);
|
||||
}
|
||||
|
||||
#if BDA
|
||||
|
||||
#extension GL_EXT_buffer_reference : enable
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int64 : enable
|
||||
|
||||
#define BDA_STORAGE_T uint64_t
|
||||
#define BDA_OFFSET_T uint64_t
|
||||
|
||||
#else
|
||||
|
||||
#define BDA_STORAGE_T uvec2
|
||||
#define BDA_OFFSET_T uint
|
||||
|
||||
#endif
|
||||
|
||||
#endif // !defined(GGML_TYPES_COMP)
|
||||
|
||||
@@ -775,13 +775,15 @@ void process_shaders() {
|
||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("count_equal_i32", "count_equal.comp", merge_maps(base_dict, {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}));
|
||||
|
||||
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||
string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
|
||||
|
||||
string_to_spv("im2col_3d_f32", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
string_to_spv("im2col_3d_f32_f16", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||
string_to_spv("im2col_3d_f32_f16_rte", "im2col_3d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}));
|
||||
for (std::string dim_str : {"", "_3d"}) {
|
||||
for (bool bda : {false, true}) {
|
||||
std::string bda_str = bda ? "_bda" : "";
|
||||
std::string bda_def = bda ? "1" : "0";
|
||||
string_to_spv("im2col" + dim_str + "_f32" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"D_SIZE", "4"}, {"BDA", bda_def}}));
|
||||
string_to_spv("im2col" + dim_str + "_f32_f16" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"D_SIZE", "2"}, {"BDA", bda_def}}));
|
||||
string_to_spv("im2col" + dim_str + "_f32_f16_rte" + bda_str, "im2col" + dim_str + ".comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"D_SIZE", "2"}, {"RTE16", "1"}, {"BDA", bda_def}}));
|
||||
}
|
||||
}
|
||||
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user