mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	vulkan : implement Stable Diffusion operators (ggml/904)
* Fix Vulkan repeat op * Implement Vulkan concat op * Delete old Vulkan shader generator * Implement Vulkan im2col op * Implement Vulkan unary gelu_quick op * Implement Vulkan group_norm op * Implement Vulkan timestep_embedding op * Implement Vulkan upscale op * Fix Vulkan vk_context tensor extra index issue * Fix Vulkan matmul shader parameter bug * Properly fix Vulkan matmul shader parameter bug * Add Vulkan ADD f16 + f32 -> f16 operator support * Implement Vulkan tanh op * Fix Vulkan group count too large Validation error on non-Nvidia GPUs * Throw error when too much memory is requested * Fix another Vulkan group count too large Validation error on non-Nvidia GPUs * Fix matmul MMQ condition * Implement Vulkan pad op * Fix Vulkan crash when tensor is used multiple times in a compute graph * Add Vulkan CONCAT f16 + f16 -> f16 op * Add Vulkan LEAKY_RELU op
This commit is contained in:
		
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -4,9 +4,11 @@ | |||||||
| #include "generic_binary_head.comp" | #include "generic_binary_head.comp" | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     if (gl_GlobalInvocationID.x >= p.ne) { |     const uint idx = get_idx(); | ||||||
|  |  | ||||||
|  |     if (idx >= p.ne) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); |     data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)])); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -4,10 +4,12 @@ | |||||||
| #include "generic_unary_head.comp" | #include "generic_unary_head.comp" | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     if (gl_GlobalInvocationID.x >= p.ne) { |     const uint idx = get_idx(); | ||||||
|  |  | ||||||
|  |     if (idx >= p.ne) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); |     const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); |     data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										35
									
								
								ggml/src/vulkan-shaders/concat.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								ggml/src/vulkan-shaders/concat.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #include "types.comp" | ||||||
|  | #include "generic_binary_head.comp" | ||||||
|  |  | ||||||
|  | // Concatenation kernel: each invocation writes one destination element, taken from | ||||||
|  | // src0 when the element's coordinates lie inside src0's extents and from src1 otherwise. | ||||||
|  | // p.param3 selects the dimension along which src1 is appended. | ||||||
|  | void main() { | ||||||
|  |     // Flatten the 3D invocation id into a linear destination element index. | ||||||
|  |     const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |     const int concat_dim = p.param3; | ||||||
|  |  | ||||||
|  |     if (elem >= p.ne) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Peel the destination coordinates (i3, i2, i1, i0) off the linear index, | ||||||
|  |     // outermost dimension first. | ||||||
|  |     const uint i3 = elem / (p.ne22*p.ne21*p.ne20); | ||||||
|  |     uint rest = elem - i3 * p.ne22*p.ne21*p.ne20; | ||||||
|  |     const uint i2 = rest / (p.ne21*p.ne20); | ||||||
|  |     rest -= i2*p.ne21*p.ne20; | ||||||
|  |     const uint i1 = rest / p.ne20; | ||||||
|  |     const uint i0 = rest - i1*p.ne20; | ||||||
|  |  | ||||||
|  |     // src1 starts where src0 ends along the concat dimension; every other dimension has no shift. | ||||||
|  |     const uint shift0 = concat_dim == 0 ? p.ne00 : 0; | ||||||
|  |     const uint shift1 = concat_dim == 1 ? p.ne01 : 0; | ||||||
|  |     const uint shift2 = concat_dim == 2 ? p.ne02 : 0; | ||||||
|  |     const uint shift3 = (concat_dim != 0 && concat_dim != 1 && concat_dim != 2) ? p.ne03 : 0; | ||||||
|  |  | ||||||
|  |     const uint a_off = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; | ||||||
|  |     // Only meaningful when the element comes from src1; for src0 elements the | ||||||
|  |     // subtraction may wrap around, but the value is then never used. | ||||||
|  |     const uint b_off = (i3 - shift3)*p.nb13 + (i2 - shift2)*p.nb12 + (i1 - shift1)*p.nb11 + (i0 - shift0)*p.nb10; | ||||||
|  |     const uint d_off = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20; | ||||||
|  |  | ||||||
|  |     const bool from_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; | ||||||
|  |  | ||||||
|  | #ifndef OPTIMIZATION_ERROR_WORKAROUND | ||||||
|  |     data_d[p.d_offset + d_off] = D_TYPE(from_src0 ? data_a[a_off] : data_b[b_off]); | ||||||
|  | #else | ||||||
|  |     data_d[p.d_offset + d_off] = from_src0 ? data_a[a_off] : data_b[b_off]; | ||||||
|  | #endif | ||||||
|  | } | ||||||
| @@ -4,13 +4,15 @@ | |||||||
| #include "generic_unary_head.comp" | #include "generic_unary_head.comp" | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     if (gl_GlobalInvocationID.x >= p.ne) { |     const uint idx = get_idx(); | ||||||
|  |  | ||||||
|  |     if (idx >= p.ne) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| #ifndef OPTIMIZATION_ERROR_WORKAROUND | #ifndef OPTIMIZATION_ERROR_WORKAROUND | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); |     data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]); | ||||||
| #else | #else | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)]; |     data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)]; | ||||||
| #endif | #endif | ||||||
| } | } | ||||||
|   | |||||||
| @@ -4,9 +4,11 @@ | |||||||
| #include "generic_binary_head.comp" | #include "generic_binary_head.comp" | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     if (gl_GlobalInvocationID.x >= p.ne) { |     const uint idx = get_idx(); | ||||||
|  |  | ||||||
|  |     if (idx >= p.ne) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); |     data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)])); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | |||||||
| void main() { | void main() { | ||||||
|     const float GELU_COEF_A    = 0.044715f; |     const float GELU_COEF_A    = 0.044715f; | ||||||
|     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; |     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; | ||||||
|     const uint i = gl_GlobalInvocationID.x; |     const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|     if (i >= p.KX) { |     if (i >= p.KX) { | ||||||
|         return; |         return; | ||||||
|   | |||||||
							
								
								
									
										23
									
								
								ggml/src/vulkan-shaders/gelu_quick.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								ggml/src/vulkan-shaders/gelu_quick.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #include "generic_head.comp" | ||||||
|  | #include "types.comp" | ||||||
|  |  | ||||||
|  | #extension GL_EXT_control_flow_attributes : enable | ||||||
|  |  | ||||||
|  | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | ||||||
|  | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // Element-wise "quick" GELU: y = x * 1 / (1 + exp(-1.702 * x)). | ||||||
|  | void main() { | ||||||
|  |     // Flatten the 3D invocation id into a linear element index. | ||||||
|  |     const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|  |     if (elem >= p.KX) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     const float GELU_QUICK_COEF = -1.702f; | ||||||
|  |  | ||||||
|  |     const float v = float(data_a[elem]); | ||||||
|  |     // Sigmoid-shaped gate applied to the input value. | ||||||
|  |     const float gate = 1.0f / (1.0f + exp(GELU_QUICK_COEF * v)); | ||||||
|  |     data_d[elem] = D_TYPE(v * gate); | ||||||
|  | } | ||||||
| @@ -7,7 +7,7 @@ layout (push_constant) uniform parameter | |||||||
|     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; |     uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; | ||||||
|     uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; |     uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; | ||||||
|     uint d_offset; |     uint d_offset; | ||||||
|     float param1; float param2; |     float param1; float param2; int param3; | ||||||
| } p; | } p; | ||||||
|  |  | ||||||
| layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; | ||||||
| @@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; | |||||||
| layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; | layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; | ||||||
| layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; | layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // Linear element index of this invocation, flattened from the 3D global invocation id. | ||||||
|  | // Strides are 512 per y step and 262144 (= 512 * 512) per z step. | ||||||
|  | // NOTE(review): presumably the host dispatches at most 512 invocations along x and y - confirm. | ||||||
|  | uint get_idx() { | ||||||
|  |     const uvec3 gid = gl_GlobalInvocationID; | ||||||
|  |     return gid.z * 262144 + gid.y * 512 + gid.x; | ||||||
|  | } | ||||||
|  |  | ||||||
| uint src0_idx(uint idx) { | uint src0_idx(uint idx) { | ||||||
|     const uint i03 = idx / (p.ne02*p.ne01*p.ne00); |     const uint i03 = idx / (p.ne02*p.ne01*p.ne00); | ||||||
|     const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; |     const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; | ||||||
|   | |||||||
| @@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; | |||||||
| layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; | ||||||
| layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // Linear element index of this invocation, flattened from the 3D global invocation id. | ||||||
|  | // Strides are 512 per y step and 262144 (= 512 * 512) per z step. | ||||||
|  | // NOTE(review): presumably the host dispatches at most 512 invocations along x and y - confirm. | ||||||
|  | uint get_idx() { | ||||||
|  |     const uvec3 gid = gl_GlobalInvocationID; | ||||||
|  |     return gid.z * 262144 + gid.y * 512 + gid.x; | ||||||
|  | } | ||||||
|  |  | ||||||
| uint src0_idx(uint idx) { | uint src0_idx(uint idx) { | ||||||
|     const uint i03 = idx / (p.ne02*p.ne01*p.ne00); |     const uint i03 = idx / (p.ne02*p.ne01*p.ne00); | ||||||
|     const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; |     const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; | ||||||
|   | |||||||
							
								
								
									
										66
									
								
								ggml/src/vulkan-shaders/group_norm.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								ggml/src/vulkan-shaders/group_norm.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,66 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #include "generic_head.comp" | ||||||
|  | #include "types.comp" | ||||||
|  |  | ||||||
|  | #extension GL_EXT_control_flow_attributes : enable | ||||||
|  | #define BLOCK_SIZE 512 | ||||||
|  |  | ||||||
|  | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | ||||||
|  | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | shared float tmp[BLOCK_SIZE]; | ||||||
|  |  | ||||||
|  | // Group normalization. Workgroup gl_WorkGroupID.x handles the p.KX consecutive elements | ||||||
|  | // starting at gl_WorkGroupID.x * p.KX: it computes their mean and variance with two | ||||||
|  | // shared-memory tree reductions and writes (x - mean) * inversesqrt(variance + p.param1). | ||||||
|  | void main() { | ||||||
|  |     const uint group_size = p.KX; | ||||||
|  |     const float eps = p.param1; | ||||||
|  |  | ||||||
|  |     const uint tid = gl_LocalInvocationID.x; | ||||||
|  |     const uint group_begin = gl_WorkGroupID.x * group_size; | ||||||
|  |     const uint start = group_begin + tid; | ||||||
|  |     // Every thread stops at the end of *this* group. Using start + group_size here would let | ||||||
|  |     // threads with tid > 0 run into the next group whenever group_size is not a multiple of | ||||||
|  |     // BLOCK_SIZE (including group_size < BLOCK_SIZE). | ||||||
|  |     const uint end = group_begin + group_size; | ||||||
|  |  | ||||||
|  |     tmp[tid] = 0.0f; | ||||||
|  |  | ||||||
|  |     // Per-thread partial sum for the mean | ||||||
|  |     [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { | ||||||
|  |         tmp[tid] += float(data_a[col]); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // sum up partial sums; the total ends up in tmp[0] | ||||||
|  |     barrier(); | ||||||
|  |     [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { | ||||||
|  |         if (tid < s) { | ||||||
|  |             tmp[tid] += tmp[tid + s]; | ||||||
|  |         } | ||||||
|  |         barrier(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     const float mean = tmp[0] / group_size; | ||||||
|  |     // All threads must have read tmp[0] before the scratch buffer is reused. | ||||||
|  |     barrier(); | ||||||
|  |     tmp[tid] = 0.0f; | ||||||
|  |  | ||||||
|  |     // Per-thread partial sum of squared deviations for the variance | ||||||
|  |     [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { | ||||||
|  |         const float xi = float(data_a[col]) - mean; | ||||||
|  |         tmp[tid] += xi * xi; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // sum up partial sums; the total ends up in tmp[0] | ||||||
|  |     barrier(); | ||||||
|  |     [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { | ||||||
|  |         if (tid < s) { | ||||||
|  |             tmp[tid] += tmp[tid + s]; | ||||||
|  |         } | ||||||
|  |         barrier(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     const float variance = tmp[0] / group_size; | ||||||
|  |     const float scale = inversesqrt(variance + eps); | ||||||
|  |  | ||||||
|  |     // data_d is declared writeonly, so the normalized value is recomputed from the input | ||||||
|  |     // instead of read back from data_d and scaled in place. | ||||||
|  |     [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) { | ||||||
|  |         data_d[col] = D_TYPE((float(data_a[col]) - mean) * scale); | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										57
									
								
								ggml/src/vulkan-shaders/im2col.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								ggml/src/vulkan-shaders/im2col.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,57 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #extension GL_EXT_shader_16bit_storage : require | ||||||
|  |  | ||||||
|  | layout (push_constant) uniform parameter | ||||||
|  | { | ||||||
|  |     uint batch_offset; uint offset_delta; | ||||||
|  |     uint IC; | ||||||
|  |     uint IW; uint IH; | ||||||
|  |     uint OW; uint OH; | ||||||
|  |     uint KW; uint KH; | ||||||
|  |     uint pelements; | ||||||
|  |     uint CHW; | ||||||
|  |     int s0; int s1; | ||||||
|  |     int p0; int p1; | ||||||
|  |     int d0; int d1; | ||||||
|  | } p; | ||||||
|  |  | ||||||
|  | #include "types.comp" | ||||||
|  |  | ||||||
|  | #define BLOCK_SIZE 256 | ||||||
|  |  | ||||||
|  | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | ||||||
|  | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // im2col: each invocation writes one element of the unfolded output. The x invocation | ||||||
|  | // id encodes (kernel x, kernel y, output x), y is the output row, and z encodes | ||||||
|  | // (batch, input channel). Positions that fall into the padding are written as zero. | ||||||
|  | void main() { | ||||||
|  |     const uint i = gl_GlobalInvocationID.x; | ||||||
|  |     if (i >= p.pelements) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Split i into kernel column kx, kernel row ky and output column ix. | ||||||
|  |     const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1); | ||||||
|  |     const uint kx = i / ksize; | ||||||
|  |     const uint kd = kx * ksize; | ||||||
|  |     const uint ky = (i - kd) / p.OW; | ||||||
|  |     const uint ix = i % p.OW; | ||||||
|  |  | ||||||
|  |     const uint oh = gl_GlobalInvocationID.y; | ||||||
|  |     const uint batch = gl_GlobalInvocationID.z / p.IC; | ||||||
|  |     const uint ic = gl_GlobalInvocationID.z % p.IC; | ||||||
|  |  | ||||||
|  |     // Input coordinates, computed in unsigned arithmetic: a coordinate that would be | ||||||
|  |     // negative (left/top padding) wraps around to a very large value. | ||||||
|  |     const uint iiw = ix * uint(p.s0) + kx * uint(p.d0) - uint(p.p0); | ||||||
|  |     const uint iih = oh * uint(p.s1) + ky * uint(p.d1) - uint(p.p1); | ||||||
|  |  | ||||||
|  |     const uint offset_dst = | ||||||
|  |         ((batch * p.OH + oh) * p.OW + ix) * p.CHW + | ||||||
|  |         (ic * (p.KW * p.KH) + ky * p.KW + kx); | ||||||
|  |  | ||||||
|  |     // A "< 0" test can never be true for uint; wrapped-around (negative) coordinates are | ||||||
|  |     // rejected by the upper-bound checks, which therefore cover both sides of the padding. | ||||||
|  |     if (iih >= p.IH || iiw >= p.IW) { | ||||||
|  |         data_d[offset_dst] = D_TYPE(0.0f); | ||||||
|  |     } else { | ||||||
|  |         const uint offset_src = ic * p.offset_delta + batch * p.batch_offset; | ||||||
|  |         data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]); | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										22
									
								
								ggml/src/vulkan-shaders/leaky_relu.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								ggml/src/vulkan-shaders/leaky_relu.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #include "generic_head.comp" | ||||||
|  | #include "types.comp" | ||||||
|  |  | ||||||
|  | #extension GL_EXT_control_flow_attributes : enable | ||||||
|  |  | ||||||
|  | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | ||||||
|  | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // Element-wise leaky ReLU: the positive part passes through unchanged, | ||||||
|  | // the negative part is scaled by p.param1. | ||||||
|  | void main() { | ||||||
|  |     // Flatten the 3D invocation id into a linear element index. | ||||||
|  |     const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|  |     if (elem >= p.KX) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     const float x = float(data_a[elem]); | ||||||
|  |     const float positive_part = max(x, 0.0f); | ||||||
|  |     const float negative_part = min(x, 0.0f); | ||||||
|  |     data_d[elem] = D_TYPE(positive_part + negative_part * p.param1); | ||||||
|  | } | ||||||
| @@ -4,9 +4,11 @@ | |||||||
| #include "generic_binary_head.comp" | #include "generic_binary_head.comp" | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     if (gl_GlobalInvocationID.x >= p.ne) { |     const uint idx = get_idx(); | ||||||
|  |  | ||||||
|  |     if (idx >= p.ne) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); |     data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)])); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | |||||||
| shared vec2 sum[BLOCK_SIZE]; | shared vec2 sum[BLOCK_SIZE]; | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     const uint row = gl_WorkGroupID.x; |     const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; | ||||||
|     const uint tid = gl_LocalInvocationID.x; |     const uint tid = gl_LocalInvocationID.x; | ||||||
|  |  | ||||||
|     sum[tid] = vec2(0.0f, 0.0f); |     sum[tid] = vec2(0.0f, 0.0f); | ||||||
|   | |||||||
							
								
								
									
										26
									
								
								ggml/src/vulkan-shaders/pad.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								ggml/src/vulkan-shaders/pad.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #include "types.comp" | ||||||
|  | #include "generic_unary_head.comp" | ||||||
|  |  | ||||||
|  | // Zero padding: each invocation writes one destination element, copying from the source | ||||||
|  | // when the element's coordinates lie inside the source extents and writing 0 otherwise. | ||||||
|  | void main() { | ||||||
|  |     // Flatten the 3D invocation id into a linear destination element index. | ||||||
|  |     const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|  |     if (elem >= p.ne) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Peel the destination coordinates (i3, i2, i1, i0) off the linear index, | ||||||
|  |     // outermost dimension first. | ||||||
|  |     const uint i3 = elem / (p.ne12*p.ne11*p.ne10); | ||||||
|  |     uint rest = elem - i3 * p.ne12*p.ne11*p.ne10; | ||||||
|  |     const uint i2 = rest / (p.ne11*p.ne10); | ||||||
|  |     rest -= i2*p.ne11*p.ne10; | ||||||
|  |     const uint i1 = rest / p.ne10; | ||||||
|  |     const uint i0 = rest - i1*p.ne10; | ||||||
|  |  | ||||||
|  |     const uint a_off = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00; | ||||||
|  |     const uint d_off = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10; | ||||||
|  |  | ||||||
|  |     const bool inside_src = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03; | ||||||
|  |  | ||||||
|  |     if (inside_src) { | ||||||
|  |         data_d[p.d_offset + d_off] = D_TYPE(data_a[a_off]); | ||||||
|  |     } else { | ||||||
|  |         data_d[p.d_offset + d_off] = D_TYPE(0.0f); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | |||||||
| layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     const uint i = gl_GlobalInvocationID.x; |     const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|     if (i >= p.KX) { |     if (i >= p.KX) { | ||||||
|         return; |         return; | ||||||
|   | |||||||
| @@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | |||||||
| shared FLOAT_TYPE sum[BLOCK_SIZE]; | shared FLOAT_TYPE sum[BLOCK_SIZE]; | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     const uint row = gl_WorkGroupID.x; |     const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; | ||||||
|     const uint tid = gl_LocalInvocationID.x; |     const uint tid = gl_LocalInvocationID.x; | ||||||
|  |  | ||||||
|     sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp |     sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp | ||||||
|   | |||||||
| @@ -4,9 +4,11 @@ | |||||||
| #include "generic_unary_head.comp" | #include "generic_unary_head.comp" | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     if (gl_GlobalInvocationID.x >= p.ne) { |     const uint idx = get_idx(); | ||||||
|  |  | ||||||
|  |     if (idx >= p.ne) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1)); |     data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | |||||||
| layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     const uint i = gl_GlobalInvocationID.x; |     const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|     if (i >= p.KX) { |     if (i >= p.KX) { | ||||||
|         return; |         return; | ||||||
|   | |||||||
| @@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE]; | |||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     const uint tid = gl_LocalInvocationID.x; |     const uint tid = gl_LocalInvocationID.x; | ||||||
|     const uint rowx = gl_WorkGroupID.x; |     const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; | ||||||
|     const uint rowy = rowx % p.KY; |     const uint rowy = rowx % p.KY; | ||||||
|  |  | ||||||
|     float slope = 1.0f; |     float slope = 1.0f; | ||||||
|   | |||||||
| @@ -4,10 +4,12 @@ | |||||||
| #include "generic_unary_head.comp" | #include "generic_unary_head.comp" | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     if (gl_GlobalInvocationID.x >= p.ne) { |     const uint idx = get_idx(); | ||||||
|  |  | ||||||
|  |     if (idx >= p.ne) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]); |     const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]); | ||||||
|     data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val); |     data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32; | |||||||
| shared FLOAT_TYPE tmp[BLOCK_SIZE]; | shared FLOAT_TYPE tmp[BLOCK_SIZE]; | ||||||
|  |  | ||||||
| void main() { | void main() { | ||||||
|     const uint row = gl_WorkGroupID.x; |     const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; | ||||||
|     const uint col = gl_LocalInvocationID.x; |     const uint col = gl_LocalInvocationID.x; | ||||||
|  |  | ||||||
|     tmp[col] = FLOAT_TYPE(0.0f); |     tmp[col] = FLOAT_TYPE(0.0f); | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								ggml/src/vulkan-shaders/tanh.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								ggml/src/vulkan-shaders/tanh.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #include "generic_head.comp" | ||||||
|  | #include "types.comp" | ||||||
|  |  | ||||||
|  | #extension GL_EXT_control_flow_attributes : enable | ||||||
|  |  | ||||||
|  | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | ||||||
|  | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // Element-wise hyperbolic tangent. | ||||||
|  | void main() { | ||||||
|  |     const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|  |     if (i >= p.KX) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Convert explicitly, like the other element-wise shaders, so the math runs in float | ||||||
|  |     // whatever A_TYPE is. | ||||||
|  |     const float x = float(data_a[i]); | ||||||
|  |  | ||||||
|  |     // tanh(x) already rounds to +/-1 in single precision for |x| > 10, so return the | ||||||
|  |     // saturated value directly there. NOTE(review): this guards against tanh() | ||||||
|  |     // implementations that evaluate via exp() and overflow to inf/inf = NaN for large | ||||||
|  |     // |x| - confirm against the target drivers. NaN inputs fail the comparison and still | ||||||
|  |     // go through tanh(). | ||||||
|  |     const float y = abs(x) > 10.0f ? sign(x) : tanh(x); | ||||||
|  |     data_d[i] = D_TYPE(y); | ||||||
|  | } | ||||||
							
								
								
									
										41
									
								
								ggml/src/vulkan-shaders/timestep_embedding.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										41
									
								
								ggml/src/vulkan-shaders/timestep_embedding.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,41 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | #extension GL_EXT_shader_16bit_storage : require | ||||||
|  |  | ||||||
|  | layout (push_constant) uniform parameter | ||||||
|  | { | ||||||
|  |     uint nb1; | ||||||
|  |     uint dim; | ||||||
|  |     uint max_period; | ||||||
|  | } p; | ||||||
|  |  | ||||||
|  | #include "types.comp" | ||||||
|  |  | ||||||
|  | #extension GL_EXT_control_flow_attributes : enable | ||||||
|  | #define BLOCK_SIZE 256 | ||||||
|  |  | ||||||
|  | layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; | ||||||
|  | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // Sinusoidal timestep embedding. Workgroup row gl_WorkGroupID.y selects the timestep | ||||||
|  | // data_a[i]; invocation j along x writes cos(t * freq_j) to column j and sin(t * freq_j) | ||||||
|  | // to column j + dim/2 of output row i, where freq_j = exp(-log(max_period) * j / (dim/2)). | ||||||
|  | void main() { | ||||||
|  |     const uint i = gl_WorkGroupID.y; | ||||||
|  |     const uint j = gl_GlobalInvocationID.x; | ||||||
|  |     const uint d_offset = i * p.nb1; | ||||||
|  |  | ||||||
|  |     // For an odd dim, one invocation per row writes a zero at column p.dim. | ||||||
|  |     // The store goes through D_TYPE like every other write to data_d. | ||||||
|  |     if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) { | ||||||
|  |         data_d[d_offset + p.dim] = D_TYPE(0.0f); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     const uint half_dim = p.dim / 2; | ||||||
|  |     if (j >= half_dim) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     const float timestep = float(data_a[i]); | ||||||
|  |     // uint -> float conversions spelled out; evaluation order is unchanged. | ||||||
|  |     const float freq = exp(-log(float(p.max_period)) * float(j) / float(half_dim)); | ||||||
|  |     const float arg = timestep * freq; | ||||||
|  |     data_d[d_offset + j] = D_TYPE(cos(arg)); | ||||||
|  |     data_d[d_offset + j + half_dim] = D_TYPE(sin(arg)); | ||||||
|  | } | ||||||
| @@ -6,7 +6,7 @@ | |||||||
| #define QUANT_K 1 | #define QUANT_K 1 | ||||||
| #define QUANT_R 1 | #define QUANT_R 1 | ||||||
|  |  | ||||||
| #ifndef LOAD_VEC_A | #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 | ||||||
| #define A_TYPE float | #define A_TYPE float | ||||||
| #elif LOAD_VEC_A == 4 | #elif LOAD_VEC_A == 4 | ||||||
| #define A_TYPE vec4 | #define A_TYPE vec4 | ||||||
| @@ -19,7 +19,7 @@ | |||||||
| #define QUANT_K 1 | #define QUANT_K 1 | ||||||
| #define QUANT_R 1 | #define QUANT_R 1 | ||||||
|  |  | ||||||
| #ifndef LOAD_VEC_A | #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 | ||||||
| #define A_TYPE float16_t | #define A_TYPE float16_t | ||||||
| #elif LOAD_VEC_A == 4 | #elif LOAD_VEC_A == 4 | ||||||
| #define A_TYPE f16vec4 | #define A_TYPE f16vec4 | ||||||
|   | |||||||
							
								
								
									
										36
									
								
								ggml/src/vulkan-shaders/upscale.comp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								ggml/src/vulkan-shaders/upscale.comp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | |||||||
|  | #version 450 | ||||||
|  |  | ||||||
|  | layout (push_constant) uniform parameter | ||||||
|  | { | ||||||
|  |     uint ne; uint d_offset; | ||||||
|  |     uint nb00; uint nb01; uint nb02; uint nb03; | ||||||
|  |     uint ne10; uint ne11; uint ne12; uint ne13; | ||||||
|  |     float sf0; float sf1; float sf2; float sf3; | ||||||
|  | } p; | ||||||
|  |  | ||||||
|  | #include "types.comp" | ||||||
|  |  | ||||||
|  | layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; | ||||||
|  |  | ||||||
|  | layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; | ||||||
|  | layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; | ||||||
|  |  | ||||||
|  | // Upscale by index scaling: each invocation writes one destination element, read from the | ||||||
|  | // source element whose coordinates are the destination coordinates divided by the | ||||||
|  | // per-dimension scale factors p.sf0..p.sf3 (truncated to integers). | ||||||
|  | void main() { | ||||||
|  |     // Flatten the 3D invocation id into a linear destination element index. | ||||||
|  |     const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; | ||||||
|  |  | ||||||
|  |     if (elem >= p.ne) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Destination coordinates, innermost dimension first. | ||||||
|  |     const uint d0 = elem % p.ne10; | ||||||
|  |     const uint d1 = (elem / p.ne10) % p.ne11; | ||||||
|  |     const uint d2 = (elem / (p.ne10 * p.ne11)) % p.ne12; | ||||||
|  |     const uint d3 = (elem / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; | ||||||
|  |  | ||||||
|  |     // Matching source coordinates. | ||||||
|  |     const uint s0 = uint(d0 / p.sf0); | ||||||
|  |     const uint s1 = uint(d1 / p.sf1); | ||||||
|  |     const uint s2 = uint(d2 / p.sf2); | ||||||
|  |     const uint s3 = uint(d3 / p.sf3); | ||||||
|  |  | ||||||
|  |     const uint src_off = s3 * p.nb03 + s2 * p.nb02 + s1 * p.nb01 + s0 * p.nb00; | ||||||
|  |     data_d[p.d_offset + elem] = D_TYPE(data_a[src_off]); | ||||||
|  | } | ||||||
| @@ -30,20 +30,6 @@ | |||||||
|  |  | ||||||
| #define ASYNCIO_CONCURRENCY 64 | #define ASYNCIO_CONCURRENCY 64 | ||||||
|  |  | ||||||
| // define prototypes |  | ||||||
| void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str); |  | ||||||
| bool directory_exists(const std::string& path); |  | ||||||
| bool create_directory(const std::string& path); |  | ||||||
| std::string to_uppercase(const std::string& input); |  | ||||||
| bool string_ends_with(const std::string& str, const std::string& suffix); |  | ||||||
| std::string join_paths(const std::string& path1, const std::string& path2); |  | ||||||
| std::string basename(const std::string &path); |  | ||||||
| void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16); |  | ||||||
| std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b); |  | ||||||
| void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id); |  | ||||||
| void process_shaders(std::vector<std::future<void>>& tasks); |  | ||||||
| void write_output_files(); |  | ||||||
|  |  | ||||||
| std::mutex lock; | std::mutex lock; | ||||||
| std::vector<std::pair<std::string, std::string>> shader_fnames; | std::vector<std::pair<std::string, std::string>> shader_fnames; | ||||||
|  |  | ||||||
| @@ -52,7 +38,7 @@ std::string input_dir = "vulkan-shaders"; | |||||||
| std::string output_dir = "/tmp"; | std::string output_dir = "/tmp"; | ||||||
| std::string target_hpp = "ggml-vulkan-shaders.hpp"; | std::string target_hpp = "ggml-vulkan-shaders.hpp"; | ||||||
| std::string target_cpp = "ggml-vulkan-shaders.cpp"; | std::string target_cpp = "ggml-vulkan-shaders.cpp"; | ||||||
| bool clean = true; | bool no_clean = false; | ||||||
|  |  | ||||||
| const std::vector<std::string> type_names = { | const std::vector<std::string> type_names = { | ||||||
|     "f32", |     "f32", | ||||||
| @@ -283,9 +269,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu | |||||||
|  |  | ||||||
|     for (const auto& tname : type_names) { |     for (const auto& tname : type_names) { | ||||||
|         std::string data_a_key = "DATA_A_" + to_uppercase(tname); |         std::string data_a_key = "DATA_A_" + to_uppercase(tname); | ||||||
|  |         // For unaligned, load one at a time for f32/f16, or two at a time for quants | ||||||
|  |         std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2"; | ||||||
|  |         // For aligned matmul loads | ||||||
|         std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2"; |         std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2"; | ||||||
|         tasks.push_back(std::async(std::launch::async, [=] { |         tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|             string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); |             string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); | ||||||
|         })); |         })); | ||||||
|         tasks.push_back(std::async(std::launch::async, [=] { |         tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|             string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); |             string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); | ||||||
| @@ -354,6 +343,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) { | |||||||
|     tasks.push_back(std::async(std::launch::async, [=] { |     tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|         string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); |         string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); | ||||||
|     })); |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|  |         string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); | ||||||
|  |     })); | ||||||
|     tasks.push_back(std::async(std::launch::async, [=] { |     tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|         string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); |         string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); | ||||||
|     })); |     })); | ||||||
| @@ -371,6 +363,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) { | |||||||
|     tasks.push_back(std::async(std::launch::async, [] { |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|         string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); |         string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); | ||||||
|     })); |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}}); | ||||||
|  |     })); | ||||||
|  |  | ||||||
|     tasks.push_back(std::async(std::launch::async, [] { |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|         string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); |         string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); | ||||||
| @@ -396,15 +391,42 @@ void process_shaders(std::vector<std::future<void>>& tasks) { | |||||||
|         string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); |         string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); | ||||||
|     })); |     })); | ||||||
|  |  | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|  |     })); | ||||||
|  |  | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|  |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); | ||||||
|  |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}}); | ||||||
|  |     })); | ||||||
|  |  | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|  |     })); | ||||||
|  |  | ||||||
|     tasks.push_back(std::async(std::launch::async, [] { |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|         string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); |         string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|     })); |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|  |     })); | ||||||
|     tasks.push_back(std::async(std::launch::async, [] { |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|         string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); |         string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|     })); |     })); | ||||||
|     tasks.push_back(std::async(std::launch::async, [] { |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|         string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); |         string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|     })); |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|  |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|  |         string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
|  |     })); | ||||||
|  |  | ||||||
|     tasks.push_back(std::async(std::launch::async, [] { |     tasks.push_back(std::async(std::launch::async, [] { | ||||||
|         string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); |         string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); | ||||||
| @@ -438,6 +460,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) { | |||||||
|     tasks.push_back(std::async(std::launch::async, [=] { |     tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|         string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); |         string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); | ||||||
|     })); |     })); | ||||||
|  |  | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|  |         string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); | ||||||
|  |     })); | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|  |         string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}})); | ||||||
|  |     })); | ||||||
|  |  | ||||||
|  |     tasks.push_back(std::async(std::launch::async, [=] { | ||||||
|  |         string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); | ||||||
|  |     })); | ||||||
| } | } | ||||||
|  |  | ||||||
| void write_output_files() { | void write_output_files() { | ||||||
| @@ -478,9 +511,8 @@ void write_output_files() { | |||||||
|         } |         } | ||||||
|         fprintf(src, "\n};\n\n"); |         fprintf(src, "\n};\n\n"); | ||||||
|  |  | ||||||
|         if (clean) { |         if (!no_clean) { | ||||||
|             std::remove(path.c_str()); |             std::remove(path.c_str()); | ||||||
|             // fprintf(stderr, "Removed: %s\n", path.c_str()); |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -496,18 +528,6 @@ int main(int argc, char** argv) { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (argc <= 1 || args.find("--help") != args.end()) { |  | ||||||
|         std::cout << "Usage:\n" |  | ||||||
|                      "\tvulkan-shaders-gen [options]\n\n" |  | ||||||
|                      "Options:\n" |  | ||||||
|                      "\t--glslc <path>        Path to glslc executable (default: /usr/bin/glslc)\n" |  | ||||||
|                      "\t--input-dir           Directory containing shader sources (required)\n" |  | ||||||
|                      "\t--output-dir          Output directory for generated SPIR-V files and optional C++ headers\n" |  | ||||||
|                      "\t--target-hpp <path>   Path to generate a header file with shader declarations in C++ format\n" |  | ||||||
|                      "\t--target-cpp <path>   Path to generate a source code file implementing the declared shaders (optional)\n" |  | ||||||
|                      "\t--no-clean            Keep temporary SPIR-V files after build (default: remove them)\n"; |  | ||||||
|         return EXIT_SUCCESS; |  | ||||||
|     } |  | ||||||
|     if (args.find("--glslc") != args.end()) { |     if (args.find("--glslc") != args.end()) { | ||||||
|         GLSLC = args["--glslc"]; // Path to glslc |         GLSLC = args["--glslc"]; // Path to glslc | ||||||
|     } |     } | ||||||
| @@ -524,7 +544,7 @@ int main(int argc, char** argv) { | |||||||
|         target_cpp = args["--target-cpp"]; // Path to generated cpp file |         target_cpp = args["--target-cpp"]; // Path to generated cpp file | ||||||
|     } |     } | ||||||
|     if (args.find("--no-clean") != args.end()) { |     if (args.find("--no-clean") != args.end()) { | ||||||
|         clean = false; // Keep temporary SPIR-V files in output-dir after build |         no_clean = true; // Keep temporary SPIR-V files in output-dir after build | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if (!directory_exists(input_dir)) { |     if (!directory_exists(input_dir)) { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 0cc4m
					0cc4m