mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	vulkan: Add N/2 and N/4 optimized paths in coopmat2 shader (#12312)
This commit is contained in:
		@@ -23,6 +23,10 @@ layout (constant_id = 1) const uint BM = 64;
 | 
			
		||||
// Specialization constant (id 2, default 64): N extent of the B-matrix and accumulator coopmat tiles; each tile starts at column ic * BN.
layout (constant_id = 2) const uint BN = 64;
 | 
			
		||||
// Specialization constant (id 3, default 16): K extent of the A/B coopmat tiles and the step by which block_k advances in the K loops.
layout (constant_id = 3) const uint BK = 16;  // Assumed to be 32 if working with a quant
 | 
			
		||||
 | 
			
		||||
// Specialization constant (id 4, default false): when true, BNover2/BNover4 become BN/2 and BN/4 and
// main() may take the narrower-tile paths guarded by `enable_smaller_matrices && ...`.
layout (constant_id = 4) const bool enable_smaller_matrices = false;
 | 
			
		||||
// Half-width N tile: BN / 2 when enable_smaller_matrices is set, otherwise BN itself.
// Used as a coopmat dimension on the path taken when ic * BN + BNover2 >= p.N.
const uint BNover2 = enable_smaller_matrices ? (BN / 2) : BN;
 | 
			
		||||
// Quarter-width N tile: BN / 4 when enable_smaller_matrices is set, otherwise BN itself.
// Used as a coopmat dimension on the path taken when ic * BN + BNover4 >= p.N (checked before the BNover2 path).
const uint BNover4 = enable_smaller_matrices ? (BN / 4) : BN;
 | 
			
		||||
 | 
			
		||||
layout (push_constant) uniform parameter
 | 
			
		||||
{
 | 
			
		||||
    uint M;
 | 
			
		||||
@@ -168,15 +172,13 @@ void main() {
 | 
			
		||||
    const uint end_k = min(p.K, (ik + 1) * p.k_split);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
 | 
			
		||||
    sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
 | 
			
		||||
 | 
			
		||||
#ifdef MUL_MAT_ID
 | 
			
		||||
    uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K;
 | 
			
		||||
    uint pos_b = 0;
 | 
			
		||||
#else
 | 
			
		||||
    uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K;
 | 
			
		||||
    uint pos_b = batch_idx * p.batch_stride_b;
 | 
			
		||||
    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    uint stride_a = p.stride_a / QUANT_K;
 | 
			
		||||
@@ -197,6 +199,7 @@ void main() {
 | 
			
		||||
    tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2);
 | 
			
		||||
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
 | 
			
		||||
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
 | 
			
		||||
    tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
 | 
			
		||||
 | 
			
		||||
#if QUANT_K > 1
 | 
			
		||||
    tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
 | 
			
		||||
@@ -232,16 +235,54 @@ void main() {
 | 
			
		||||
        tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
 | 
			
		||||
 | 
			
		||||
        uint k_iters = (end_k - start_k + BK - 1) / BK;
 | 
			
		||||
        if (enable_smaller_matrices && ic * BN + BNover4 >= p.N) {
 | 
			
		||||
            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator>(0.0);
 | 
			
		||||
            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
 | 
			
		||||
 | 
			
		||||
        for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
 | 
			
		||||
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
 | 
			
		||||
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
 | 
			
		||||
 | 
			
		||||
            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
 | 
			
		||||
            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
 | 
			
		||||
                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
 | 
			
		||||
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose);
 | 
			
		||||
 | 
			
		||||
            coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
 | 
			
		||||
            coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
 | 
			
		||||
                sum = coopMatMulAdd(mat_a, mat_b, sum);
 | 
			
		||||
            }
 | 
			
		||||
            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator>(sum);
 | 
			
		||||
 | 
			
		||||
            sum = coopMatMulAdd(mat_a, mat_b, sum);
 | 
			
		||||
            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BNover4, ir * BM, BM), tensorViewTranspose);
 | 
			
		||||
            return;
 | 
			
		||||
        } else if (enable_smaller_matrices && ic * BN + BNover2 >= p.N) {
 | 
			
		||||
            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator>(0.0);
 | 
			
		||||
            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
 | 
			
		||||
 | 
			
		||||
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
 | 
			
		||||
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
 | 
			
		||||
 | 
			
		||||
                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
 | 
			
		||||
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose);
 | 
			
		||||
 | 
			
		||||
                sum = coopMatMulAdd(mat_a, mat_b, sum);
 | 
			
		||||
            }
 | 
			
		||||
            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator>(sum);
 | 
			
		||||
 | 
			
		||||
            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BNover2, ir * BM, BM), tensorViewTranspose);
 | 
			
		||||
            return;
 | 
			
		||||
        } else {
 | 
			
		||||
            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
 | 
			
		||||
            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
 | 
			
		||||
 | 
			
		||||
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
 | 
			
		||||
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
 | 
			
		||||
 | 
			
		||||
                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
 | 
			
		||||
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
 | 
			
		||||
 | 
			
		||||
                sum = coopMatMulAdd(mat_a, mat_b, sum);
 | 
			
		||||
            }
 | 
			
		||||
            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
 | 
			
		||||
 | 
			
		||||
            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
    } else
 | 
			
		||||
#endif // !defined(MUL_MAT_ID)
 | 
			
		||||
@@ -254,6 +295,9 @@ void main() {
 | 
			
		||||
 | 
			
		||||
        tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1);
 | 
			
		||||
 | 
			
		||||
        coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
 | 
			
		||||
        sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
 | 
			
		||||
 | 
			
		||||
        [[dont_unroll]]
 | 
			
		||||
        for (uint block_k = start_k; block_k < end_k; block_k += BK) {
 | 
			
		||||
 | 
			
		||||
@@ -296,19 +340,16 @@ void main() {
 | 
			
		||||
                sum = coopMatMulAdd(mat_a, mat_b, sum);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Convert from ACC_TYPE to D_TYPE
 | 
			
		||||
    coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
 | 
			
		||||
    mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
 | 
			
		||||
        // Convert from ACC_TYPE to D_TYPE
 | 
			
		||||
        coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
 | 
			
		||||
        mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
 | 
			
		||||
 | 
			
		||||
#ifdef MUL_MAT_ID
 | 
			
		||||
    // Call callback to store each element, remapping row through shared memory
 | 
			
		||||
    coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
 | 
			
		||||
        // Call callback to store each element, remapping row through shared memory
 | 
			
		||||
        coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
 | 
			
		||||
#else
 | 
			
		||||
    tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
 | 
			
		||||
 | 
			
		||||
    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
 | 
			
		||||
    coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
 | 
			
		||||
        coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user