mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	@@ -4416,7 +4416,6 @@ void kernel_mul_mv_q2_K_f32_impl(
 | 
				
			|||||||
        device const half     * dh = &x[ib].d;
 | 
					        device const half     * dh = &x[ib].d;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for (int row = 0; row < N_DST; row++) {
 | 
					        for (int row = 0; row < N_DST; row++) {
 | 
				
			||||||
 | 
					 | 
				
			||||||
            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
 | 
					            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
 | 
				
			||||||
            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
 | 
					            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
 | 
				
			||||||
            for (int i = 0; i < 8; i += 2) {
 | 
					            for (int i = 0; i < 8; i += 2) {
 | 
				
			||||||
@@ -4447,7 +4446,7 @@ void kernel_mul_mv_q2_K_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum;
 | 
					            dst_f32[first_row + row] = all_sum;
 | 
				
			||||||
@@ -4613,7 +4612,7 @@ void kernel_mul_mv_q3_K_f32_impl(
 | 
				
			|||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (tiisg == 0) {
 | 
					    if (tiisg == 0) {
 | 
				
			||||||
        for (int row = 0; row < 2; ++row) {
 | 
					        for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
            dst_f32[first_row + row] = sumf1[row];
 | 
					            dst_f32[first_row + row] = sumf1[row];
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -4729,7 +4728,7 @@ void kernel_mul_mv_q4_K_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum;
 | 
					            dst_f32[first_row + row] = all_sum;
 | 
				
			||||||
@@ -4861,7 +4860,7 @@ void kernel_mul_mv_q5_K_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < 2; ++row) {
 | 
					    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        const float tot = simd_sum(sumf[row]);
 | 
					        const float tot = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = tot;
 | 
					            dst_f32[first_row + row] = tot;
 | 
				
			||||||
@@ -4906,6 +4905,10 @@ void kernel_mul_mv_q6_K_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    const int row = 2*r0 + sgitg;
 | 
					    const int row = 2*r0 + sgitg;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if (row >= args.ne0) {
 | 
				
			||||||
 | 
					        return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    const uint i12 = im%args.ne12;
 | 
					    const uint i12 = im%args.ne12;
 | 
				
			||||||
    const uint i13 = im/args.ne12;
 | 
					    const uint i13 = im/args.ne12;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -5061,7 +5064,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum * 0.25f;
 | 
					            dst_f32[first_row + row] = all_sum * 0.25f;
 | 
				
			||||||
@@ -5179,7 +5182,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum * 0.25f;
 | 
					            dst_f32[first_row + row] = all_sum * 0.25f;
 | 
				
			||||||
@@ -5289,7 +5292,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum * 0.5f;
 | 
					            dst_f32[first_row + row] = all_sum * 0.5f;
 | 
				
			||||||
@@ -5401,7 +5404,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum;
 | 
					            dst_f32[first_row + row] = all_sum;
 | 
				
			||||||
@@ -5514,7 +5517,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum * 0.25f;
 | 
					            dst_f32[first_row + row] = all_sum * 0.25f;
 | 
				
			||||||
@@ -5614,7 +5617,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum;
 | 
					            dst_f32[first_row + row] = all_sum;
 | 
				
			||||||
@@ -5709,7 +5712,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < N_DST; ++row) {
 | 
					    for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum;
 | 
					            dst_f32[first_row + row] = all_sum;
 | 
				
			||||||
@@ -5799,7 +5802,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
 | 
					    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum;
 | 
					            dst_f32[first_row + row] = all_sum;
 | 
				
			||||||
@@ -5888,7 +5891,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
					    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int row = 0; row < 2; ++row) {
 | 
					    for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
 | 
				
			||||||
        all_sum = simd_sum(sumf[row]);
 | 
					        all_sum = simd_sum(sumf[row]);
 | 
				
			||||||
        if (tiisg == 0) {
 | 
					        if (tiisg == 0) {
 | 
				
			||||||
            dst_f32[first_row + row] = all_sum;
 | 
					            dst_f32[first_row + row] = all_sum;
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user