#include "mmq.cuh"
|
|
#include "quantize.cuh"
|
|
|
|
#include <vector>
|
|
|
|
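// Dispatch to the mul_mat_q kernel instantiation that matches the quantization type of src0.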
static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
    switch (args.type_x) {
        case GGML_TYPE_Q4_0:
            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_1:
            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_0:
            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_1:
            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
            break;
        case GGML_TYPE_Q8_0:
            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
            break;
        case GGML_TYPE_Q2_K:
            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q3_K:
            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q4_K:
            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q5_K:
            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
            break;
        case GGML_TYPE_Q6_K:
            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XXS:
            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_XS:
            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ2_S:
            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_XXS:
            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_XS:
            mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
            break;
        case GGML_TYPE_IQ4_NL:
            mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}

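// Top-level MMQ entry point. Multiplies the quantized matrix src0 with the F32 matrix src1.
// If ids is given (GGML_MUL_MAT_ID, e.g. mixture-of-experts), it selects which slice of src0
// (which expert) each row of src1 is multiplied with.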
void ggml_cuda_mul_mat_q(
        ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
    GGML_ASSERT( src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type  == GGML_TYPE_F32);
    GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.

    GGML_TENSOR_BINARY_OP_LOCALS;

    cudaStream_t stream = ctx.stream();
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;

    const size_t ts_src0 = ggml_type_size(src0->type);
    const size_t ts_src1 = ggml_type_size(src1->type);
    const size_t ts_dst  = ggml_type_size(dst->type);

    GGML_ASSERT( nb00 == ts_src0);
    GGML_ASSERT( nb10 == ts_src1);
    GGML_ASSERT( nb0  == ts_dst);
    GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));

    const char  * src0_d = (const char  *) src0->data;
    const float * src1_d = (const float *) src1->data;
    float       * dst_d  = (float       *) dst->data;

    const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);

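    // Byte strides from the tensors are converted to strides in elements (dst) or blocks (src0)
    // by dividing by the respective type size.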
    const int64_t s01 = src0->nb[1] / ts_src0;
    const int64_t s1  =  dst->nb[1] / ts_dst;
    const int64_t s02 = src0->nb[2] / ts_src0;
    const int64_t s2  =  dst->nb[2] / ts_dst;
    const int64_t s03 = src0->nb[3] / ts_src0;
    const int64_t s3  =  dst->nb[3] / ts_dst;

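    // The stream-k decomposition is only faster for recent NVIDIA GPUs (Volta or newer).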
    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;

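    // Regular mul_mat (no expert routing): quantize src1 to q8_1 in the MMQ layout and launch the kernel directly.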
    if (!ids) {
        const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
            get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
        ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

        {
            const int64_t s11 = src1->nb[1] / ts_src1;
            const int64_t s12 = src1->nb[2] / ts_src1;
            const int64_t s13 = src1->nb[3] / ts_src1;
            quantize_mmq_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type,
                ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
        }

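        // Strides of the quantized src1 copy, in units of int to match the (const int *) pointer passed to the kernel.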
        const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
        const int64_t s13 = ne12*s12;

        const mmq_args args = {
            src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
            ne00, ne01, ne1, s01, s1,
            ne02, ne12, s02, s12, s2,
            ne03, ne13, s03, s13, s3,
            use_stream_k};
        ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
        return;
    }

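    // GGML_MUL_MAT_ID path: ids selects, per token, which experts (slices of src0) are used.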
    GGML_ASSERT(ne13 == 1);
    GGML_ASSERT(nb12 % nb11 == 0);
    GGML_ASSERT(nb2  % nb1  == 0);

    const int64_t n_expert_used = ids->ne[0];
    const int64_t ne_get_rows   = ne12 * n_expert_used;

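    // The expert ids are copied to the host in order to build the gather/scatter index buffers below.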
    std::vector<char> ids_host(ggml_nbytes(ids));
    std::vector<int32_t> ids_src1_host;
    ids_src1_host.reserve(ne_get_rows);
    std::vector<int32_t> ids_dst_host;
    ids_dst_host.reserve(ne_get_rows);
    std::vector<int32_t> tokens_per_expert_host(ne02);
    std::vector<int32_t> expert_bounds_host(ne02 + 1);
    ggml_cuda_pool_alloc<int32_t> ids_buf_dev(ctx.pool());

    CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids->data, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

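    // For each expert, collect which src1 rows (tokens) are routed to it and which dst rows they map to,
    // and count the number of tokens per expert.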
    for (int64_t i02 = 0; i02 < ne02; ++i02) { // expert matrices
        for (int64_t i12 = 0; i12 < ne12; ++i12) { // tokens
            for (int64_t iex = 0; iex < n_expert_used; ++iex) {
                const int32_t expert_to_use = *(const int32_t *)(ids_host.data() + i12*ids->nb[1] + iex*ids->nb[0]);
                assert(expert_to_use >= 0 && expert_to_use < ne02);
                if (expert_to_use == i02) {
                    ids_src1_host.push_back(i12*(nb12/nb11) + iex % ne11);
                    ids_dst_host.push_back(i12*ne1 + iex);
                    tokens_per_expert_host[i02]++;
                    break;
                }
            }
        }
    }

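    // Prefix sum of the per-expert token counts: expert_bounds_host[i] is the start offset of
    // expert i's rows in the gathered index arrays.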
    int32_t cumsum = 0;
    for (int64_t i = 0; i < ne02; ++i) {
        expert_bounds_host[i] = cumsum;
        cumsum += tokens_per_expert_host[i];
    }
    expert_bounds_host[ne02] = cumsum;

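    // Pack src1 indices, dst indices, and expert bounds into one buffer so a single host-to-device copy suffices.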
    std::vector<int32_t> ids_buf_host;
    ids_buf_host.reserve(ids_src1_host.size() + ids_dst_host.size() + expert_bounds_host.size());
    ids_buf_host.insert(ids_buf_host.end(), ids_src1_host.begin(), ids_src1_host.end());
    ids_buf_host.insert(ids_buf_host.end(), ids_dst_host.begin(), ids_dst_host.end());
    ids_buf_host.insert(ids_buf_host.end(), expert_bounds_host.begin(), expert_bounds_host.end());
    ids_buf_dev.alloc(ids_buf_host.size() + get_mmq_x_max_host(cc)); // Expert bounds are padded on device.
    CUDA_CHECK(cudaMemcpyAsync(ids_buf_dev.ptr, ids_buf_host.data(), ids_buf_host.size()*sizeof(int32_t), cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    const int32_t * ids_src1_dev      = ids_buf_dev.ptr;
    const int32_t * ids_dst_dev       = ids_src1_dev + ids_src1_host.size();
    const int32_t * expert_bounds_dev = ids_dst_dev  + ids_dst_host.size();

    const size_t nbytes_src1_q8_1 = ne12*n_expert_used*ne10_padded * sizeof(block_q8_1)/QK8_1 +
        get_mmq_x_max_host(cc)*sizeof(block_q8_1_mmq);
    ggml_cuda_pool_alloc<char> src1_q8_1(ctx.pool(), nbytes_src1_q8_1);

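    // The gathered src1 rows are quantized as one flat batch of ne12*n_expert_used rows (no higher batch dimensions).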
    const int64_t ne11_flat = ne12*n_expert_used;
    const int64_t ne12_flat = 1;
    const int64_t ne13_flat = 1;

    {
        const int64_t s11 = src1->nb[1] / ts_src1;
        const int64_t s12 = src1->nb[2] / ts_src1;
        const int64_t s13 = src1->nb[2] / ts_src1;
        quantize_mmq_q8_1_cuda(src1_d, ids_src1_dev, src1_q8_1.get(), src0->type,
            ne10, s11, s12, s13, ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
    }

    const int64_t s12 = ne11*ne10_padded * sizeof(block_q8_1)/(QK8_1*sizeof(int));
    const int64_t s13 = ne12*s12;

    // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
    const mmq_args args = {
        src0_d, src0->type, (const int *) src1_q8_1.ptr, ids_dst_dev, expert_bounds_dev, dst_d,
        ne00, ne01, ne_get_rows, s01, s1,
        ne02, ne02, s02, s12, s2,
        ne03, ne13, s03, s13, s3,
        use_stream_k};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
}

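// MMQ implementation of the generic per-device mul_mat op: src1 has already been quantized to q8_1
// (src1_ddq_i) and only the row range [row_low, row_high) of src0 is processed by this device.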
void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;
    const int64_t stride01 = ne00 / ggml_blck_size(src0->type);

    const int id = ggml_cuda_get_device();
    const int cc = ggml_cuda_info().devices[id].cc;

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

    // The stream-k decomposition is only faster for recent NVIDIA GPUs.
    // Also its fixup needs to allocate a temporary buffer in the memory pool.
    // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
    const mmq_args args = {
        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
        ne00, row_diff, src1_ncols, stride01, nrows_dst,
        1, 1, 0, 0, 0,
        1, 1, 0, 0, 0,
        use_stream_k};

    ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);

    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
    GGML_UNUSED(src1_ddf_i);
    GGML_UNUSED(src1_padded_row_size);
}

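// Decide whether the MMQ kernels should be used for the given src0 type, device compute capability,
// and number of src1 columns (ne11); if not, a cuBLAS-based path is used instead.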
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#ifdef GGML_CUDA_FORCE_CUBLAS
    return false;
#endif // GGML_CUDA_FORCE_CUBLAS

    bool mmq_supported;

    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ4_NL:
            mmq_supported = true;
            break;
        default:
            mmq_supported = false;
            break;
    }

    if (!mmq_supported) {
        return false;
    }

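    // The mma-based MMQ kernels (int8 tensor cores) are preferred whenever they are available.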
    if (new_mma_available(cc)) {
        return true;
    }

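    // MMQ requires at least DP4A support (as compiled).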
    if (ggml_cuda_highest_compiled_arch(cc) < GGML_CUDA_CC_DP4A) {
        return false;
    }

#ifdef GGML_CUDA_FORCE_MMQ
    return true;
#endif //GGML_CUDA_FORCE_MMQ

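    // On NVIDIA, use MMQ unless FP16 tensor cores are available and the batch size ne11 is large.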
    if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
        return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

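    // Otherwise (AMD): use MMQ except on RDNA3/RDNA4/CDNA with large batch sizes.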
    return (!GGML_CUDA_CC_IS_RDNA4(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}