ggml : refactor online repacking (#10446)

* rename ggml-cpu-aarch64.c to .cpp

* reformat extra cpu backend.

- clean Q4_0_N_M and IQ4_0_N_M
  - remove from "file" tensor type
  - allow only with dynamic repack

- extract cpu extra bufts and convert to C++
  - hbm
  - "aarch64"

- more generic use of extra buffer
  - generalise extra_supports_op
  - new API for "cpu-accel":
     - amx
     - aarch64

* clang-format

* Clean Q4_0_N_M ref

Enable restrict on C++

* add op GGML_OP_MUL_MAT_ID for Q4_0_N_M with runtime repack

* added/corrected control on tensor size for Q4 repacking.

* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* add debug logs on repacks.

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Djip007
2024-12-07 13:37:50 +01:00
committed by GitHub
parent c2a16c0bdb
commit 19d8762ab6
33 changed files with 1136 additions and 1049 deletions

View File

@@ -18,10 +18,6 @@
#include <unistd.h>
#endif
#if defined(_OPENMP)
#include <omp.h>
#endif
#if (defined(_WIN32) || defined(_WIN64))
#define RESTRICT __restrict
#else
@@ -1382,13 +1378,13 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K>
#define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size
template<typename TB, int BLOCK_K>
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) {
void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) {
const int NB = N / TILE_N;
const int KB = K / BLOCK_K;
const int TILE_SIZE = get_tile_size<TB>();
// parallel on NB should be enough
parallel_for(n_threads, NB, [&](int begin, int end) {
parallel_for(NB, [&](int begin, int end) {
for (int n = begin; n < end; ++n) {
for (int k = 0; k < KB; ++k) {
int n0 = n * TILE_N;
@@ -2334,15 +2330,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d
const int K = tensor->ne[0]; // ne0: in_features
const int N = tensor->ne[1]; // ne1: out_features
#if defined(_OPENMP)
// the buffer ctx is not initialized when .set_tensor is called
int n_threads = omp_get_num_threads();
#else
int n_threads = 1;
#endif
GGML_DISPATCH_QTYPES(TYPE, [&] {
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads);
convert_B_packed_format<type, blck_size>((void *)((char *)tensor->data + offset), (const type *)data, N, K);
});
}