mirror of https://github.com/ggml-org/llama.cpp.git
zdnn: refactor codebase + add docs (#16178)
* zdnn: initial matmul refactor
* ggml-zdnn: rm static from funcs
* ggml-zdnn: update ggml-zdnn.h
* ggml-zdnn: change header files to hpp
* ggml-zdnn: switch to common.hpp
* ggml-zdnn: move mulmat forward around
* ggml-zdnn: rm inline from utils
* ggml-zdnn: code cleanup
* docs: add zDNN docs

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
README.md
@@ -274,6 +274,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 
docs/backend/zDNN.md (new file, 61 lines)
@@ -0,0 +1,61 @@

# llama.cpp for IBM zDNN Accelerator

## Background

IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.

### Llama.cpp + IBM zDNN

The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.

## Software & Hardware Support

| Hardware Level       | Status        | Verified                   |
| -------------------- | ------------- | -------------------------- |
| IBM z17 / LinuxONE 5 | Supported     | RHEL 9.6, IBM z17, 40 IFLs |
| IBM z16 / LinuxONE 4 | Not Supported |                            |

## Data Types Supported

| Data Type | Status    |
| --------- | --------- |
| F32       | Supported |
| F16       | Supported |
| BF16      | Supported |

## CMake Options

The IBM zDNN backend has the following CMake options that control the behaviour of the backend.

| CMake Option | Default Value | Description                         |
| ------------ | ------------- | ----------------------------------- |
| `GGML_ZDNN`  | `OFF`         | Compile llama.cpp with zDNN support |
| `ZDNN_ROOT`  | `""`          | Override zDNN library lookup        |

## 1. Install zDNN Library

Note: Using the zDNN library provided via `apt` or `yum` may not work correctly, as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.

```sh
git clone --recurse-submodules https://github.com/IBM/zDNN
cd zDNN

autoreconf .
./configure --prefix=/opt/zdnn-libs

make build
sudo make install
```

## 2. Build llama.cpp

```sh
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp

cmake -S . -G Ninja -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_ZDNN=ON \
    -DZDNN_ROOT=/opt/zdnn-libs
cmake --build build --config Release -j$(nproc)
```
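## 3. Run llama.cpp

As a quick smoke test of the freshly built backend, any F32/F16/BF16 GGUF model will do; the model path below is a placeholder, not something shipped with the repository:

```sh
build/bin/llama-cli -m /path/to/model.gguf -p "Hello, my name is" -n 32
```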
ggml/include/ggml-zdnn.h
@@ -7,6 +7,9 @@
 extern "C" {
 #endif
 
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
+
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
 
 #ifdef __cplusplus
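For orientation, a minimal sketch of exercising the two entry points this header exposes; it assumes only the public registry helpers from `ggml-backend.h` (`ggml_backend_reg_name`, `ggml_backend_buft_name`) and is not part of the commit:

```cpp
// Illustrative probe of the zDNN backend registration; assumes the
// standard ggml-backend registry API, not code from this commit.
#include "ggml-backend.h"
#include "ggml-zdnn.h"

#include <cstdio>

int main() {
    // registration entry point (present before this change)
    ggml_backend_reg_t reg = ggml_backend_zdnn_reg();
    std::printf("backend: %s\n", ggml_backend_reg_name(reg));

    // the newly exposed device buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_zdnn_buffer_type();
    std::printf("buffer type: %s\n", ggml_backend_buft_name(buft));
    return 0;
}
```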
ggml/src/ggml-zdnn/.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
zdnn.h
ggml/src/ggml-zdnn/common.hpp (new file, 59 lines)
@@ -0,0 +1,59 @@
#ifndef GGML_ZDNN_COMMON_HPP
#define GGML_ZDNN_COMMON_HPP

#include "ggml.h"
#include "ggml-impl.h"

#include "zdnn.h"

#include <vector>
#include <memory>

#define GGML_ZDNN_NAME    "zDNN"
#define GGML_ZDNN_VERSION ZDNN_VERNUM

#define ZDNN_CHECK(stmt)                 \
    do {                                 \
        zdnn_status status = (stmt);     \
        GGML_ASSERT(status == ZDNN_OK);  \
    } while (0);

struct ggml_backend_zdnn_device_context {
    int zdnn_device;
    int zdnn_device_ref_count;

    bool has_parmblkformat_0;
    bool has_parmblkformat_1;  // checks for z17

    size_t max_size;

    char name[128];
};

struct ggml_backend_zdnn_context {
    int device;
    ggml_cgraph * gf;
};

struct ggml_backend_zdnn_buffer {
    void * data;
    ggml_backend_zdnn_buffer * extra;  // for bias, etc.
    size_t size;

    zdnn_tensor_desc pre_tfm_desc;
    zdnn_tensor_desc tfm_desc;
    zdnn_ztensor     ztensor;

    char name[GGML_MAX_NAME];
};

struct ggml_backend_zdnn_buffer_context {
    void * all_data;
    size_t all_size;
    bool   owned;

    int n_buffers;
    std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers;
};

#endif  // GGML_ZDNN_COMMON_HPP
ggml-zdnn-impl.h (deleted file, 98 lines)
@@ -1,98 +0,0 @@
#ifndef GGML_ZDNN_IMPL
#define GGML_ZDNN_IMPL

#include "zdnn.h"
#include "ggml.h"
#include "ggml-zdnn.h"

#include <vector>
#include <memory>
#include <vecintrin.h>

#define GGML_ZDNN_NAME    "zDNN"
#define GGML_ZDNN_VERSION ZDNN_VERNUM

#define vec_neg(a)    (-(a))                // Vector Negate
#define vec_add(a, b) ((a) + (b))           // Vector Add
#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
#define vec_div(a, b) ((a) / (b))           // Vector Divide
#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet

#ifndef vec_and
#define vec_and(a, b) ((a) & (b)) // Vector AND
#endif

#ifndef vec_or
#define vec_or(a, b)  ((a) | (b)) // Vector OR
#endif

#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
#endif

typedef signed char char8x16_t __attribute__((vector_size(16)));
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));

typedef int8_t   int8x16_t __attribute__((vector_size(16)));
typedef int16_t  int16x8_t __attribute__((vector_size(16)));
typedef int32_t  int32x4_t __attribute__((vector_size(16)));
typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));

typedef float  float32x4_t __attribute__((vector_size(16)));
typedef double double64x2_t __attribute__((vector_size(16)));

typedef signed long long   long64x2_t __attribute__((vector_size(16)));
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));

#define ZDNN_CHECK(stmt)                 \
    do {                                 \
        zdnn_status status = (stmt);     \
        GGML_ASSERT(status == ZDNN_OK);  \
    } while (0);

struct ggml_backend_zdnn_device_context {
    int zdnn_device;
    int zdnn_device_ref_count;

    bool has_parmblkformat_0;
    bool has_parmblkformat_1;

    size_t max_size;

    char name[128];
};

struct ggml_backend_zdnn_context {
    int device;
    ggml_cgraph * gf;
};

struct ggml_backend_zdnn_buffer {
    void * data;
    ggml_backend_zdnn_buffer * extra;  // for bias, etc.
    size_t size;

    zdnn_tensor_desc pre_tfm_desc;
    zdnn_tensor_desc tfm_desc;
    zdnn_ztensor     ztensor;

    char name[GGML_MAX_NAME];
};

struct ggml_backend_zdnn_buffer_context {
    void * all_data;
    size_t all_size;
    bool   owned;

    int n_buffers;
    std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers;
};

#endif  // GGML_ZDNN_IMPL
ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -1,187 +1,38 @@
-#include "zdnn.h"
 #include "ggml-zdnn.h"
-#include "ggml-zdnn-impl.h"
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
+#include "ggml-zdnn/common.hpp"
+#include "ggml-zdnn/mmf.hpp"
+#include "ggml-zdnn/utils.hpp"
+#include "ggml.h"
+
 #include <vector>
 #include <memory>
-#include <csignal>
+#include <csignal>  // raise(SIGTRAP)
 #include <unistd.h>
 
-inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32:
-            return FP32;
-        case GGML_TYPE_F16:
-            return FP16;
-        case GGML_TYPE_BF16:
-            return BFLOAT;
-        case GGML_TYPE_I8:
-            return INT8;
-        case GGML_TYPE_I32:
-            return INT32;
-        case GGML_TYPE_Q8_0:
-            return INT8;
-        default:
-            GGML_ABORT("%s: fatal: unable to determine zTensor data type",
-                       __func__);
-            break;
-    }
-}
-
-inline void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
-                                    zdnn_tensor_desc & tfm_desc,
-                                    zdnn_ztensor & ztensor,
-                                    const ggml_tensor * src,
-                                    const int64_t * ne,
-                                    const zdnn_data_layouts layout) {
-    zdnn_init_pre_transformed_desc(
-        layout,
-        ggml_zdnn_type_mapping(src->type),
-        &pre_tfm_desc,
-        ne[3], ne[2], ne[1], ne[0]
-    );
-
-    ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
-    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
-}
-
-inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor,
-                                  void * buffer) {
-    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
-}
-
-inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) {
-    switch (tensor->op) {
-        case GGML_OP_MUL_MAT:
-            {
-                zdnn_init_pre_transformed_desc(
-                    ZDNN_2D,
-                    ggml_zdnn_type_mapping(tensor->type),
-                    &buffer->pre_tfm_desc,
-                    tensor->ne[1], tensor->ne[0]
-                );
-            } break;
-
-        default:
-            {
-                // For 4D tensors, GGML uses NCHW layout. However, because zDNN
-                // automatically transforms everything to NHWC, we will use it
-                // directly to avoid the performance penalty changing the
-                // layout and reshaping the tensor.
-                zdnn_init_pre_transformed_desc(
-                    ZDNN_NHWC,
-                    ggml_zdnn_type_mapping(tensor->type),
-                    &buffer->pre_tfm_desc,
-                    tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]
-                );
-
-                // TODO: Consider adding a ggml check.
-                // TODO: If tensor = 4D, use ZDNN_NCHW by default.
-                // TODO: If tensor = 2D, use ZDNN_NHWC by default.
-            } break;
-    }
-
-    ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc));
-    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
-}
-
-static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const enum ggml_type type = src0->type;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    const ggml_tensor * weights = src0;
-    const ggml_tensor * inputs  = src1;
-          ggml_tensor * output  = dst;
-
-    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
-    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
-    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
-    ggml_backend_zdnn_buffer * bias_extra    = (ggml_backend_zdnn_buffer *)output_extra->extra;
-
-    const int64_t weights_rows = ne01;
-    const int64_t weights_cols = ne00;
-    const int64_t inputs_rows  = ne11;
-    const int64_t inputs_cols  = ne10;
-
-    assert(inputs_cols == weights_cols);
-
-    const int64_t output_rows = ne1;
-    const int64_t output_cols = ne0;
-
-    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
-    //               __func__, weights_extra->name,
-    //               weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
-    //               weights_extra->pre_tfm_desc.dim1,
-    //               weights_extra->pre_tfm_desc.dim2,
-    //               weights_extra->pre_tfm_desc.dim3,
-    //               weights_extra->pre_tfm_desc.dim4);
-
-    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
-    //               __func__, inputs_extra->name,
-    //               inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
-    //               inputs_extra->pre_tfm_desc.dim1,
-    //               inputs_extra->pre_tfm_desc.dim2,
-    //               inputs_extra->pre_tfm_desc.dim3,
-    //               inputs_extra->pre_tfm_desc.dim4);
-
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
-
-    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
-                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
-    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
-    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(weights_rows);
-    GGML_UNUSED(weights_cols);
-    GGML_UNUSED(inputs_rows);
-    GGML_UNUSED(inputs_cols);
-    GGML_UNUSED(output_rows);
-    GGML_UNUSED(output_cols);
-}
-
-static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // debug helpers
-    // GGML_LOG_INFO("%s: use_mul_mat_vec   = %d\n", __func__, use_mul_mat_vec);
-    // GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q);
-    // GGML_LOG_INFO("%s: use_mul_mat_q     = %d\n", __func__, use_mul_mat_q);
-    // GGML_LOG_INFO("%s: src0: %8d %8d %8d %8d\n", __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    // GGML_LOG_INFO("%s:       %8d %8d %8d %8d\n", __func__, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    // GGML_LOG_INFO("%s: src1: %8d %8d %8d %8d\n", __func__, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    // GGML_LOG_INFO("%s:       %8d %8d %8d %8d\n", __func__, src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    // GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    // GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
-}
-
-static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
+static void ggml_zdnn_compute_forward_mul_mat(
+    const ggml_backend_zdnn_context * ctx,
+          ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];  // weights
+    const ggml_tensor * src1 = dst->src[1];  // inputs
+
+    // TODO: implement support for quantized types
+    // we currently only support f32, f16, and bf16
+    ggml_zdnn_mul_mat_f(ctx, src0, src1, dst);
+}
+
+static bool ggml_zdnn_compute_forward(
+    ggml_backend_zdnn_context * ctx,
+    ggml_tensor * dst) {
     switch (dst->op) {
         case GGML_OP_MUL_MAT:
-            ggml_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst);
-            break;
+            {
+                ggml_zdnn_compute_forward_mul_mat(ctx, dst);
+            } break;
 
         default:
             return false;
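For context, a hedged sketch of the driver loop that typically sits above `ggml_zdnn_compute_forward` in a ggml backend's `graph_compute` callback; the function name and the skipped no-op list are assumptions modeled on other ggml backends, not code from this commit:

```cpp
// Hypothetical driver (modeled on other ggml backends such as Metal):
// walk the graph, skip nodes that need no computation, and dispatch
// each remaining op through the forward function above.
static enum ggml_status ggml_zdnn_graph_compute_sketch(
        ggml_backend_zdnn_context * ctx, ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; i++) {
        ggml_tensor * node = gf->nodes[i];

        switch (node->op) {
            case GGML_OP_NONE:
            case GGML_OP_RESHAPE:
            case GGML_OP_VIEW:
            case GGML_OP_PERMUTE:
            case GGML_OP_TRANSPOSE:
                continue;  // pure view/no-op nodes need no kernel
            default:
                break;
        }

        if (!ggml_zdnn_compute_forward(ctx, node)) {
            GGML_LOG_ERROR("%s: unsupported op %s\n", __func__, ggml_op_name(node->op));
            return GGML_STATUS_FAILED;
        }
    }
    return GGML_STATUS_SUCCESS;
}
```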
ggml/src/ggml-zdnn/mmf.cpp (new file, 80 lines)
@@ -0,0 +1,80 @@
#include "ggml.h"
#include "mmf.hpp"

void ggml_zdnn_mul_mat_f(
    const ggml_backend_zdnn_context * ctx,
    const ggml_tensor * src0,
    const ggml_tensor * src1,
          ggml_tensor * dst) {
    GGML_TENSOR_BINARY_OP_LOCALS;

    const enum ggml_type type = src0->type;

    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);
    GGML_ASSERT(ne2 == ne12);
    GGML_ASSERT(ne3 == ne13);

    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));
    GGML_ASSERT(nb10 == ggml_type_size(src1->type));

    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 == sizeof(float));
    GGML_ASSERT(nb0 <= nb1);
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);

    const ggml_tensor * weights = src0;
    const ggml_tensor * inputs  = src1;
          ggml_tensor * output  = dst;

    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
    ggml_backend_zdnn_buffer * bias_extra    = (ggml_backend_zdnn_buffer *)output_extra->extra;

    const int64_t weights_rows = ne01;
    const int64_t weights_cols = ne00;
    const int64_t inputs_rows  = ne11;
    const int64_t inputs_cols  = ne10;

    assert(inputs_cols == weights_cols);

    const int64_t output_rows = ne1;
    const int64_t output_cols = ne0;

    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
    //               __func__, weights_extra->name,
    //               weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
    //               weights_extra->pre_tfm_desc.dim1,
    //               weights_extra->pre_tfm_desc.dim2,
    //               weights_extra->pre_tfm_desc.dim3,
    //               weights_extra->pre_tfm_desc.dim4);

    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
    //               __func__, inputs_extra->name,
    //               inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
    //               inputs_extra->pre_tfm_desc.dim1,
    //               inputs_extra->pre_tfm_desc.dim2,
    //               inputs_extra->pre_tfm_desc.dim3,
    //               inputs_extra->pre_tfm_desc.dim4);

    GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
    GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");

    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));

    GGML_UNUSED(ctx);
    GGML_UNUSED(weights_rows);
    GGML_UNUSED(weights_cols);
    GGML_UNUSED(inputs_rows);
    GGML_UNUSED(inputs_cols);
    GGML_UNUSED(output_rows);
    GGML_UNUSED(output_cols);
}
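A note on the transpose flags above, since the shape bookkeeping is easy to misread; this is a reading of the call as written, not an addition from the commit:

```cpp
// Shape mapping for ggml's MUL_MAT on zDNN (reading of the call above):
//   weights (src0): ne00 = K (cols), ne01 = M (rows)
//   inputs  (src1): ne10 = K (cols), ne11 = N (rows); K must match
//   output  (dst):  ne0  = M,        ne1  = N
// zdnn_matmul_transpose_op(inputs, weights, bias,
//                          /*transpose_a=*/false, /*transpose_b=*/true,
//                          MATMUL_OP_ADDITION, output)
// therefore computes output[N, M] = inputs[N, K] * weights[M, K]^T + bias,
// which is exactly ggml's mul_mat convention dst = src1 . src0^T.
```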
ggml/src/ggml-zdnn/mmf.hpp (new file, 12 lines)
@@ -0,0 +1,12 @@
#ifndef GGML_ZDNN_MMF_HPP
#define GGML_ZDNN_MMF_HPP

#include "common.hpp"

void ggml_zdnn_mul_mat_f(
    const ggml_backend_zdnn_context * ctx,
    const ggml_tensor * src0,
    const ggml_tensor * src1,
          ggml_tensor * dst);

#endif  // GGML_ZDNN_MMF_HPP
ggml/src/ggml-zdnn/utils.cpp (new file, 79 lines)
@@ -0,0 +1,79 @@
#include "ggml.h"
#include "utils.hpp"

zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
            return FP32;
        case GGML_TYPE_F16:
            return FP16;
        case GGML_TYPE_BF16:
            return BFLOAT;
        case GGML_TYPE_Q8_0:
            return INT8;
        case GGML_TYPE_I8:
            return INT8;
        case GGML_TYPE_I32:
            return INT32;
        default:
            GGML_ABORT("%s: fatal: unable to determine zTensor data type",
                       __func__);
            break;
    }
}

void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
                             zdnn_tensor_desc & tfm_desc,
                             zdnn_ztensor & ztensor,
                             const ggml_tensor * src,
                             const int64_t * ne,
                             const zdnn_data_layouts layout) {
    zdnn_init_pre_transformed_desc(
        layout,
        ggml_zdnn_type_mapping(src->type),
        &pre_tfm_desc,
        ne[3], ne[2], ne[1], ne[0]
    );

    ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
}

void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, void * buffer) {
    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
}

void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) {
    switch (tensor->op) {
        case GGML_OP_MUL_MAT:
            {
                zdnn_init_pre_transformed_desc(
                    ZDNN_2D,
                    ggml_zdnn_type_mapping(tensor->type),
                    &buffer->pre_tfm_desc,
                    tensor->ne[1], tensor->ne[0]
                );
            } break;

        default:
            {
                // For 4D tensors, GGML uses NCHW layout. However, because zDNN
                // automatically transforms everything to NHWC, we will use it
                // directly to avoid the performance penalty changing the
                // layout and reshaping the tensor.
                zdnn_init_pre_transformed_desc(
                    ZDNN_NHWC,
                    ggml_zdnn_type_mapping(tensor->type),
                    &buffer->pre_tfm_desc,
                    tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]
                );

                // TODO: Consider adding a ggml check.
                // TODO: If tensor = 4D, use ZDNN_NCHW by default.
                // TODO: If tensor = 2D, use ZDNN_NHWC by default.
            } break;
    }

    ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc));
    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
}
ggml/src/ggml-zdnn/utils.hpp (new file, 19 lines)
@@ -0,0 +1,19 @@
#ifndef GGML_ZDNN_UTILITIES_HPP
#define GGML_ZDNN_UTILITIES_HPP

#include "common.hpp"

zdnn_data_types ggml_zdnn_type_mapping(ggml_type type);

void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
                             zdnn_tensor_desc & tfm_desc,
                             zdnn_ztensor & ztensor,
                             const ggml_tensor * src,
                             const int64_t * ne,
                             const zdnn_data_layouts layout);

void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, void * buffer);

void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor);

#endif  // GGML_ZDNN_UTILITIES_HPP