mirror of https://github.com/ggml-org/llama.cpp.git
zdnn: refactor codebase + add docs (#16178)
* zdnn: initial matmul refactor
* ggml-zdnn: rm static from funcs
* ggml-zdnn: update ggml-zdnn.h
* ggml-zdnn: change header files to hpp
* ggml-zdnn: switch to common.hpp
* ggml-zdnn: move mulmat forward around
* ggml-zdnn: rm inline from utils
* ggml-zdnn: code cleanup
* docs: add zDNN docs

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
README.md
@@ -274,6 +274,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 
docs/backend/zDNN.md (new file, 61 lines)
@@ -0,0 +1,61 @@

# llama.cpp for IBM zDNN Accelerator

## Background

IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.

### Llama.cpp + IBM zDNN

The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.

## Software & Hardware Support

| Hardware Level       | Status        | Verified                   |
| -------------------- | ------------- | -------------------------- |
| IBM z17 / LinuxONE 5 | Supported     | RHEL 9.6, IBM z17, 40 IFLs |
| IBM z16 / LinuxONE 4 | Not Supported |                            |

## Data Types Supported

| Data Type | Status    |
| --------- | --------- |
| F32       | Supported |
| F16       | Supported |
| BF16      | Supported |

## CMake Options

The IBM zDNN backend has the following CMake options that control the behaviour of the backend.

| CMake Option | Default Value | Description                         |
| ------------ | ------------- | ----------------------------------- |
| `GGML_ZDNN`  | `OFF`         | Compile llama.cpp with zDNN support |
| `ZDNN_ROOT`  | `""`          | Override zDNN library lookup        |

## 1. Install zDNN Library

Note: Using the zDNN library provided via `apt` or `yum` may not work correctly, as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.

```sh
git clone --recurse-submodules https://github.com/IBM/zDNN
cd zDNN

autoreconf .
./configure --prefix=/opt/zdnn-libs

make build
sudo make install
```

## 2. Build llama.cpp

```sh
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp

cmake -S . -G Ninja -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_ZDNN=ON \
    -DZDNN_ROOT=/opt/zdnn-libs
cmake --build build --config Release -j$(nproc)
```
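## 3. Run llama.cpp

As a quick smoke test of the freshly built backend, any F32/F16/BF16 GGUF model will do; the model path below is a placeholder, not something shipped with the repository:

```sh
build/bin/llama-cli -m /path/to/model.gguf -p "Hello, my name is" -n 32
```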
ggml/include/ggml-zdnn.h
@@ -7,6 +7,9 @@
 extern "C" {
 #endif
 
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
+
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
 
 #ifdef __cplusplus
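For orientation, a minimal sketch of exercising the two entry points this header exposes; it assumes only the public registry helpers from `ggml-backend.h` (`ggml_backend_reg_name`, `ggml_backend_buft_name`) and is not part of the commit:

```cpp
// Illustrative probe of the zDNN backend registration; assumes the
// standard ggml-backend registry API, not code from this commit.
#include "ggml-backend.h"
#include "ggml-zdnn.h"

#include <cstdio>

int main() {
    // registration entry point (present before this change)
    ggml_backend_reg_t reg = ggml_backend_zdnn_reg();
    std::printf("backend: %s\n", ggml_backend_reg_name(reg));

    // the newly exposed device buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_zdnn_buffer_type();
    std::printf("buffer type: %s\n", ggml_backend_buft_name(buft));
    return 0;
}
```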
ggml/src/ggml-zdnn/.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
zdnn.h
ggml/src/ggml-zdnn/common.hpp (new file, 59 lines)
@@ -0,0 +1,59 @@
#ifndef GGML_ZDNN_COMMON_HPP
#define GGML_ZDNN_COMMON_HPP

#include "ggml.h"
#include "ggml-impl.h"

#include "zdnn.h"

#include <vector>
#include <memory>

#define GGML_ZDNN_NAME    "zDNN"
#define GGML_ZDNN_VERSION ZDNN_VERNUM

#define ZDNN_CHECK(stmt)                 \
    do {                                 \
        zdnn_status status = (stmt);     \
        GGML_ASSERT(status == ZDNN_OK);  \
    } while (0);

struct ggml_backend_zdnn_device_context {
    int zdnn_device;
    int zdnn_device_ref_count;

    bool has_parmblkformat_0;
    bool has_parmblkformat_1;  // checks for z17

    size_t max_size;

    char name[128];
};

struct ggml_backend_zdnn_context {
    int device;
    ggml_cgraph * gf;
};

struct ggml_backend_zdnn_buffer {
    void * data;
    ggml_backend_zdnn_buffer * extra;  // for bias, etc.
    size_t size;

    zdnn_tensor_desc pre_tfm_desc;
    zdnn_tensor_desc tfm_desc;
    zdnn_ztensor     ztensor;

    char name[GGML_MAX_NAME];
};

struct ggml_backend_zdnn_buffer_context {
    void * all_data;
    size_t all_size;
    bool   owned;

    int n_buffers;
    std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers;
};

#endif  // GGML_ZDNN_COMMON_HPP
ggml-zdnn-impl.h (deleted file, 98 lines)
@@ -1,98 +0,0 @@
#ifndef GGML_ZDNN_IMPL
#define GGML_ZDNN_IMPL

#include "zdnn.h"
#include "ggml.h"
#include "ggml-zdnn.h"

#include <vector>
#include <memory>
#include <vecintrin.h>

#define GGML_ZDNN_NAME    "zDNN"
#define GGML_ZDNN_VERSION ZDNN_VERNUM

#define vec_neg(a)    (-(a))                // Vector Negate
#define vec_add(a, b) ((a) + (b))           // Vector Add
#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
#define vec_div(a, b) ((a) / (b))           // Vector Divide
#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet

#ifndef vec_and
#define vec_and(a, b) ((a) & (b)) // Vector AND
#endif

#ifndef vec_or
#define vec_or(a, b)  ((a) | (b)) // Vector OR
#endif

#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
#endif

typedef signed char char8x16_t __attribute__((vector_size(16)));
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));

typedef int8_t   int8x16_t __attribute__((vector_size(16)));
typedef int16_t  int16x8_t __attribute__((vector_size(16)));
typedef int32_t  int32x4_t __attribute__((vector_size(16)));
typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));

typedef float  float32x4_t __attribute__((vector_size(16)));
typedef double double64x2_t __attribute__((vector_size(16)));

typedef signed long long   long64x2_t __attribute__((vector_size(16)));
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));

#define ZDNN_CHECK(stmt)                 \
    do {                                 \
        zdnn_status status = (stmt);     \
        GGML_ASSERT(status == ZDNN_OK);  \
    } while (0);

struct ggml_backend_zdnn_device_context {
    int zdnn_device;
    int zdnn_device_ref_count;

    bool has_parmblkformat_0;
    bool has_parmblkformat_1;

    size_t max_size;

    char name[128];
};

struct ggml_backend_zdnn_context {
    int device;
    ggml_cgraph * gf;
};

struct ggml_backend_zdnn_buffer {
    void * data;
    ggml_backend_zdnn_buffer * extra;  // for bias, etc.
    size_t size;

    zdnn_tensor_desc pre_tfm_desc;
    zdnn_tensor_desc tfm_desc;
    zdnn_ztensor     ztensor;

    char name[GGML_MAX_NAME];
};

struct ggml_backend_zdnn_buffer_context {
    void * all_data;
    size_t all_size;
    bool   owned;

    int n_buffers;
    std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers;
};

#endif  // GGML_ZDNN_IMPL
ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -1,187 +1,38 @@
-#include "zdnn.h"
 #include "ggml-zdnn.h"
-#include "ggml-zdnn-impl.h"
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
+#include "ggml-zdnn/common.hpp"
+#include "ggml-zdnn/mmf.hpp"
+#include "ggml-zdnn/utils.hpp"
+#include "ggml.h"
+
 #include <vector>
 #include <memory>
-#include <csignal>
+#include <csignal>  // raise(SIGTRAP)
 #include <unistd.h>
 
-inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_F32:
-            return FP32;
-        case GGML_TYPE_F16:
-            return FP16;
-        case GGML_TYPE_BF16:
-            return BFLOAT;
-        case GGML_TYPE_I8:
-            return INT8;
-        case GGML_TYPE_I32:
-            return INT32;
-        case GGML_TYPE_Q8_0:
-            return INT8;
-        default:
-            GGML_ABORT("%s: fatal: unable to determine zTensor data type",
-                       __func__);
-            break;
-    }
-}
-
-inline void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
-                                    zdnn_tensor_desc & tfm_desc,
-                                    zdnn_ztensor & ztensor,
-                                    const ggml_tensor * src,
-                                    const int64_t * ne,
-                                    const zdnn_data_layouts layout) {
-    zdnn_init_pre_transformed_desc(
-        layout,
-        ggml_zdnn_type_mapping(src->type),
-        &pre_tfm_desc,
-        ne[3], ne[2], ne[1], ne[0]
-    );
-
-    ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
-    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
-}
-
-inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor,
-                                  void * buffer) {
-    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
-}
-
-inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) {
-    switch (tensor->op) {
-        case GGML_OP_MUL_MAT:
-            {
-                zdnn_init_pre_transformed_desc(
-                    ZDNN_2D,
-                    ggml_zdnn_type_mapping(tensor->type),
-                    &buffer->pre_tfm_desc,
-                    tensor->ne[1], tensor->ne[0]
-                );
-            } break;
-
-        default:
-            {
-                // For 4D tensors, GGML uses NCHW layout. However, because zDNN
-                // automatically transforms everything to NHWC, we will use it
-                // directly to avoid the performance penalty changing the
-                // layout and reshaping the tensor.
-                zdnn_init_pre_transformed_desc(
-                    ZDNN_NHWC,
-                    ggml_zdnn_type_mapping(tensor->type),
-                    &buffer->pre_tfm_desc,
-                    tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]
-                );
-
-                // TODO: Consider adding a ggml check.
-                // TODO: If tensor = 4D, use ZDNN_NCHW by default.
-                // TODO: If tensor = 2D, use ZDNN_NHWC by default.
-            } break;
-    }
-
-    ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc));
-    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
-}
-
-static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_TENSOR_BINARY_OP_LOCALS;
-
-    const enum ggml_type type = src0->type;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    const ggml_tensor * weights = src0;
-    const ggml_tensor * inputs  = src1;
-          ggml_tensor * output  = dst;
-
-    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
-    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
-    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
-    ggml_backend_zdnn_buffer * bias_extra    = (ggml_backend_zdnn_buffer *)output_extra->extra;
-
-    const int64_t weights_rows = ne01;
-    const int64_t weights_cols = ne00;
-    const int64_t inputs_rows  = ne11;
-    const int64_t inputs_cols  = ne10;
-
-    assert(inputs_cols == weights_cols);
-
-    const int64_t output_rows = ne1;
-    const int64_t output_cols = ne0;
-
-    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
-    //               __func__, weights_extra->name,
-    //               weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
-    //               weights_extra->pre_tfm_desc.dim1,
-    //               weights_extra->pre_tfm_desc.dim2,
-    //               weights_extra->pre_tfm_desc.dim3,
-    //               weights_extra->pre_tfm_desc.dim4);
-
-    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
-    //               __func__, inputs_extra->name,
-    //               inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
-    //               inputs_extra->pre_tfm_desc.dim1,
-    //               inputs_extra->pre_tfm_desc.dim2,
-    //               inputs_extra->pre_tfm_desc.dim3,
-    //               inputs_extra->pre_tfm_desc.dim4);
-
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
-    GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
-    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
-
-    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
-                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
-    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
-    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
-
-    GGML_UNUSED(ctx);
-    GGML_UNUSED(weights_rows);
-    GGML_UNUSED(weights_cols);
-    GGML_UNUSED(inputs_rows);
-    GGML_UNUSED(inputs_cols);
-    GGML_UNUSED(output_rows);
-    GGML_UNUSED(output_cols);
-}
-
-static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // debug helpers
-    // GGML_LOG_INFO("%s: use_mul_mat_vec   = %d\n", __func__, use_mul_mat_vec);
-    // GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q);
-    // GGML_LOG_INFO("%s: use_mul_mat_q     = %d\n", __func__, use_mul_mat_q);
-    // GGML_LOG_INFO("%s: src0: %8d %8d %8d %8d\n", __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    // GGML_LOG_INFO("%s:       %8d %8d %8d %8d\n", __func__, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    // GGML_LOG_INFO("%s: src1: %8d %8d %8d %8d\n", __func__, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    // GGML_LOG_INFO("%s:       %8d %8d %8d %8d\n", __func__, src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    // GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    // GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
-}
-
-static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
+static void ggml_zdnn_compute_forward_mul_mat(
+    const ggml_backend_zdnn_context * ctx,
+          ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];  // weights
+    const ggml_tensor * src1 = dst->src[1];  // inputs
+
+    // TODO: implement support for quantized types
+    // we currently only support f32, f16, and bf16
+    ggml_zdnn_mul_mat_f(ctx, src0, src1, dst);
+}
+
+static bool ggml_zdnn_compute_forward(
+    ggml_backend_zdnn_context * ctx,
+    ggml_tensor * dst) {
     switch (dst->op) {
         case GGML_OP_MUL_MAT:
-            ggml_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst);
-            break;
+            {
+                ggml_zdnn_compute_forward_mul_mat(ctx, dst);
+            } break;
 
         default:
             return false;
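For context, a hedged sketch of the driver loop that typically sits above `ggml_zdnn_compute_forward` in a ggml backend's `graph_compute` callback; the function name and the skipped no-op list are assumptions modeled on other ggml backends, not code from this commit:

```cpp
// Hypothetical driver (modeled on other ggml backends such as Metal):
// walk the graph, skip nodes that need no computation, and dispatch
// each remaining op through the forward function above.
static enum ggml_status ggml_zdnn_graph_compute_sketch(
        ggml_backend_zdnn_context * ctx, ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; i++) {
        ggml_tensor * node = gf->nodes[i];

        switch (node->op) {
            case GGML_OP_NONE:
            case GGML_OP_RESHAPE:
            case GGML_OP_VIEW:
            case GGML_OP_PERMUTE:
            case GGML_OP_TRANSPOSE:
                continue;  // pure view/no-op nodes need no kernel
            default:
                break;
        }

        if (!ggml_zdnn_compute_forward(ctx, node)) {
            GGML_LOG_ERROR("%s: unsupported op %s\n", __func__, ggml_op_name(node->op));
            return GGML_STATUS_FAILED;
        }
    }
    return GGML_STATUS_SUCCESS;
}
```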
ggml/src/ggml-zdnn/mmf.cpp (new file, 80 lines)
@@ -0,0 +1,80 @@
#include "ggml.h"
#include "mmf.hpp"

void ggml_zdnn_mul_mat_f(
    const ggml_backend_zdnn_context * ctx,
    const ggml_tensor * src0,
    const ggml_tensor * src1,
          ggml_tensor * dst) {
    GGML_TENSOR_BINARY_OP_LOCALS;

    const enum ggml_type type = src0->type;

    GGML_ASSERT(ne0 == ne01);
    GGML_ASSERT(ne1 == ne11);
    GGML_ASSERT(ne2 == ne12);
    GGML_ASSERT(ne3 == ne13);

    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));
    GGML_ASSERT(nb10 == ggml_type_size(src1->type));

    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 == sizeof(float));
    GGML_ASSERT(nb0 <= nb1);
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);

    const ggml_tensor * weights = src0;
    const ggml_tensor * inputs  = src1;
          ggml_tensor * output  = dst;

    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
    ggml_backend_zdnn_buffer * bias_extra    = (ggml_backend_zdnn_buffer *)output_extra->extra;

    const int64_t weights_rows = ne01;
    const int64_t weights_cols = ne00;
    const int64_t inputs_rows  = ne11;
    const int64_t inputs_cols  = ne10;

    assert(inputs_cols == weights_cols);

    const int64_t output_rows = ne1;
    const int64_t output_cols = ne0;

    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
    //               __func__, weights_extra->name,
    //               weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
    //               weights_extra->pre_tfm_desc.dim1,
    //               weights_extra->pre_tfm_desc.dim2,
    //               weights_extra->pre_tfm_desc.dim3,
    //               weights_extra->pre_tfm_desc.dim4);

    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
    //               __func__, inputs_extra->name,
    //               inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
    //               inputs_extra->pre_tfm_desc.dim1,
    //               inputs_extra->pre_tfm_desc.dim2,
    //               inputs_extra->pre_tfm_desc.dim3,
    //               inputs_extra->pre_tfm_desc.dim4);

    GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
    GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");

    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));

    GGML_UNUSED(ctx);
    GGML_UNUSED(weights_rows);
    GGML_UNUSED(weights_cols);
    GGML_UNUSED(inputs_rows);
    GGML_UNUSED(inputs_cols);
    GGML_UNUSED(output_rows);
    GGML_UNUSED(output_cols);
}
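A note on the transpose flags above, since the shape bookkeeping is easy to misread; this is a reading of the call as written, not an addition from the commit:

```cpp
// Shape mapping for ggml's MUL_MAT on zDNN (reading of the call above):
//   weights (src0): ne00 = K (cols), ne01 = M (rows)
//   inputs  (src1): ne10 = K (cols), ne11 = N (rows); K must match
//   output  (dst):  ne0  = M,        ne1  = N
// zdnn_matmul_transpose_op(inputs, weights, bias,
//                          /*transpose_a=*/false, /*transpose_b=*/true,
//                          MATMUL_OP_ADDITION, output)
// therefore computes output[N, M] = inputs[N, K] * weights[M, K]^T + bias,
// which is exactly ggml's mul_mat convention dst = src1 . src0^T.
```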
ggml/src/ggml-zdnn/mmf.hpp (new file, 12 lines)
@@ -0,0 +1,12 @@
#ifndef GGML_ZDNN_MMF_HPP
#define GGML_ZDNN_MMF_HPP

#include "common.hpp"

void ggml_zdnn_mul_mat_f(
    const ggml_backend_zdnn_context * ctx,
    const ggml_tensor * src0,
    const ggml_tensor * src1,
          ggml_tensor * dst);

#endif  // GGML_ZDNN_MMF_HPP
ggml/src/ggml-zdnn/utils.cpp (new file, 79 lines)
@@ -0,0 +1,79 @@
#include "ggml.h"
#include "utils.hpp"

zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
            return FP32;
        case GGML_TYPE_F16:
            return FP16;
        case GGML_TYPE_BF16:
            return BFLOAT;
        case GGML_TYPE_Q8_0:
            return INT8;
        case GGML_TYPE_I8:
            return INT8;
        case GGML_TYPE_I32:
            return INT32;
        default:
            GGML_ABORT("%s: fatal: unable to determine zTensor data type",
                       __func__);
            break;
    }
}

void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
                             zdnn_tensor_desc & tfm_desc,
                             zdnn_ztensor & ztensor,
                             const ggml_tensor * src,
                             const int64_t * ne,
                             const zdnn_data_layouts layout) {
    zdnn_init_pre_transformed_desc(
        layout,
        ggml_zdnn_type_mapping(src->type),
        &pre_tfm_desc,
        ne[3], ne[2], ne[1], ne[0]
    );

    ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
}

void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, void * buffer) {
    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
}

void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) {
    switch (tensor->op) {
        case GGML_OP_MUL_MAT:
            {
                zdnn_init_pre_transformed_desc(
                    ZDNN_2D,
                    ggml_zdnn_type_mapping(tensor->type),
                    &buffer->pre_tfm_desc,
                    tensor->ne[1], tensor->ne[0]
                );
            } break;

        default:
            {
                // For 4D tensors, GGML uses NCHW layout. However, because zDNN
                // automatically transforms everything to NHWC, we will use it
                // directly to avoid the performance penalty changing the
                // layout and reshaping the tensor.
                zdnn_init_pre_transformed_desc(
                    ZDNN_NHWC,
                    ggml_zdnn_type_mapping(tensor->type),
                    &buffer->pre_tfm_desc,
                    tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]
                );

                // TODO: Consider adding a ggml check.
                // TODO: If tensor = 4D, use ZDNN_NCHW by default.
                // TODO: If tensor = 2D, use ZDNN_NHWC by default.
            } break;
    }

    ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc));
    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
}
ggml/src/ggml-zdnn/utils.hpp (new file, 19 lines)
@@ -0,0 +1,19 @@
#ifndef GGML_ZDNN_UTILITIES_HPP
#define GGML_ZDNN_UTILITIES_HPP

#include "common.hpp"

zdnn_data_types ggml_zdnn_type_mapping(ggml_type type);

void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
                             zdnn_tensor_desc & tfm_desc,
                             zdnn_ztensor & ztensor,
                             const ggml_tensor * src,
                             const int64_t * ne,
                             const zdnn_data_layouts layout);

void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, void * buffer);

void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor);

#endif  // GGML_ZDNN_UTILITIES_HPP