From e084821a3f07ef023102b2801f7d11ed2d06e287 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Fri, 18 Jul 2025 19:59:24 +0800
Subject: [PATCH] ggml-zdnn: initial backend impl

Signed-off-by: Aaron Teo

ggml-zdnn: temp change z17 to arch15

Signed-off-by: Aaron Teo

ggml-zdnn: fix build bugs

Signed-off-by: Aaron Teo
---
 ggml/CMakeLists.txt                 |   1 +
 ggml/include/ggml-zdnn.h            |  16 +
 ggml/src/CMakeLists.txt             |   1 +
 ggml/src/ggml-backend-reg.cpp       |   7 +
 ggml/src/ggml-cpu/CMakeLists.txt    |   2 +-
 ggml/src/ggml-zdnn/CMakeLists.txt   |  38 ++
 ggml/src/ggml-zdnn/ggml-zdnn-impl.h |  59 +++
 ggml/src/ggml-zdnn/ggml-zdnn.cpp    | 622 ++++++++++++++++++++++++++
 ggml/src/ggml-zdnn/zdnn.h           | 660 ++++++++++++++++++++++++++++
 9 files changed, 1405 insertions(+), 1 deletion(-)
 create mode 100644 ggml/include/ggml-zdnn.h
 create mode 100644 ggml/src/ggml-zdnn/CMakeLists.txt
 create mode 100644 ggml/src/ggml-zdnn/ggml-zdnn-impl.h
 create mode 100644 ggml/src/ggml-zdnn/ggml-zdnn.cpp
 create mode 100644 ggml/src/ggml-zdnn/zdnn.h

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index de6d789c98..6819a98b2b 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -183,6 +183,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS    "ggml: run Vulkan tests"            OFF)
 option(GGML_WEBGPU              "ggml: use WebGPU"                  OFF)
 option(GGML_WEBGPU_DEBUG        "ggml: enable WebGPU debug output"  OFF)
+option(GGML_ZDNN                "ggml: use zDNN"                    OFF)
 option(GGML_METAL               "ggml: use Metal"                   ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16      "ggml: use bfloat if available"     OFF)
 option(GGML_METAL_NDEBUG        "ggml: disable Metal debugging"     OFF)
diff --git a/ggml/include/ggml-zdnn.h b/ggml/include/ggml-zdnn.h
new file mode 100644
index 0000000000..c2c30c977c
--- /dev/null
+++ b/ggml/include/ggml-zdnn.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
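+
+// Minimal usage sketch, assuming ggml was configured with -DGGML_ZDNN=ON and
+// libzdnn is installed (error handling omitted):
+//
+//     ggml_backend_t backend = ggml_backend_zdnn_init();
+//     // ... allocate buffers, then run graphs via ggml_backend_graph_compute()
+//     ggml_backend_free(backend);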
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 0425fd60a9..a31f423689 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -371,6 +371,7 @@ ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
 ggml_add_backend(WebGPU)
+ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 
 foreach (target ggml-base ggml)
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index f0cdac31ea..a7025d49d3 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -49,6 +49,10 @@
 #include "ggml-webgpu.h"
 #endif
 
+#ifdef GGML_USE_ZDNN
+#include "ggml-zdnn.h"
+#endif
+
 #ifdef GGML_USE_OPENCL
 #include "ggml-opencl.h"
 #endif
@@ -180,6 +184,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_WEBGPU
         register_backend(ggml_backend_webgpu_reg());
 #endif
+#ifdef GGML_USE_ZDNN
+        register_backend(ggml_backend_zdnn_reg());
+#endif
 #ifdef GGML_USE_OPENCL
         register_backend(ggml_backend_opencl_reg());
 #endif
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 66a5ad8d2e..35b58b5f73 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -457,7 +457,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         elseif (${S390X_M} MATCHES "9175|9176")
             # NOTE: Only available from GCC 15.1.0 onwards. Any z17 machine with compile issues must first verify their GCC version.
             message(STATUS "z17 target")
-            list(APPEND ARCH_FLAGS -march=z17)
+            list(APPEND ARCH_FLAGS -march=arch15)
         else()
             message(STATUS "Unknown target")
             message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
diff --git a/ggml/src/ggml-zdnn/CMakeLists.txt b/ggml/src/ggml-zdnn/CMakeLists.txt
new file mode 100644
index 0000000000..b6675e1718
--- /dev/null
+++ b/ggml/src/ggml-zdnn/CMakeLists.txt
@@ -0,0 +1,38 @@
+if (GGML_ZDNN)
+    if (DEFINED ZDNN_ROOT)
+        message(STATUS "zdnn: using ZDNN_ROOT override: ${ZDNN_ROOT}")
+        set(ZDNN_HINT "${ZDNN_ROOT}")
+    else()
+        set(ZDNN_HINT "")
+    endif()
+
+    find_path(ZDNN_INCLUDE
+        NAMES zdnn.h
+        HINTS ${ZDNN_HINT} /usr /usr/local
+        PATH_SUFFIXES include)
+    if (ZDNN_INCLUDE)
+        message(STATUS "zdnn: found include: ${ZDNN_INCLUDE}")
+    else()
+        message(FATAL_ERROR "zdnn: include directory not found, please set ZDNN_ROOT to the proper path if necessary")
+    endif()
+
+    find_library(ZDNN_LIB
+        NAMES zdnn
+        HINTS ${ZDNN_HINT} /usr /usr/local
+        PATH_SUFFIXES lib lib64)
+    if (ZDNN_LIB)
+        message(STATUS "zdnn: found library: ${ZDNN_LIB}")
+    else()
+        message(FATAL_ERROR "zdnn: library not found, please set ZDNN_ROOT to the proper path if necessary")
+    endif()
+
+    file(GLOB GGML_SOURCES_ZDNN "*.c" "*.cpp")
+    file(GLOB GGML_HEADERS_ZDNN "*.h" "*.hpp")
+
+    ggml_add_backend_library(ggml-zdnn ${GGML_HEADERS_ZDNN} ${GGML_SOURCES_ZDNN})
+    target_link_libraries(ggml-zdnn PRIVATE ${ZDNN_LIB})
+    target_include_directories(ggml-zdnn PRIVATE ${ZDNN_INCLUDE})
+    target_link_directories(ggml-zdnn PRIVATE ${ZDNN_LIB})
+
+    target_compile_definitions(ggml-zdnn PRIVATE GGML_ZDNN GGML_USE_ZDNN)
+endif()
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn-impl.h b/ggml/src/ggml-zdnn/ggml-zdnn-impl.h
new file mode 100644
index 0000000000..bf9e19a8b0
--- /dev/null
+++ b/ggml/src/ggml-zdnn/ggml-zdnn-impl.h
@@ -0,0 +1,59 @@
+#ifndef GGML_ZDNN_IMPL
+#define GGML_ZDNN_IMPL
+
+#include "zdnn.h"
+#include "ggml.h"
+#include "ggml-zdnn.h"
+
+#include <stdint.h>    // fixed-width element types for the vector typedefs below
+#include <vecintrin.h> // vec_slb/vec_srb used by vec_slo/vec_sro (s390x vector intrinsics)
+
+#define GGML_ZDNN_NAME    "zDNN"
+#define GGML_ZDNN_VERSION ZDNN_VERNUM
+
+#define vec_neg(a)    (-(a))                // Vector Negate
+#define vec_add(a, b) ((a) + (b))           // Vector Add
+#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
+#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
+#define vec_div(a, b) ((a) / (b))           // Vector Divide
+#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
+#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right Algebraic
+#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right
+#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
+#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
+
+#ifndef vec_and
+#define vec_and(a, b) ((a) & (b)) // Vector AND
+#endif
+
+#ifndef vec_or
+#define vec_or(a, b)  ((a) | (b)) // Vector OR
+#endif
+
+#ifndef vec_xor
+#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
+#endif
+
+typedef signed   char char8x16_t  __attribute__((vector_size(16)));
+typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
+
+typedef int8_t   int8x16_t  __attribute__((vector_size(16)));
+typedef int16_t  int16x8_t  __attribute__((vector_size(16)));
+typedef int32_t  int32x4_t  __attribute__((vector_size(16)));
+typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
+typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
+typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
+
+typedef float  float32x4_t  __attribute__((vector_size(16)));
+typedef double double64x2_t __attribute__((vector_size(16)));
+
+typedef signed   long long long64x2_t  __attribute__((vector_size(16)));
+typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
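+
+// ZDNN_CHECK wraps a zdnn_* library call and hard-asserts on any status other
+// than ZDNN_OK; warning statuses such as ZDNN_ELEMENT_RANGE_VIOLATION are
+// treated as fatal here as well.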
+#define ZDNN_CHECK(stmt)                  \
+    do {                                  \
+        zdnn_status status = (stmt);      \
+        GGML_ASSERT(status == ZDNN_OK);   \
+    } while (0)
+
+#endif // GGML_ZDNN_IMPL
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
new file mode 100644
index 0000000000..1b1dc3f046
--- /dev/null
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -0,0 +1,622 @@
+#include "zdnn.h"
+#include "ggml-zdnn.h"
+#include "ggml-zdnn-impl.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+#include <cstring> // memcpy, memset, strcmp
+#include <cstdlib> // calloc, free
+
+struct zdnn_extra {
+    zdnn_tensor_desc pre_tfm_desc;
+    zdnn_tensor_desc tfm_desc;
+    zdnn_ztensor     ztensor;
+
+    struct zdnn_extra * extra; // for bias, etc.
+};
+
+struct ggml_backend_zdnn_context {
+    int n_threads = GGML_DEFAULT_N_THREADS;
+};
+
+inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_F32:
+            return FP32;
+        case GGML_TYPE_F16:
+            return FP16;
+        case GGML_TYPE_BF16:
+            return BFLOAT;
+        case GGML_TYPE_I8:
+            return INT8;
+        case GGML_TYPE_I32:
+            return INT32;
+        case GGML_TYPE_Q8_0:
+            return INT8;
+        default:
+            GGML_ABORT("%s: fatal: unable to determine zTensor data type",
+                       __func__);
+            break;
+    }
+}
+
+inline void ggml_zdnn_create_tensor(zdnn_tensor_desc  & pre_tfm_desc,
+                                    zdnn_tensor_desc  & tfm_desc,
+                                    zdnn_ztensor      & ztensor,
+                                    const ggml_tensor * src,
+                                    const int64_t     * ne,
+                                    const zdnn_data_layouts layout) {
+    zdnn_init_pre_transformed_desc(
+        layout,
+        ggml_zdnn_type_mapping(src->type),
+        &pre_tfm_desc,
+        (uint32_t)ne[3], (uint32_t)ne[2], (uint32_t)ne[1], (uint32_t)ne[0]
+    );
+
+    ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
+    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
+}
+
+inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor,
+                                  void         * buffer) {
+    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
+}
+
+static void ggml_backend_zdnn_mul_mat(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    GGML_ASSERT(ne0 == ne01);
+    GGML_ASSERT(ne1 == ne11);
+    GGML_ASSERT(ne2 == ne12);
+    GGML_ASSERT(ne3 == ne13);
+
+    // we don't support permuted src0 or src1
+    GGML_ASSERT(nb00 == ggml_type_size(type));
+    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    const ggml_tensor * weights = src0;
+    const ggml_tensor * inputs  = src1;
+          ggml_tensor * output  = dst;
+
+    zdnn_tensor_desc pre_tfm_desc_weights, tfm_desc_weights;
+    zdnn_tensor_desc pre_tfm_desc_inputs,  tfm_desc_inputs;
+    zdnn_tensor_desc pre_tfm_desc_bias,    tfm_desc_bias;
+    zdnn_tensor_desc pre_tfm_desc_output,  tfm_desc_output;
+
+    zdnn_ztensor ztensor_weights, ztensor_inputs, ztensor_bias, ztensor_output;
+
+    const int64_t weights_rows = ne01;
+    const int64_t weights_cols = ne00;
+    const int64_t inputs_rows  = ne11;
+    const int64_t inputs_cols  = ne10;
+
+    GGML_ASSERT(inputs_cols == weights_cols);
+
+    const int64_t output_rows = dst->ne[1];
+    const int64_t output_cols = dst->ne[0];
+
+    const int64_t inputs_dim [GGML_MAX_DIMS] = { 1, 1, inputs_cols,  inputs_rows  };
+    const int64_t weights_dim[GGML_MAX_DIMS] = { 1, 1, weights_cols, weights_rows };
+    const int64_t bias_dim   [GGML_MAX_DIMS] = { 1, 1, 1,            output_cols  };
+    const int64_t output_dim [GGML_MAX_DIMS] = { 1, 1, output_cols,  output_rows  };
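+
+    // zDNN computes output = inputs x weights^T here (transpose_a = false,
+    // transpose_b = true): ggml stores src0 (weights) row-major with
+    // ne = [cols, rows], and dst is [ne11 x ne01]. zdnn_matmul_transpose_op
+    // takes a mandatory third operand, so a zero-filled bias combined with
+    // MATMUL_OP_ADDITION yields a plain matrix multiplication.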
+    ggml_zdnn_create_tensor(pre_tfm_desc_inputs,  tfm_desc_inputs,  ztensor_inputs,  src1, inputs_dim,  ZDNN_2D);
+    ggml_zdnn_create_tensor(pre_tfm_desc_weights, tfm_desc_weights, ztensor_weights, src0, weights_dim, ZDNN_2D);
+    ggml_zdnn_create_tensor(pre_tfm_desc_bias,    tfm_desc_bias,    ztensor_bias,    dst,  bias_dim,    ZDNN_1D);
+    ggml_zdnn_create_tensor(pre_tfm_desc_output,  tfm_desc_output,  ztensor_output,  dst,  output_dim,  ZDNN_2D);
+
+    void * bias_data = calloc(output_cols, ggml_element_size(dst));
+
+    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor_weights, weights->data));
+    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor_inputs,  inputs->data));
+    ZDNN_CHECK(zdnn_transform_ztensor(&ztensor_bias,    bias_data));
+
+    ZDNN_CHECK(zdnn_matmul_transpose_op(&ztensor_inputs, &ztensor_weights, &ztensor_bias,
+                                        false, true, MATMUL_OP_ADDITION, &ztensor_output));
+    ZDNN_CHECK(zdnn_transform_origtensor(&ztensor_output, output->data));
+
+    ZDNN_CHECK(zdnn_free_ztensor_buffer(&ztensor_weights));
+    ZDNN_CHECK(zdnn_free_ztensor_buffer(&ztensor_inputs));
+    ZDNN_CHECK(zdnn_free_ztensor_buffer(&ztensor_bias));
+    ZDNN_CHECK(zdnn_free_ztensor_buffer(&ztensor_output));
+
+    free(bias_data);
+}
+
+static void ggml_backend_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_UNUSED(ctx);
+
+    bool use_mul_mat_vec =
+        (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
+        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+
+    bool use_mul_mat_vec_q =
+        ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    bool use_mul_mat_q =
+        ggml_is_quantized(src0->type)
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    // debug helpers
+    // GGML_LOG_INFO("%s: use_mul_mat_vec   = %d\n", __func__, use_mul_mat_vec);
+    // GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q);
+    // GGML_LOG_INFO("%s: use_mul_mat_q     = %d\n", __func__, use_mul_mat_q);
+    // GGML_LOG_INFO("%s: src0: %8d %8d %8d %8d\n", __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    // GGML_LOG_INFO("%s:       %8d %8d %8d %8d\n", __func__, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    // GGML_LOG_INFO("%s: src1: %8d %8d %8d %8d\n", __func__, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    // GGML_LOG_INFO("%s:       %8d %8d %8d %8d\n", __func__, src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    // GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    // GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+    if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16
+            && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
+            && src1->ne[2] * src1->ne[3] > 1) {
+        // general KQ + KQV multi-batch
+        GGML_LOG_INFO("%s: using zdnn_mul_mat_batched for KQ + KQV multi-batch\n", __func__);
+        // ggml_zdnn_mul_mat_batched(ctx, src0, src1, dst);
+    } else if (use_mul_mat_vec) {
+        GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec for vector multiplication\n", __func__);
+        // ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec, nullptr);
+    } else if (use_mul_mat_vec_q) {
+        GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec_q for quantized vector multiplication\n", __func__);
+        // ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec_q, ggml_zdnn_quantize_row_q8_1);
+    } else if (use_mul_mat_q) {
+        GGML_LOG_INFO("%s: using zdnn_op_mul_mat_q for quantized matrix multiplication\n", __func__);
+        // ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_q, ggml_zdnn_quantize_mmq_q8_1);
+    } else {
+        // GGML_LOG_INFO("%s: using zdnn_op_mul_mat for general matrix multiplication\n", __func__);
+        ggml_backend_zdnn_mul_mat(ctx, src0, src1, dst);
+    }
+}
+
+static bool ggml_backend_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
+    switch (dst->op) {
+        case GGML_OP_MUL_MAT:
+            ggml_backend_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst);
+            break;
+
+        default:
+            return false;
+    }
+
+    return true;
+}
+
+static const char * ggml_backend_zdnn_get_name(ggml_backend_t backend) {
+    return GGML_ZDNN_NAME;
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_zdnn_free(ggml_backend_t backend) {
+    ggml_backend_zdnn_context * ctx = (ggml_backend_zdnn_context *)backend->context;
+    delete ctx;
+    delete backend;
+}
+
+static ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_zdnn_context * ctx = (ggml_backend_zdnn_context *)backend->context;
+
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        if (ggml_is_empty(node)
+            || node->op == GGML_OP_NONE
+            || node->op == GGML_OP_RESHAPE
+            || node->op == GGML_OP_VIEW
+            || node->op == GGML_OP_PERMUTE
+            || node->op == GGML_OP_TRANSPOSE) {
+            continue;
+        }
+
+        bool ok = ggml_backend_zdnn_compute_forward(ctx, node);
+        if (!ok) {
+            GGML_LOG_ERROR("%s: unsupported op %s (%s)\n",
+                           __func__, node->name, ggml_op_name(node->op));
+        }
+
+        GGML_ASSERT(ok);
+    }
+
+    return GGML_STATUS_SUCCESS;
+}
+
+static ggml_backend_i ggml_backend_zdnn_i = {
+    /* .get_name           = */ ggml_backend_zdnn_get_name,
+    /* .free               = */ ggml_backend_zdnn_free,
+    /* .set_tensor_async   = */ NULL,
+    /* .get_tensor_async   = */ NULL,
+    /* .cpy_tensor_async   = */ NULL,
+    /* .synchronize        = */ NULL,
+    /* .graph_plan_create  = */ NULL,
+    /* .graph_plan_free    = */ NULL,
+    /* .graph_plan_update  = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute      = */ ggml_backend_zdnn_graph_compute,
+    /* .event_record       = */ NULL,
+    /* .event_wait         = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_zdnn_guid(void) {
+    // guid spells out IBM-NNPA-ACCELER
+    static ggml_guid guid = { 0x49, 0x42, 0x4D, 0x2D, 0x4E, 0x4E, 0x50, 0x41,
+                              0x2D, 0x41, 0x43, 0x43, 0x45, 0x4C, 0x45, 0x52 };
+
+    return &guid;
+}
+
+ggml_backend_t ggml_backend_zdnn_init(void) {
+    ggml_backend_zdnn_context * ctx = new ggml_backend_zdnn_context;
+
+    ggml_backend_t backend = new ggml_backend {
+        /* .guid    = */ ggml_backend_zdnn_guid(),
+        /* .iface   = */ ggml_backend_zdnn_i,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_zdnn_reg(), 0),
+        /* .context = */ ctx,
+    };
+
+    return backend;
+}
+
+bool ggml_backend_is_zdnn(ggml_backend_t backend) {
+    return backend != NULL &&
+           ggml_guid_matches(backend->guid, ggml_backend_zdnn_guid());
+}
+
+void ggml_backend_zdnn_set_n_threads(ggml_backend_t backend_zdnn, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_zdnn(backend_zdnn));
+
+    ggml_backend_zdnn_context * ctx = (ggml_backend_zdnn_context *)backend_zdnn->context;
+    ctx->n_threads = n_threads;
+}
+
+static const char * ggml_backend_zdnn_device_get_name(ggml_backend_dev_t dev) {
+    return GGML_ZDNN_NAME;
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_zdnn_device_get_description(ggml_backend_dev_t dev) {
+    return GGML_ZDNN_NAME;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_zdnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    *free  = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_zdnn_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_zdnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
+    props->name        = ggml_backend_zdnn_device_get_name(dev);
+    props->description = ggml_backend_zdnn_device_get_description(dev);
+    props->type        = ggml_backend_zdnn_device_get_type(dev);
+    ggml_backend_zdnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async                = */ false,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_zdnn_device_init_backend(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_zdnn_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static void * ggml_backend_zdnn_buffer_get_base(ggml_backend_buffer_t buffer) {
+    uintptr_t data = (uintptr_t)buffer->context;
+    if (data % 256 != 0) {
+        data = GGML_PAD(data, 256);
+    }
+
+    return (void *)data;
+}
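+
+// Every non-view tensor placed in a zDNN buffer gets a zdnn_extra: the generic
+// (pre-transformed) descriptor, the zAIU (transformed) descriptor, and a
+// zTensor allocated for the stickified data. GGML_OP_MUL_MAT outputs get a
+// second zdnn_extra chained via extra->extra for the zero bias that
+// zdnn_matmul_transpose_op requires.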
+static ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    if (tensor->view_src != NULL) {
+        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
+        return GGML_STATUS_SUCCESS;
+    }
+
+    zdnn_extra * extra = (zdnn_extra *)malloc(sizeof(zdnn_extra));
+    extra->extra = nullptr; // no chained bias tensor unless allocated below
+
+    const int64_t dims[GGML_MAX_DIMS] = { 1, 1, tensor->ne[0], tensor->ne[1] };
+
+    zdnn_init_pre_transformed_desc(
+        ZDNN_2D,
+        ggml_zdnn_type_mapping(tensor->type),
+        &extra->pre_tfm_desc,
+        (uint32_t)dims[3], (uint32_t)dims[2], (uint32_t)dims[1], (uint32_t)dims[0]
+    );
+
+    ZDNN_CHECK(zdnn_generate_transformed_desc(&extra->pre_tfm_desc, &extra->tfm_desc));
+    ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&extra->pre_tfm_desc, &extra->tfm_desc, &extra->ztensor));
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        zdnn_extra * bias_extra = (zdnn_extra *)malloc(sizeof(zdnn_extra));
+        bias_extra->extra = nullptr;
+
+        const int64_t bias_dims[GGML_MAX_DIMS] = { 1, 1, 1, tensor->ne[0] };
+
+        zdnn_init_pre_transformed_desc(
+            ZDNN_1D,
+            ggml_zdnn_type_mapping(tensor->type),
+            &bias_extra->pre_tfm_desc,
+            (uint32_t)bias_dims[3], (uint32_t)bias_dims[2], (uint32_t)bias_dims[1], (uint32_t)bias_dims[0]
+        );
+        ZDNN_CHECK(zdnn_generate_transformed_desc(&bias_extra->pre_tfm_desc, &bias_extra->tfm_desc));
+        ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&bias_extra->pre_tfm_desc, &bias_extra->tfm_desc, &bias_extra->ztensor));
+
+        extra->extra = bias_extra;
+    }
+
+    tensor->extra = extra;
+
+    return GGML_STATUS_SUCCESS;
+}
+
+static void ggml_backend_zdnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_aligned_free(buffer->context, buffer->size);
+}
+
+static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+    memset((char *)tensor->data + offset, value, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_zdnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    memcpy((char *)tensor->data + offset, data, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_zdnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_zdnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
+static const ggml_backend_buffer_i ggml_backend_zdnn_buffer_i = {
+    /* .free_buffer   = */ ggml_backend_zdnn_buffer_free_buffer, // buffer base is owned and freed here
+    /* .get_base      = */ ggml_backend_zdnn_buffer_get_base,
+    /* .init_tensor   = */ ggml_backend_zdnn_buffer_init_tensor,
+    /* .memset_tensor = */ ggml_backend_zdnn_buffer_memset_tensor,
+    /* .set_tensor    = */ ggml_backend_zdnn_buffer_set_tensor,
+    /* .get_tensor    = */ ggml_backend_zdnn_buffer_get_tensor,
+    /* .cpy_tensor    = */ NULL,
+    /* .clear         = */ ggml_backend_zdnn_buffer_clear,
+    /* .reset         = */ NULL,
+};
+
+static const ggml_backend_buffer_i ggml_backend_zdnn_buffer_from_ptr_i = {
+    /* .free_buffer   = */ NULL, // ptr is not owned by the buffer
+    /* .get_base      = */ ggml_backend_zdnn_buffer_get_base,
+    /* .init_tensor   = */ ggml_backend_zdnn_buffer_init_tensor,
+    /* .memset_tensor = */ ggml_backend_zdnn_buffer_memset_tensor,
+    /* .set_tensor    = */ ggml_backend_zdnn_buffer_set_tensor,
+    /* .get_tensor    = */ ggml_backend_zdnn_buffer_get_tensor,
+    /* .cpy_tensor    = */ NULL,
+    /* .clear         = */ ggml_backend_zdnn_buffer_clear,
+    /* .reset         = */ NULL,
+};
+
+static const char * ggml_backend_zdnn_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return GGML_ZDNN_NAME;
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_zdnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * data = ggml_aligned_malloc(size);
+    if (data == NULL) {
+        GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
+        return NULL;
+    }
+
+    return ggml_backend_buffer_init(buft, ggml_backend_zdnn_buffer_i, data, size);
+}
+
+static size_t ggml_backend_zdnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 256;
+
+    GGML_UNUSED(buft);
+}
+
+static bool ggml_backend_zdnn_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_zdnn_device_get_buffer_type(ggml_backend_dev_t dev) {
+    static ggml_backend_buffer_type ggml_backend_zdnn_buffer_type = {
+        /* .iface = */ {
+            /* .get_name       = */ ggml_backend_zdnn_buffer_type_get_name,
+            /* .alloc_buffer   = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
+            /* .get_alignment  = */ ggml_backend_zdnn_buffer_type_get_alignment,
+            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host        = */ ggml_backend_zdnn_buffer_type_is_host,
+        },
+        /* .device  = */ NULL,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_zdnn_buffer_type;
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_zdnn_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+    return GGML_ZDNN_NAME "_Mapped";
+
+    GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_from_ptr_type(void) {
+    static ggml_backend_buffer_type ggml_backend_zdnn_buffer_type = {
+        /* .iface = */ {
+            /* .get_name       = */ ggml_backend_zdnn_buffer_from_ptr_type_get_name,
+            /* .alloc_buffer   = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
+            /* .get_alignment  = */ ggml_backend_zdnn_buffer_type_get_alignment,
+            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+            /* .is_host        = */ ggml_backend_zdnn_buffer_type_is_host,
+        },
+        /* .device  = */ NULL,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_zdnn_buffer_type;
+}
+
+static ggml_backend_buffer_t ggml_backend_zdnn_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    GGML_ASSERT((uintptr_t)ptr % 256 == 0 && "buffer pointer must be aligned");
+    return ggml_backend_buffer_init(ggml_backend_zdnn_buffer_from_ptr_type(), ggml_backend_zdnn_buffer_from_ptr_i, ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT:
+            {
+                const ggml_tensor * src0 = op->src[0];
+                const ggml_tensor * src1 = op->src[1];
+
+                const int64_t ne10 = src1->ne[0];
+
+                const int64_t ne0 = op->ne[0];
+                const int64_t ne1 = op->ne[1];
+
+                const int64_t max_batch = zdnn_get_nnpa_max_dim_idx_size();
+
+                return ggml_is_contiguous(src0) &&
+                       ggml_is_contiguous(src1) &&
+                       src1->type == GGML_TYPE_F32 &&
+                       (ne0 <= max_batch && ne1 <= max_batch && ne10 <= max_batch) &&
+                       (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
+            }
+
+        default:
+            return false;
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_zdnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name;
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_device_i ggml_backend_zdnn_device_i = {
+    /* .get_name             = */ ggml_backend_zdnn_device_get_name,
+    /* .get_description      = */ ggml_backend_zdnn_device_get_description,
+    /* .get_memory           = */ ggml_backend_zdnn_device_get_memory,
+    /* .get_type             = */ ggml_backend_zdnn_device_get_type,
+    /* .get_props            = */ ggml_backend_zdnn_device_get_props,
+    /* .init_backend         = */ ggml_backend_zdnn_device_init_backend,
+    /* .get_buffer_type      = */ ggml_backend_zdnn_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_zdnn_device_buffer_from_host_ptr,
+    /* .supports_op          = */ ggml_backend_zdnn_device_supports_op,
+    /* .supports_buft        = */ ggml_backend_zdnn_device_supports_buft,
+    /* .offload_op           = */ NULL,
+    /* .event_new            = */ NULL,
+    /* .event_free           = */ NULL,
+    /* .event_synchronize    = */ NULL,
+};
+
+//
+// backend registry
+//
+
+static const char * ggml_backend_zdnn_reg_get_name(ggml_backend_reg_t reg) {
+    return GGML_ZDNN_NAME;
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_zdnn_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_zdnn_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_device ggml_backend_zdnn_device = {
+        /* .iface   = */ ggml_backend_zdnn_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_zdnn_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static void * ggml_backend_zdnn_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_zdnn_set_n_threads;
+    }
+    return NULL;
+
+    GGML_UNUSED(reg);
+}
+
+static const ggml_backend_reg_i ggml_backend_zdnn_reg_i = {
+    /* .get_name         = */ ggml_backend_zdnn_reg_get_name,
+    /* .get_device_count = */ ggml_backend_zdnn_reg_get_device_count,
+    /* .get_device       = */ ggml_backend_zdnn_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_zdnn_get_proc_address,
+};
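+
+// NOTE: the registry's .api_version must be GGML_BACKEND_API_VERSION (the ggml
+// backend ABI version checked when a backend is loaded dynamically), not the
+// zDNN library version from ZDNN_VERNUM.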
+ggml_backend_reg_t ggml_backend_zdnn_reg(void) {
+    static ggml_backend_reg ggml_backend_zdnn_reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_zdnn_reg_i,
+        /* .context     = */ NULL,
+    };
+
+    return &ggml_backend_zdnn_reg;
+}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_zdnn_reg)
diff --git a/ggml/src/ggml-zdnn/zdnn.h b/ggml/src/ggml-zdnn/zdnn.h
new file mode 100644
index 0000000000..0c9cacc0bd
--- /dev/null
+++ b/ggml/src/ggml-zdnn/zdnn.h
@@ -0,0 +1,660 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright IBM Corp. 2021, 2024
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ZDNN_ZDNN_H_
+#define ZDNN_ZDNN_H_
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// -----------------------------------------------------------------------------
+// NOTE:
+// Ensure that symbols in zdnn.h and zdnn.map are in sync!
+// Please also have a look at zdnn.map how to add, update or remove a symbol.
+// -----------------------------------------------------------------------------
+
+// -----------------------------------------------------------------------------
+// Initializer and global variables
+// -----------------------------------------------------------------------------
+
+void zdnn_init();
+
+// -----------------------------------------------------------------------------
+// zDNN Status
+// -----------------------------------------------------------------------------
+
+// -----------------------------------------------------------------------------
+// NOTE:
+// Update status.c and zdnn_private.h after any status modification!
+// -----------------------------------------------------------------------------
+
+// Status categories
+#define ZDNN_WARNING         0x00020000
+#define ZDNN_PARAMETER_ERROR 0x00040000
+#define ZDNN_DATA_ERROR      0x00100000
+#define ZDNN_HW_ERROR        0x000c0000
+
+// clang-format off
+typedef enum zdnn_status {
+  // ----------------------------------------------------------------
+  ZDNN_OK = 0x00000000, // Success.
+  // ----------------------------------------------------------------
+  ZDNN_ELEMENT_RANGE_VIOLATION = ZDNN_WARNING + 0x0001, // zAIU operation resulted in data that was out of the normal range.
+  // ----------------------------------------------------------------
+  ZDNN_INVALID_SHAPE = ZDNN_PARAMETER_ERROR + 0x0001, // Invalid shape information in one (or more) of the input/output tensor(s).
+  ZDNN_INVALID_LAYOUT,            // Invalid layout information in one (or more) of the input/output tensor(s).
+  ZDNN_INVALID_TYPE,              // Invalid type information in one (or more) of the input/output tensor(s).
+  ZDNN_INVALID_FORMAT,            // Invalid format information in one (or more) of the input/output tensor(s).
+  ZDNN_INVALID_DIRECTION,         // Invalid RNN direction.
+  ZDNN_INVALID_CONCAT_INFO,       // Invalid concatenation info.
+  ZDNN_INVALID_STRIDE_PADDING,    // Invalid padding type parameter for current strides.
+  ZDNN_INVALID_STRIDES,           // Invalid stride height or width parameter.
+  ZDNN_MISALIGNED_PARMBLOCK,      // NNPA parameter block is not on double word boundary.
+  ZDNN_INVALID_CLIPPING_VALUE,    // Invalid clipping for the specified operation.
+  ZDNN_INVALID_ADJUSTMENT_FACTOR, // Invalid adjustment for the specified operation.
+  ZDNN_INVALID_EPSILON,           // Invalid epsilon for the specified operation.
+  ZDNN_INVALID_TRANSFORM_TYPE,    // Invalid transformation type.
+  ZDNN_INVALID_BETA,              // Invalid beta value for the specified operation.
+  ZDNN_INVALID_GAMMA,             // Invalid gamma value for the specified operation.
+  ZDNN_INVALID_BESSEL_CORRECTION, // Invalid bessel correction value for the specified operation.
+  ZDNN_INVALID_SCALE,             // Invalid scale value for the specified operation.
+  ZDNN_INVALID_OFFSET,            // Invalid offset value for the specified operation.
+  // ----------------------------------------------------------------
+  ZDNN_ALLOCATION_FAILURE = ZDNN_DATA_ERROR + 0x0001, // Can not allocate storage.
+  ZDNN_INVALID_BUFFER,            // Buffer address is NULL or not on 4K-byte boundary, or insufficient buffer size.
+  ZDNN_CONVERT_FAILURE,           // Floating point data conversion failure.
+  ZDNN_INVALID_STATE,             // Invalid zTensor state.
+  ZDNN_UNSUPPORTED_AIU_EXCEPTION, // zAIU operation returned an unexpected exception.
+  // ----------------------------------------------------------------
+  ZDNN_UNSUPPORTED_PARMBLOCK = ZDNN_HW_ERROR + 0x0001, // NNPA parameter block format is not supported by the model.
+  ZDNN_UNAVAILABLE_FUNCTION,      // Specified NNPA function is not defined or installed on the machine.
+  ZDNN_UNSUPPORTED_FORMAT = ZDNN_HW_ERROR + 0x0010, // Specified tensor data layout format is not supported.
+  ZDNN_UNSUPPORTED_TYPE,          // Specified tensor data type is not supported.
+  ZDNN_EXCEEDS_MDIS,              // Tensor dimension exceeds maximum dimension index size (MDIS).
+  ZDNN_EXCEEDS_MTS,               // Total number of elements in tensor exceeds maximum tensor size (MTS).
+  ZDNN_MISALIGNED_TENSOR,         // Tensor address is not on 4K-byte boundary.
+  ZDNN_MISALIGNED_SAVEAREA,       // Function specific save area address is not on 4K-byte boundary.
+  // ----------------------------------------------------------------
+  // Function specific response code (F00x)
+  ZDNN_FUNC_RC_F000 = ZDNN_HW_ERROR + 0xF000, // Function specific response code (F000).
+  ZDNN_FUNC_RC_F001,              // Function specific response code (F001).
+  ZDNN_FUNC_RC_F002,              // Function specific response code (F002).
+  ZDNN_FUNC_RC_F003,              // Function specific response code (F003).
+  ZDNN_FUNC_RC_F004,              // Function specific response code (F004).
+  ZDNN_FUNC_RC_F005,              // Function specific response code (F005).
+  ZDNN_FUNC_RC_F006,              // Function specific response code (F006).
+  ZDNN_FUNC_RC_F007,              // Function specific response code (F007).
+  ZDNN_FUNC_RC_F008,              // Function specific response code (F008).
+  ZDNN_FUNC_RC_F009,              // Function specific response code (F009).
+
+  // ----------------------------------------------------------------
+} zdnn_status;
+// clang-format on
+
+// -----------------------------------------------------------------------------
+// NNPA hardware defined values as described in
+// z/Architecture - Principles of Operation
+// -----------------------------------------------------------------------------
+
+typedef enum nnpa_function_code {
+  NNPA_QAF = 0,
+  NNPA_ADD = 16,
+  NNPA_SUB = 17,
+  NNPA_MUL = 18,
+  NNPA_DIV = 19,
+  NNPA_MIN = 20,
+  NNPA_MAX = 21,
+  NNPA_LOG = 32,
+  NNPA_EXP = 33,
+  NNPA_SQRT = 34,
+  NNPA_INVSQRT = 35,
+  // reserved = 48
+  NNPA_RELU = 49,
+  NNPA_TANH = 50,
+  NNPA_SIGMOID = 51,
+  NNPA_SOFTMAX = 52,
+  NNPA_GELU = 53,
+  NNPA_BATCHNORMALIZATION = 64,
+  NNPA_MOMENTS = 65,
+  NNPA_LAYERNORM = 66,
+  NNPA_NORM = 67,
+  NNPA_MAXPOOL2D = 80,
+  NNPA_AVGPOOL2D = 81,
+  NNPA_LSTMACT = 96,
+  NNPA_GRUACT = 97,
+  NNPA_CONVOLUTION = 112,
+  NNPA_MATMUL_OP = 113,
+  NNPA_MATMUL_OP_BCAST23 = 114,
+  NNPA_MATMUL_OP_BCAST1 = 115,
+  NNPA_TRANSFORM = 240,
+  NNPA_REDUCE = 241
+} nnpa_function_code;
+
+typedef enum nnpa_parmblk_format {
+  NNPA_PARMBLKFORMAT_0 = 0,
+  NNPA_PARMBLKFORMAT_1 = 1,
+} nnpa_parmblk_format;
+
+typedef enum nnpa_data_type {
+  NNPA_DATATYPE_1 = 0,
+  NNPA_32_BIT_BINARY_FP_SHORT = 6,
+  NNPA_8_BIT_BINARY_INT = 8,
+  NNPA_32_BIT_BINARY_INT = 10
+} nnpa_data_type;
+
+typedef enum nnpa_layout_format {
+  NNPA_LAYOUTFMT_4DFEATURE = 0,
+  NNPA_LAYOUTFMT_4DKERNEL = 1,
+  NNPA_LAYOUTFMT_4DWEIGHTS = 2,
+  NNPA_LAYOUTFMT_4DGENERIC = 31
+} nnpa_layout_format;
+
+typedef enum nnpa_bfp_format {
+  // 0 is reserved
+  NNPA_BFPFMT_TINY = 1,
+  NNPA_BFPFMT_SHORT = 2
+} nnpa_bfp_format;
+
+// NNPA_SOFTMAX, NNPA_REDUCE, and NNPA_TRANSFORM require 8K work area
+#define ZDNN_SOFTMAX_SAVEAREA_SIZE 8 * 1024
+#define ZDNN_8K_SAVEAREA_SIZE 8 * 1024
+
+// NNPA Hardware defined values for Function Specific Parameters
+typedef enum nnpa_matmul_operations {
+  NNPA_MATMUL_OP_ADDITION = 0,
+  NNPA_MATMUL_OP_COMP_HIGH = 1,
+  NNPA_MATMUL_OP_COMP_NOT_LOW = 2,
+  NNPA_MATMUL_OP_COMP_EQUAL = 3,
+  NNPA_MATMUL_OP_COMP_NOT_EQUAL = 4,
+  NNPA_MATMUL_OP_COMP_NOT_HIGH = 5,
+  NNPA_MATMUL_OP_COMP_LOW = 6,
+} nnpa_matmul_operations;
+
+typedef enum nnpa_matmul_bcast_operations {
+  NNPA_MATMUL_BCAST_OP_ADDITION = 0,
+  NNPA_MATMUL_BCAST_OP_COMP_HIGH = 1,
+  NNPA_MATMUL_BCAST_OP_COMP_NOT_LOW = 2,
+  NNPA_MATMUL_BCAST_OP_COMP_EQUAL = 3,
+  NNPA_MATMUL_BCAST_OP_COMP_NOT_EQUAL = 4,
+  NNPA_MATMUL_BCAST_OP_COMP_NOT_HIGH = 5,
+  NNPA_MATMUL_BCAST_OP_COMP_LOW = 6
+} nnpa_matmul_bcast_operations;
+
+typedef enum nnpa_softmax_act {
+  NNPA_SOFTMAX_NONE = 0,
+  NNPA_SOFTMAX_LOG = 1
+} nnpa_softmax_act;
+
+typedef enum nnpa_reduce_operations {
+  NNPA_REDUCE_OP_MINIMUM = 0,
+  NNPA_REDUCE_OP_MINIMUM_IDX = 1,
+  NNPA_REDUCE_OP_MAXIMUM = 2,
+  NNPA_REDUCE_OP_MAXIMUM_IDX = 3
+} nnpa_reduce_operations;
+
+// -----------------------------------------------------------------------------
+// zdnn_query_*() bit-field enums
+// -----------------------------------------------------------------------------
+
+// pos is counting from left to right
+#define MSB_BITMASK(field_size, pos) 1u << ((field_size - 1) - pos)
+
+typedef enum zdnn_query_datatypes {
+  QUERY_DATATYPE_INTERNAL1 = MSB_BITMASK(16, NNPA_DATATYPE_1),
+  QUERY_DATATYPE_BINARY_FP32 = MSB_BITMASK(16, NNPA_32_BIT_BINARY_FP_SHORT),
+  QUERY_DATATYPE_BINARY_INT8 = MSB_BITMASK(16, NNPA_8_BIT_BINARY_INT),
+  QUERY_DATATYPE_BINARY_INT32 = MSB_BITMASK(16, NNPA_32_BIT_BINARY_INT)
+} zdnn_query_datatypes;
+
+typedef enum zdnn_query_layoutfmts {
+  QUERY_LAYOUTFMT_4DFEATURE = MSB_BITMASK(32, NNPA_LAYOUTFMT_4DFEATURE),
+  QUERY_LAYOUTFMT_4DKERNEL = MSB_BITMASK(32, NNPA_LAYOUTFMT_4DKERNEL),
+  QUERY_LAYOUTFMT_4DWEIGHTS = MSB_BITMASK(32, NNPA_LAYOUTFMT_4DWEIGHTS),
+  QUERY_LAYOUTFMT_4DGENERIC = MSB_BITMASK(32, NNPA_LAYOUTFMT_4DGENERIC)
+} zdnn_query_layoutfmts;
+
+typedef enum zdnn_query_bfpfmts {
+  QUERY_BFPFMT_TINY = MSB_BITMASK(16, NNPA_BFPFMT_TINY),
+  QUERY_BFPFMT_SHORT = MSB_BITMASK(16, NNPA_BFPFMT_SHORT)
+} zdnn_query_bfpfmts;
+
+// -----------------------------------------------------------------------------
+// ZDNN enums
+// -----------------------------------------------------------------------------
+
+typedef enum zdnn_data_types {
+  ZDNN_DLFLOAT16 = NNPA_DATATYPE_1, // 16-bit deep learning format
+  ZDNN_BINARY_FP32 =
+      NNPA_32_BIT_BINARY_FP_SHORT, // 32-bit binary-floating-point format
+  ZDNN_BINARY_INT8 =
+      NNPA_8_BIT_BINARY_INT, // 8-bit signed or unsigned binary integer
+  ZDNN_BINARY_INT32 =
+      NNPA_32_BIT_BINARY_INT, // 32-bit signed or unsigned binary integer
+  INT8 = 251,   // 8-bit signed or unsigned binary integer format
+  INT32 = 252,  // 32-bit signed or unsigned binary integer format
+  BFLOAT = 253, // Brain floating point format
+  FP16 = 254,   // 16-bit IEEE-754 floating point format
+  FP32 = 255,   // 32-bit IEEE-754 floating point format
+} zdnn_data_types;
+
+typedef enum zdnn_data_layouts {
+  ZDNN_1D,        // 1d tensor
+  ZDNN_2D,        // 2d tensor
+  ZDNN_2DS,       // represents special 2D tensors required by LSTM/GRU
+  ZDNN_3D,        // 3d tensor
+  ZDNN_3DS,       // represents special 3D tensors required by
+                  // LSTM/GRU/Softmax/Matmul
+  ZDNN_ZRH,       // represents (update, reset, hidden) used by GRU
+  ZDNN_4D,        // 4d tensor
+  ZDNN_4DS,       // represents special 4D tensors required by LSTM/GRU output
+  ZDNN_NHWC,      // 4d feature tensor in NHWC
+  ZDNN_NCHW,      // 4d feature tensor in NCHW
+  ZDNN_FICO,      // represents (forget, input, cell, output) used by LSTM
+  ZDNN_HWCK,      // 4d kernel CNN tensor
+  ZDNN_BIDIR_ZRH, // ZRH variant to work with bidirectional LSTM/GRU output
+  ZDNN_BIDIR_FICO // FICO variant to work with bidirectional LSTM/GRU output
+} zdnn_data_layouts;
+
+typedef enum zdnn_data_formats {
+  ZDNN_FORMAT_4DFEATURE =
+      NNPA_LAYOUTFMT_4DFEATURE, // tensor in zAIU data layout format 0
+  ZDNN_FORMAT_4DKERNEL =
+      NNPA_LAYOUTFMT_4DKERNEL, // tensor in zAIU data layout format 1
+  ZDNN_FORMAT_4DWEIGHTS =
+      NNPA_LAYOUTFMT_4DWEIGHTS, // tensor in zAIU data layout format 2
+  ZDNN_FORMAT_4DGENERIC =
+      NNPA_LAYOUTFMT_4DGENERIC, // tensor in zAIU data layout 31
+} zdnn_data_formats;
+
+typedef enum zdnn_quantized_transform_types {
+  QUANTIZED_DLFLOAT16 = 0,   // quantized dlfloat16
+  QUANTIZED_INT8 = 1,        // quantized int8
+  QUANTIZED_WEIGHTS_INT8 = 2 // quantized weights
+} zdnn_quantized_transform_types;
+
+// Supported padding types for use in pooling functions
+typedef enum zdnn_pool_padding {
+  VALID_PADDING = 0,
+  SAME_PADDING = 1
+} zdnn_pool_padding;
+
+// Support operations for use in matmul functions
+typedef enum zdnn_matmul_ops {
+  MATMUL_OP_ADDITION = NNPA_MATMUL_OP_ADDITION,
+  MATMUL_OP_GREATER = NNPA_MATMUL_OP_COMP_HIGH,
+  MATMUL_OP_GREATER_EQUAL = NNPA_MATMUL_OP_COMP_NOT_LOW,
+  MATMUL_OP_EQUAL = NNPA_MATMUL_OP_COMP_EQUAL,
+  MATMUL_OP_NOT_EQUAL = NNPA_MATMUL_OP_COMP_NOT_EQUAL,
+  MATMUL_OP_LESSER_EQUAL = NNPA_MATMUL_OP_COMP_NOT_HIGH,
+  MATMUL_OP_LESSER = NNPA_MATMUL_OP_COMP_LOW
+} zdnn_matmul_ops;
+
+// Support operations for use in matmul function
+typedef enum zdnn_matmul_bcast_ops {
+  MATMUL_BCAST_OP_ADDITION = NNPA_MATMUL_BCAST_OP_ADDITION,
+  MATMUL_BCAST_OP_GREATER = NNPA_MATMUL_BCAST_OP_COMP_HIGH,
+  MATMUL_BCAST_OP_GREATER_EQUAL = NNPA_MATMUL_BCAST_OP_COMP_NOT_LOW,
+  MATMUL_BCAST_OP_EQUAL = NNPA_MATMUL_BCAST_OP_COMP_EQUAL,
+  MATMUL_BCAST_OP_NOT_EQUAL = NNPA_MATMUL_BCAST_OP_COMP_NOT_EQUAL,
+  MATMUL_BCAST_OP_LESSER_EQUAL = NNPA_MATMUL_BCAST_OP_COMP_NOT_HIGH,
+  MATMUL_BCAST_OP_LESSER = NNPA_MATMUL_BCAST_OP_COMP_LOW
+} zdnn_matmul_bcast_ops;
+
+typedef enum zdnn_softmax_act {
+  SOFTMAX_ACT_NONE = NNPA_SOFTMAX_NONE,
+  SOFTMAX_ACT_LOG = NNPA_SOFTMAX_LOG
+} zdnn_softmax_act;
+
+typedef enum zdnn_conv2d_act {
+  CONV2D_ACT_NONE,
+  CONV2D_ACT_RELU
+} zdnn_conv2d_act;
+
+// Support operations for use in reduce functions
+typedef enum zdnn_reduce_ops {
+  REDUCE_OP_MINIMUM = NNPA_REDUCE_OP_MINIMUM,
+  REDUCE_OP_MINIMUM_IDX = NNPA_REDUCE_OP_MINIMUM_IDX,
+  REDUCE_OP_MAXIMUM = NNPA_REDUCE_OP_MAXIMUM,
+  REDUCE_OP_MAXIMUM_IDX = NNPA_REDUCE_OP_MAXIMUM_IDX
+} zdnn_reduce_ops;
+
+typedef enum zdnn_moments_bessel {
+  MOMENTS_BESSEL_POPULATION,
+  MOMENTS_BESSEL_SAMPLE,
+} zdnn_moments_bessel;
+
+// -----------------------------------------------------------------------------
+// Structs
+// ----------------------------------------------------------------------------
+
+// describes general pre-transformed or transformed information (e.g. shape) of
+// a tensor
+typedef struct zdnn_tensor_desc {
+  zdnn_data_layouts layout; // data layout
+  zdnn_data_formats format; // internal use only
+  zdnn_data_types type;     // data type
+  uint32_t dim4;            // number of elements in outermost dimension
+  uint32_t dim3;            // ... outer dimension
+  uint32_t dim2;            // ... inner dimension
+  uint32_t dim1;            // number of elements in innermost dimension
+} zdnn_tensor_desc;
+
+// struct for describing a ztensor
+typedef struct zdnn_ztensor {
+  zdnn_tensor_desc
+      *pre_transformed_desc; // tensor's shape information before transformation
+  zdnn_tensor_desc *transformed_desc; // transformed tensor's shape information
+  uint64_t buffer_size; // tensor size in bytes
+  void *buffer;         // pointer to the tensor in memory
+  bool is_transformed;  // indicator if data in buffer has been transformed
+  char reserved[3];     // not currently used, should contain zeros.
+  float rec_scale;      // the scale factor for quantization, stored as reciprocal
+  float offset;         // the offset for quantization
+  char reserved2[20];   // not currently used, should contain zeros.
+} zdnn_ztensor;
+
+#define ZDNN_VERSION "1.2.0"
+#define ZDNN_VERNUM 0x010200 // 0x[major][minor][patch]
+#define ZDNN_VER_MAJOR 1
+#define ZDNN_VER_MINOR 2
+#define ZDNN_VER_PATCH 0
+
+// -----------------------------------------------------------------------------
+// External Tensor Functions
+// -----------------------------------------------------------------------------
+
+// Concatenation information is encoded into a 32-bit word:
+// [RNN_TYPE: 8][PREV_LAYER_TYPE: 8][USAGE: 8][8]
+
+typedef uint32_t zdnn_concat_info;
+
+#define BITSHIFT_RNN_TYPE 24
+#define BITSHIFT_PREV_LAYER 16
+#define BITSHIFT_USAGE 8
+
+#define RNN_TYPE_LSTM (0 << BITSHIFT_RNN_TYPE)
+#define RNN_TYPE_GRU (1 << BITSHIFT_RNN_TYPE)
+
+#define PREV_LAYER_UNI (0 << BITSHIFT_PREV_LAYER)
+#define PREV_LAYER_NONE PREV_LAYER_UNI
+#define PREV_LAYER_BIDIR (1 << BITSHIFT_PREV_LAYER)
+
+#define USAGE_WEIGHTS (0 << BITSHIFT_USAGE)
+#define USAGE_HIDDEN_WEIGHTS (1 << BITSHIFT_USAGE)
+#define USAGE_BIASES (2 << BITSHIFT_USAGE)
+#define USAGE_HIDDEN_BIASES (3 << BITSHIFT_USAGE)
+
+#define CONCAT_RNN_TYPE(info) (info & (0xFFu << BITSHIFT_RNN_TYPE))
+#define CONCAT_PREV_LAYER(info) (info & (0xFFu << BITSHIFT_PREV_LAYER))
+#define CONCAT_USAGE(info) (info & (0xFFu << BITSHIFT_USAGE))
+
+void zdnn_init_pre_transformed_desc(zdnn_data_layouts layout,
+                                    zdnn_data_types type,
+                                    zdnn_tensor_desc *pre_tfrmd_desc, ...);
+
+zdnn_status
+zdnn_generate_transformed_desc(const zdnn_tensor_desc *pre_tfrmd_desc,
+                               zdnn_tensor_desc *tfrmd_desc);
+
+zdnn_status zdnn_generate_quantized_transformed_desc(
+    const zdnn_tensor_desc *pre_tfrmd_desc,
+    zdnn_quantized_transform_types transform_type,
+    zdnn_tensor_desc *tfrmd_desc);
+
+zdnn_status zdnn_generate_transformed_desc_concatenated(
+    const zdnn_tensor_desc *pre_tfrmd_desc, zdnn_concat_info info,
+    zdnn_tensor_desc *tfrmd_desc);
+
+zdnn_status zdnn_allochelper_ztensor(zdnn_ztensor *ztensor);
+zdnn_status zdnn_free_ztensor_buffer(const zdnn_ztensor *ztensor);
+
+void zdnn_init_ztensor(zdnn_tensor_desc *pre_tfrmd_desc,
+                       zdnn_tensor_desc *tfrmd_desc, zdnn_ztensor *output);
+
+void zdnn_init_quantized_ztensor(zdnn_tensor_desc *pre_tfrmd_desc,
+                                 zdnn_tensor_desc *tfrmd_desc, float scale,
+                                 float offset, zdnn_ztensor *output);
+
+zdnn_status zdnn_init_ztensor_with_malloc(zdnn_tensor_desc *pre_tfrmd_desc,
+                                          zdnn_tensor_desc *tfrmd_desc,
+                                          zdnn_ztensor *output);
+
+zdnn_status zdnn_init_quantized_ztensor_with_malloc(
+    zdnn_tensor_desc *pre_tfrmd_desc, zdnn_tensor_desc *tfrmd_desc, float scale,
+    float offset, zdnn_ztensor *output);
+
+bool zdnn_is_quantized_ztensor(zdnn_ztensor *ztensor);
+
+void zdnn_reset_ztensor(zdnn_ztensor *ztensor);
+
+uint64_t zdnn_getsize_ztensor(const zdnn_tensor_desc *tfrmd_desc);
+
+zdnn_status zdnn_getrange_ztensor(const zdnn_ztensor *ztensor, float *min,
+                                  float *max);
+
+// -----------------------------------------------------------------------------
+// External Query Functions
+// -----------------------------------------------------------------------------
+
+bool zdnn_is_nnpa_installed();
+bool zdnn_is_nnpa_function_installed(int count, ...);
+bool zdnn_is_nnpa_parmblk_fmt_installed(int count, ...);
+bool zdnn_is_nnpa_datatype_installed(uint16_t types_bitmask);
+bool zdnn_is_nnpa_layout_fmt_installed(uint32_t layout_bitmask);
+bool zdnn_is_nnpa_conversion_installed(nnpa_data_type type,
+                                       uint16_t format_bitmask);
+
+uint32_t zdnn_get_nnpa_max_dim_idx_size();
+uint32_t zdnn_get_max_for_dim(uint8_t dimension);
+uint64_t zdnn_get_nnpa_max_tensor_size();
+
+zdnn_status zdnn_refresh_nnpa_query_result();
+
+// -----------------------------------------------------------------------------
+// Versioning Functions
+// -----------------------------------------------------------------------------
+
+bool zdnn_is_version_runnable(uint32_t ver_num);
+uint32_t zdnn_get_max_runnable_version();
+
+// -----------------------------------------------------------------------------
+// External Elementwise Operations
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_add(const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+                     zdnn_ztensor *output);
+zdnn_status zdnn_sub(const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+                     zdnn_ztensor *output);
+zdnn_status zdnn_mul(const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+                     zdnn_ztensor *output);
+zdnn_status zdnn_div(const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+                     zdnn_ztensor *output);
+zdnn_status zdnn_min(const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+                     zdnn_ztensor *output);
+zdnn_status zdnn_max(const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+                     zdnn_ztensor *output);
+
+zdnn_status zdnn_log(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_exp(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_sqrt(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_invsqrt(const zdnn_ztensor *input, float epsilon,
+                         zdnn_ztensor *output);
+
+// -----------------------------------------------------------------------------
+// External Activation Operations
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_relu(const zdnn_ztensor *input, const void *clipping_value,
+                      zdnn_ztensor *output);
+zdnn_status zdnn_leaky_relu(const zdnn_ztensor *input,
+                            const void *clipping_value, float adjustment_factor,
+                            zdnn_ztensor *output);
+zdnn_status zdnn_tanh(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_sigmoid(const zdnn_ztensor *input, zdnn_ztensor *output);
+zdnn_status zdnn_softmax(const zdnn_ztensor *input, void *save_area,
+                         zdnn_softmax_act act_func, zdnn_ztensor *output);
+zdnn_status zdnn_softmax_mask(const zdnn_ztensor *input, void *save_area,
+                              zdnn_softmax_act act_func, uint32_t softmax_mask,
+                              zdnn_ztensor *output);
+zdnn_status zdnn_gelu(const zdnn_ztensor *input, zdnn_ztensor *output);
+
+// -----------------------------------------------------------------------------
+// Recurrent Neural Network (RNN) Operations
+// -----------------------------------------------------------------------------
+
+typedef enum lstm_gru_direction { FWD, BWD, BIDIR } lstm_gru_direction;
+
+zdnn_status zdnn_lstm(const zdnn_ztensor *input, const zdnn_ztensor *h0,
+                      const zdnn_ztensor *c0, const zdnn_ztensor *weights,
+                      const zdnn_ztensor *biases,
+                      const zdnn_ztensor *hidden_weights,
+                      const zdnn_ztensor *hidden_biases,
+                      lstm_gru_direction direction, void *work_area,
+                      zdnn_ztensor *hn_output, zdnn_ztensor *cf_output);
+zdnn_status zdnn_gru(const zdnn_ztensor *input, const zdnn_ztensor *h0,
+                     const zdnn_ztensor *weights, const zdnn_ztensor *biases,
+                     const zdnn_ztensor *hidden_weights,
+                     const zdnn_ztensor *hidden_biases,
+                     lstm_gru_direction direction, void *work_area,
+                     zdnn_ztensor *hn_output);
+
+// -----------------------------------------------------------------------------
+// Matrix Multiplication Operations
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_matmul_op(const zdnn_ztensor *input_a,
+                           const zdnn_ztensor *input_b,
+                           const zdnn_ztensor *input_c, zdnn_matmul_ops op_type,
+                           zdnn_ztensor *output);
+zdnn_status zdnn_matmul_bcast_op(const zdnn_ztensor *input_a,
+                                 const zdnn_ztensor *input_b,
+                                 const zdnn_ztensor *input_c,
+                                 zdnn_matmul_bcast_ops op_type,
+                                 zdnn_ztensor *output);
+zdnn_status zdnn_matmul_transpose_op(const zdnn_ztensor *input_a,
+                                     const zdnn_ztensor *input_b,
+                                     const zdnn_ztensor *input_c,
+                                     bool transpose_a, bool transpose_b,
+                                     zdnn_matmul_ops op_type,
+                                     zdnn_ztensor *output);
+zdnn_status zdnn_quantized_matmul_op(
+    const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+    const zdnn_ztensor *input_c, zdnn_matmul_ops op_type, const int8_t clip_min,
+    const int8_t clip_max, const bool disable_clipping, const bool dequantize,
+    const bool pre_computed, void *work_area, zdnn_ztensor *output);
+
+// -----------------------------------------------------------------------------
+// External Norm Operations
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_batchnorm(const zdnn_ztensor *input_a,
+                           const zdnn_ztensor *input_b,
+                           const zdnn_ztensor *input_c, zdnn_ztensor *output);
+zdnn_status zdnn_norm(const zdnn_ztensor *input_a, const zdnn_ztensor *input_b,
+                      zdnn_ztensor *output);
+zdnn_status zdnn_moments(const zdnn_ztensor *input,
+                         zdnn_moments_bessel bessel_correction_type,
+                         zdnn_ztensor *output_a, zdnn_ztensor *output_b);
+zdnn_status zdnn_layernorm(const zdnn_ztensor *input_a,
+                           const zdnn_ztensor *input_b,
+                           const zdnn_ztensor *input_c, const float beta_value,
+                           const float gamma_value, const float epsilon_value,
+                           zdnn_ztensor *output);
+zdnn_status zdnn_meanreduce2d(const zdnn_ztensor *input, zdnn_ztensor *output);
+
+zdnn_status zdnn_reduce(const zdnn_ztensor *input, void *save_area,
+                        zdnn_reduce_ops op_type, zdnn_ztensor *output);
+
+// -----------------------------------------------------------------------------
+// External Pool Operations
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_avgpool2d(const zdnn_ztensor *input,
+                           zdnn_pool_padding padding_type,
+                           uint32_t kernel_height, uint32_t kernel_width,
+                           uint32_t stride_height, uint32_t stride_width,
+                           zdnn_ztensor *output);
+
+zdnn_status zdnn_maxpool2d(const zdnn_ztensor *input,
+                           zdnn_pool_padding padding_type,
+                           uint32_t kernel_height, uint32_t kernel_width,
+                           uint32_t stride_height, uint32_t stride_width,
+                           zdnn_ztensor *output);
+
+// -----------------------------------------------------------------------------
+// External Convolution Operations
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_conv2d(const zdnn_ztensor *input, const zdnn_ztensor *kernel,
+                        const zdnn_ztensor *bias,
+                        zdnn_pool_padding padding_type, uint32_t stride_height,
+                        uint32_t stride_width, zdnn_conv2d_act act_func,
+                        const void *clipping_value, zdnn_ztensor *output);
+
+// -----------------------------------------------------------------------------
+// External Tensor Transform Operations
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_transform_ztensor(zdnn_ztensor *ztensor, ...);
+
+zdnn_status zdnn_transform_ztensor_with_saturation(zdnn_ztensor *ztensor, ...);
+
+zdnn_status zdnn_transform_quantized_ztensor(zdnn_ztensor *ztensor,
+                                             bool saturation_control,
+                                             int8_t clip_min, int8_t clip_max,
+                                             const void *data);
+
+zdnn_status zdnn_transform_origtensor(const zdnn_ztensor *ztensor,
+                                      void *out_buf);
+
+zdnn_status zdnn_reshape_ztensor(const zdnn_ztensor *src, zdnn_ztensor *dest);
+
+// -----------------------------------------------------------------------------
+// External Version Related Functions
+// -----------------------------------------------------------------------------
+
+char *zdnn_get_library_version_str();
+uint32_t zdnn_get_library_version();
+
+// -----------------------------------------------------------------------------
+// zDNN Status Related Functions
+// -----------------------------------------------------------------------------
+
+const char *zdnn_get_status_message(zdnn_status status);
+
+// -----------------------------------------------------------------------------
+// zDNN Data Type Limit Functions
+// -----------------------------------------------------------------------------
+
+zdnn_status zdnn_get_max_limit(zdnn_data_types transformed_type,
+                               zdnn_data_types pre_transformed_type,
+                               void *limit);
+zdnn_status zdnn_get_min_limit(zdnn_data_types transformed_type,
+                               zdnn_data_types pre_transformed_type,
+                               void *limit);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* ZDNN_ZDNN_H_ */