mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			608 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			608 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
#pragma once
 | 
						|
 | 
						|
#include "ggml.h"
 | 
						|
 | 
						|
// GGML internal header
 | 
						|
 | 
						|
#include <assert.h>
 | 
						|
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 | 
						|
#include <stddef.h>
 | 
						|
#include <stdbool.h>
 | 
						|
#include <string.h> // memcpy
 | 
						|
#include <math.h>   // fabsf
 | 
						|
 | 
						|
#undef MIN
 | 
						|
#undef MAX
 | 
						|
 | 
						|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
 | 
						|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
 | 
						|
 | 
						|
/**
 | 
						|
 * Converts brain16 to float32.
 | 
						|
 *
 | 
						|
 * The bfloat16 floating point format has the following structure:
 | 
						|
 *
 | 
						|
 *       ┌sign
 | 
						|
 *       │
 | 
						|
 *       │   ┌exponent
 | 
						|
 *       │   │
 | 
						|
 *       │   │      ┌mantissa
 | 
						|
 *       │   │      │
 | 
						|
 *       │┌──┴───┐┌─┴───┐
 | 
						|
 *     0b0000000000000000 brain16
 | 
						|
 *
 | 
						|
 * Since bf16 has the same number of exponent bits as a 32bit float,
 | 
						|
 * encoding and decoding numbers becomes relatively straightforward.
 | 
						|
 *
 | 
						|
 *       ┌sign
 | 
						|
 *       │
 | 
						|
 *       │   ┌exponent
 | 
						|
 *       │   │
 | 
						|
 *       │   │      ┌mantissa
 | 
						|
 *       │   │      │
 | 
						|
 *       │┌──┴───┐┌─┴───────────────────┐
 | 
						|
 *     0b00000000000000000000000000000000 IEEE binary32
 | 
						|
 *
 | 
						|
 * For comparison, the standard fp16 format has fewer exponent bits.
 | 
						|
 *
 | 
						|
 *       ┌sign
 | 
						|
 *       │
 | 
						|
 *       │  ┌exponent
 | 
						|
 *       │  │
 | 
						|
 *       │  │    ┌mantissa
 | 
						|
 *       │  │    │
 | 
						|
 *       │┌─┴─┐┌─┴──────┐
 | 
						|
 *     0b0000000000000000 IEEE binary16
 | 
						|
 *
 | 
						|
 * @see IEEE 754-2008
 | 
						|
 */
 | 
						|
// Widens a bf16 value to float: the 16 stored bits become the upper half of a
// 32-bit word (lower half zero) and that word is reinterpreted as a float.
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    const uint32_t word = (uint32_t)h.bits << 16;
    float out;
    memcpy(&out, &word, sizeof out);
    return out;
}
 | 
						|
 | 
						|
/**
 | 
						|
 * Converts float32 to brain16.
 | 
						|
 *
 | 
						|
 * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
 | 
						|
 * Subnormals shall be flushed to zero, and NANs will be quiet.
 | 
						|
 * This code should vectorize nicely if using modern compilers.
 | 
						|
 */
 | 
						|
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 | 
						|
    ggml_bf16_t h;
 | 
						|
    union {
 | 
						|
        float f;
 | 
						|
        uint32_t i;
 | 
						|
    } u;
 | 
						|
    u.f = s;
 | 
						|
    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
 | 
						|
        h.bits = (u.i >> 16) | 64; /* force to quiet */
 | 
						|
        return h;
 | 
						|
    }
 | 
						|
    if (!(u.i & 0x7f800000)) { /* subnormal */
 | 
						|
        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
 | 
						|
        return h;
 | 
						|
    }
 | 
						|
    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
 | 
						|
    return h;
 | 
						|
}
 | 
						|
 | 
						|
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 | 
						|
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 | 
						|
 | 
						|
#ifdef __cplusplus
 | 
						|
extern "C" {
 | 
						|
#endif
 | 
						|
 | 
						|
// static_assert should be a #define, but if it's not,
 | 
						|
// fall back to the _Static_assert C11 keyword.
 | 
						|
// if C99 - static_assert is noop
 | 
						|
// ref: https://stackoverflow.com/a/53923785/4039976
 | 
						|
#ifndef __cplusplus
 | 
						|
#ifndef static_assert
 | 
						|
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 | 
						|
#define static_assert(cond, msg) _Static_assert(cond, msg)
 | 
						|
#else
 | 
						|
#define static_assert(cond, msg) struct global_scope_noop_trick
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
 | 
						|
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
 | 
						|
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
 | 
						|
#ifndef __FMA__
 | 
						|
#define __FMA__
 | 
						|
#endif
 | 
						|
#ifndef __F16C__
 | 
						|
#define __F16C__
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
 | 
						|
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
 | 
						|
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 | 
						|
#ifndef __SSE3__
 | 
						|
#define __SSE3__
 | 
						|
#endif
 | 
						|
#ifndef __SSSE3__
 | 
						|
#define __SSSE3__
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
 | 
						|
// 16-bit float
 | 
						|
// on Arm, we use __fp16
 | 
						|
// on x86, we use uint16_t
 | 
						|
#if defined(__ARM_NEON)
 | 
						|
 | 
						|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 | 
						|
//
 | 
						|
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
 | 
						|
//
 | 
						|
#include <arm_neon.h>
 | 
						|
 | 
						|
#ifdef _MSC_VER
 | 
						|
 | 
						|
typedef uint16_t ggml_fp16_internal_t;
 | 
						|
 | 
						|
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
 | 
						|
 | 
						|
#else
 | 
						|
 | 
						|
typedef __fp16 ggml_fp16_internal_t;
 | 
						|
 | 
						|
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
 | 
						|
 | 
						|
#endif // _MSC_VER
 | 
						|
 | 
						|
#if !defined(__aarch64__)
 | 
						|
 | 
						|
// 32-bit ARM compatibility
 | 
						|
 | 
						|
// vaddvq_s16
 | 
						|
// vpaddq_s16
 | 
						|
// vpaddq_s32
 | 
						|
// vaddvq_s32
 | 
						|
// vaddvq_f32
 | 
						|
// vmaxvq_f32
 | 
						|
// vcvtnq_s32_f32
 | 
						|
// vzip1_u8
 | 
						|
// vzip2_u8
 | 
						|
 | 
						|
// 32-bit ARM stand-in: adds all eight int16 lanes of v, widening each lane to
// int32_t before summing.
inline static int32_t vaddvq_s16(int16x8_t v) {
    const int32_t lower_half =
        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3);
    const int32_t upper_half =
        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);

    return lower_half + upper_half;
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: pairwise-adds the halves of a, then the halves of b,
// and joins the two 4-lane results (a's sums first, b's sums second).
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
    return vcombine_s16(
        vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
        vpadd_s16(vget_low_s16(b), vget_high_s16(b)));
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: pairwise-adds the halves of a, then the halves of b,
// and joins the two 2-lane results (a's sums first, b's sums second).
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    return vcombine_s32(
        vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
        vpadd_s32(vget_low_s32(b), vget_high_s32(b)));
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: adds the four int32 lanes of v, lane 0 through lane 3.
inline static int32_t vaddvq_s32(int32x4_t v) {
    int32_t total = vgetq_lane_s32(v, 0);
    total = total + vgetq_lane_s32(v, 1);
    total = total + vgetq_lane_s32(v, 2);
    total = total + vgetq_lane_s32(v, 3);
    return total;
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: adds the four float lanes of v in lane order
// (((l0 + l1) + l2) + l3).
inline static float vaddvq_f32(float32x4_t v) {
    const float l0 = vgetq_lane_f32(v, 0);
    const float l1 = vgetq_lane_f32(v, 1);
    const float l2 = vgetq_lane_f32(v, 2);
    const float l3 = vgetq_lane_f32(v, 3);

    return l0 + l1 + l2 + l3;
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: largest of the four float lanes of v, computed as the
// MAX of the (lane 0, lane 1) winner and the (lane 2, lane 3) winner.
inline static float vmaxvq_f32(float32x4_t v) {
    const float first_pair  = MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1));
    const float second_pair = MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));

    return MAX(first_pair, second_pair);
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: converts each float lane of v to int32 via roundf().
// NOTE(review): roundf() rounds halfway cases away from zero; confirm this
// matches the tie handling of the AArch64 intrinsic this function is named after.
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
    const float f0 = roundf(vgetq_lane_f32(v, 0));
    const float f1 = roundf(vgetq_lane_f32(v, 1));
    const float f2 = roundf(vgetq_lane_f32(v, 2));
    const float f3 = roundf(vgetq_lane_f32(v, 3));

    int32x4_t res;
    res[0] = f0;
    res[1] = f1;
    res[2] = f2;
    res[3] = f3;

    return res;
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: interleaves the low four lanes of a and b:
// a0 b0 a1 b1 a2 b2 a3 b3.
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[i];
        res[2*i + 1] = b[i];
    }

    return res;
}
 | 
						|
 | 
						|
// 32-bit ARM stand-in: interleaves the high four lanes of a and b:
// a4 b4 a5 b5 a6 b6 a7 b7.
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;

    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[4 + i];
        res[2*i + 1] = b[4 + i];
    }

    return res;
}
 | 
						|
 | 
						|
// vld1q_s16_x2
 | 
						|
// vld1q_u8_x2
 | 
						|
// vld1q_u8_x4
 | 
						|
// vld1q_s8_x2
 | 
						|
// vld1q_s8_x4
 | 
						|
// TODO: double-check these work correctly
 | 
						|
 | 
						|
typedef struct ggml_int16x8x2_t {
 | 
						|
    int16x8_t val[2];
 | 
						|
} ggml_int16x8x2_t;
 | 
						|
 | 
						|
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
 | 
						|
    ggml_int16x8x2_t res;
 | 
						|
 | 
						|
    res.val[0] = vld1q_s16(ptr + 0);
 | 
						|
    res.val[1] = vld1q_s16(ptr + 8);
 | 
						|
 | 
						|
    return res;
 | 
						|
}
 | 
						|
 | 
						|
typedef struct ggml_uint8x16x2_t {
 | 
						|
    uint8x16_t val[2];
 | 
						|
} ggml_uint8x16x2_t;
 | 
						|
 | 
						|
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
 | 
						|
    ggml_uint8x16x2_t res;
 | 
						|
 | 
						|
    res.val[0] = vld1q_u8(ptr + 0);
 | 
						|
    res.val[1] = vld1q_u8(ptr + 16);
 | 
						|
 | 
						|
    return res;
 | 
						|
}
 | 
						|
 | 
						|
typedef struct ggml_uint8x16x4_t {
 | 
						|
    uint8x16_t val[4];
 | 
						|
} ggml_uint8x16x4_t;
 | 
						|
 | 
						|
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
 | 
						|
    ggml_uint8x16x4_t res;
 | 
						|
 | 
						|
    res.val[0] = vld1q_u8(ptr + 0);
 | 
						|
    res.val[1] = vld1q_u8(ptr + 16);
 | 
						|
    res.val[2] = vld1q_u8(ptr + 32);
 | 
						|
    res.val[3] = vld1q_u8(ptr + 48);
 | 
						|
 | 
						|
    return res;
 | 
						|
}
 | 
						|
 | 
						|
typedef struct ggml_int8x16x2_t {
 | 
						|
    int8x16_t val[2];
 | 
						|
} ggml_int8x16x2_t;
 | 
						|
 | 
						|
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
 | 
						|
    ggml_int8x16x2_t res;
 | 
						|
 | 
						|
    res.val[0] = vld1q_s8(ptr + 0);
 | 
						|
    res.val[1] = vld1q_s8(ptr + 16);
 | 
						|
 | 
						|
    return res;
 | 
						|
}
 | 
						|
 | 
						|
typedef struct ggml_int8x16x4_t {
 | 
						|
    int8x16_t val[4];
 | 
						|
} ggml_int8x16x4_t;
 | 
						|
 | 
						|
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 | 
						|
    ggml_int8x16x4_t res;
 | 
						|
 | 
						|
    res.val[0] = vld1q_s8(ptr + 0);
 | 
						|
    res.val[1] = vld1q_s8(ptr + 16);
 | 
						|
    res.val[2] = vld1q_s8(ptr + 32);
 | 
						|
    res.val[3] = vld1q_s8(ptr + 48);
 | 
						|
 | 
						|
    return res;
 | 
						|
}
 | 
						|
 | 
						|
// NOTE: not tested
 | 
						|
// Byte-wise table lookup for 32-bit ARM: lane i of the result is a[b[i]].
//
//   a - 16-entry table of int8 values
//   b - 16 lane indices into a
//
// An index of 16 or more selects 0 instead of subscripting past the end of
// the 16-lane table (the previous a[b[i]] form read out of bounds for such
// indices). In-range indices behave exactly as before.
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
    int8x16_t res;

    for (int i = 0; i < 16; ++i) {
        const uint8_t idx = b[i];
        res[i] = idx < 16 ? a[idx] : 0;
    }

    return res;
}
 | 
						|
 | 
						|
// NOTE: not tested
 | 
						|
// Byte-wise table lookup for 32-bit ARM: lane i of the result is a[b[i]].
//
//   a - 16-entry table of uint8 values
//   b - 16 lane indices into a
//
// An index of 16 or more selects 0 instead of subscripting past the end of
// the 16-lane table (the previous a[b[i]] form read out of bounds for such
// indices). In-range indices behave exactly as before.
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;

    for (int i = 0; i < 16; ++i) {
        const uint8_t idx = b[i];
        res[i] = idx < 16 ? a[idx] : 0;
    }

    return res;
}
 | 
						|
 | 
						|
#else
 | 
						|
 | 
						|
#define ggml_int16x8x2_t  int16x8x2_t
 | 
						|
#define ggml_uint8x16x2_t uint8x16x2_t
 | 
						|
#define ggml_uint8x16x4_t uint8x16x4_t
 | 
						|
#define ggml_int8x16x2_t  int8x16x2_t
 | 
						|
#define ggml_int8x16x4_t  int8x16x4_t
 | 
						|
 | 
						|
#define ggml_vld1q_s16_x2 vld1q_s16_x2
 | 
						|
#define ggml_vld1q_u8_x2  vld1q_u8_x2
 | 
						|
#define ggml_vld1q_u8_x4  vld1q_u8_x4
 | 
						|
#define ggml_vld1q_s8_x2  vld1q_s8_x2
 | 
						|
#define ggml_vld1q_s8_x4  vld1q_s8_x4
 | 
						|
#define ggml_vqtbl1q_s8   vqtbl1q_s8
 | 
						|
#define ggml_vqtbl1q_u8   vqtbl1q_u8
 | 
						|
 | 
						|
#endif // !defined(__aarch64__)
 | 
						|
 | 
						|
#if !defined(__ARM_FEATURE_DOTPROD)
 | 
						|
 | 
						|
// Fallback used when __ARM_FEATURE_DOTPROD is not defined: multiplies the
// int8 lanes of a and b into int16 products, pairwise-widens those products to
// int32, and adds the two resulting vectors to acc.
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t prod_lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t prod_hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    const int32x4_t pair_sums = vaddq_s32(vpaddlq_s16(prod_lo), vpaddlq_s16(prod_hi));

    return vaddq_s32(acc, pair_sums);
}
 | 
						|
 | 
						|
#else
 | 
						|
 | 
						|
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
 | 
						|
 | 
						|
#endif // !defined(__ARM_FEATURE_DOTPROD)
 | 
						|
 | 
						|
#endif // defined(__ARM_NEON)
 | 
						|
 | 
						|
#if defined(__ARM_NEON) && !defined(_MSC_VER)
 | 
						|
 | 
						|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | 
						|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 | 
						|
 | 
						|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | 
						|
 | 
						|
// NEON (non-MSVC) path: copies the bits of h into the compiler's native half
// type and lets the float cast do the conversion.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    ggml_fp16_internal_t native_half;
    memcpy(&native_half, &h, sizeof(ggml_fp16_t));

    const float widened = (float)native_half;
    return widened;
}
 | 
						|
 | 
						|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 | 
						|
    ggml_fp16_t res;
 | 
						|
    ggml_fp16_internal_t tmp = f;
 | 
						|
    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
 | 
						|
    return res;
 | 
						|
}
 | 
						|
 | 
						|
#else
 | 
						|
 | 
						|
#ifdef __wasm_simd128__
 | 
						|
#include <wasm_simd128.h>
 | 
						|
#else
 | 
						|
#ifdef __POWER9_VECTOR__
 | 
						|
#include <altivec.h>
 | 
						|
#undef bool
 | 
						|
#define bool _Bool
 | 
						|
#else
 | 
						|
#if defined(_MSC_VER) || defined(__MINGW32__)
 | 
						|
#include <intrin.h>
 | 
						|
#else
 | 
						|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
 | 
						|
#if !defined(__riscv)
 | 
						|
#include <immintrin.h>
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef __riscv_v_intrinsic
 | 
						|
#include <riscv_vector.h>
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef __F16C__
 | 
						|
 | 
						|
#ifdef _MSC_VER
 | 
						|
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
 | 
						|
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
 | 
						|
#else
 | 
						|
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 | 
						|
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 | 
						|
#endif
 | 
						|
 | 
						|
#elif defined(__POWER9_VECTOR__)
 | 
						|
 | 
						|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | 
						|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 | 
						|
/* the inline asm below is about 12% faster than the lookup method */
 | 
						|
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
 | 
						|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 | 
						|
 | 
						|
// POWER9 path: converts h to float with a three-instruction inline-asm
// sequence. Operand %2 is h in a general register, %0 is a double-typed
// scratch register, and %1 receives the float result.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register double scratch;
    register float  single;
    __asm__(
        "mtfprd %0,%2\n"
        "xscvhpdp %0,%0\n"
        "frsp %1,%0\n"
        : /* temp */ "=d"(scratch),
          /* out */  "=f"(single)
        : /* in */   "r"(h));
    return single;
}
 | 
						|
 | 
						|
// POWER9 path: converts f to half precision with a two-instruction inline-asm
// sequence. Operand %2 is f in a floating-point register, %0 is a double-typed
// scratch register, and %1 receives the result in a general register.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double      scratch;
    register ggml_fp16_t half;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n"
        "mffprd %1,%0\n"
        : /* temp */ "=d"(scratch),
          /* out */  "=r"(half)
        : /* in */   "f"(f));
    return half;
}
 | 
						|
 | 
						|
#else
 | 
						|
 | 
						|
// FP16 <-> FP32
 | 
						|
// ref: https://github.com/Maratyszcza/FP16
 | 
						|
 | 
						|
// Reinterprets the 32-bit pattern w as a float (no numeric conversion).
static inline float fp32_from_bits(uint32_t w) {
    float value;
    memcpy(&value, &w, sizeof value);
    return value;
}
 | 
						|
 | 
						|
// Returns the raw 32-bit pattern of the float f (no numeric conversion).
static inline uint32_t fp32_to_bits(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return bits;
}
 | 
						|
 | 
						|
// Portable half -> float conversion done with integer bit manipulation and two
// float operations. Two candidate results are always computed, one through the
// "normalized" route and one through the "denormalized" route, and the
// magnitude of the input decides which one is combined with the sign bit.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    // half bits moved to the top of a 32-bit word; doubling shifts the sign out
    const uint32_t shifted  = (uint32_t) h << 16;
    const uint32_t sign_bit = shifted & UINT32_C(0x80000000);
    const uint32_t doubled  = shifted + shifted;

    // normalized route: reposition the bits, add an exponent offset, rescale
    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float as_normal = fp32_from_bits((doubled >> 4) + exp_offset) * exp_scale;

    // denormalized route: OR the bits under a fixed exponent, subtract the bias
    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float    magic_bias = 0.5f;
    const float as_denormal = fp32_from_bits((doubled >> 17) | magic_mask) - magic_bias;

    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;

    uint32_t magnitude;
    if (doubled < denormalized_cutoff) {
        magnitude = fp32_to_bits(as_denormal);
    } else {
        magnitude = fp32_to_bits(as_normal);
    }

    return fp32_from_bits(sign_bit | magnitude);
}
 | 
						|
 | 
						|
// Portable float -> half conversion. The magnitude is scaled up and back down,
// added to a bias built from the (clamped) exponent of the input, and the
// half's exponent and mantissa fields are then picked out of the sum. Inputs
// whose doubled bit pattern exceeds 0xFF000000 produce 0x7E00 instead. The
// sign bit of f is moved to bit 15 of the result.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    const float scaled = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t raw      = fp32_to_bits(f);
    const uint32_t doubled  = raw + raw;
    const uint32_t sign_bit = raw & UINT32_C(0x80000000);

    // exponent byte of the doubled pattern, clamped from below
    const uint32_t exp_byte = doubled & UINT32_C(0xFF000000);
    const uint32_t bias     = exp_byte < UINT32_C(0x71000000) ? UINT32_C(0x71000000) : exp_byte;

    const float    biased      = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + scaled;
    const uint32_t biased_bits = fp32_to_bits(biased);

    const uint32_t exp_field      = (biased_bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_field = biased_bits & UINT32_C(0x00000FFF);

    uint32_t magnitude = exp_field + mantissa_field;
    if (doubled > UINT32_C(0xFF000000)) {
        magnitude = UINT16_C(0x7E00);
    }

    return (sign_bit >> 16) | magnitude;
}
 | 
						|
 | 
						|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | 
						|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 | 
						|
 | 
						|
#endif // __F16C__
 | 
						|
 | 
						|
#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
 | 
						|
 | 
						|
// precomputed f32 table for f16 (256 KB)
 | 
						|
// defined in ggml.c, initialized in ggml_init()
 | 
						|
extern float ggml_table_f32_f16[1 << 16];
 | 
						|
 | 
						|
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 | 
						|
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 | 
						|
// This is also true for POWER9.
 | 
						|
#if !defined(GGML_FP16_TO_FP32)
 | 
						|
// Table-based half -> float conversion: the 16 bits of f are used as an index
// into ggml_table_f32_f16.
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t index;
    memcpy(&index, &f, sizeof index);

    return ggml_table_f32_f16[index];
}
 | 
						|
 | 
						|
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
 | 
						|
#endif
 | 
						|
 | 
						|
#if !defined(GGML_FP32_TO_FP16)
 | 
						|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 | 
						|
#endif
 | 
						|
 | 
						|
#define GGML_HASHTABLE_FULL ((size_t)-1)
 | 
						|
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
 | 
						|
 | 
						|
struct ggml_hash_set ggml_hash_set_new(size_t size);
 | 
						|
 | 
						|
bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | 
						|
 | 
						|
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 | 
						|
size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | 
						|
 | 
						|
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 | 
						|
size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | 
						|
 | 
						|
// return index, asserts if table is full
 | 
						|
size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | 
						|
 | 
						|
#ifdef __cplusplus
 | 
						|
}
 | 
						|
#endif
 |