mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-01-27 16:29:19 +01:00
ggml : refactoring (llama/#0)
- d6a04f87 - 23e0d70b
This commit is contained in:
parent
d245d7aec7
commit
34291099fb
@ -358,6 +358,7 @@ extern "C" {
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
struct ggml_cgraph;
|
||||
|
||||
// NOTE: always add types at the end of the enum to keep backward compatibility
|
||||
enum ggml_type {
|
||||
@ -579,23 +580,9 @@ extern "C" {
|
||||
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
|
||||
};
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_type type;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
@ -659,7 +646,7 @@ extern "C" {
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
@ -675,35 +662,6 @@ extern "C" {
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
size_t offs;
|
||||
@ -2021,8 +1979,6 @@ extern "C" {
|
||||
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
||||
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
||||
|
||||
#define GGML_N_TASKS_MAX -1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_map_custom1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@ -2103,7 +2059,6 @@ extern "C" {
|
||||
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate, bool keep);
|
||||
|
||||
@ -2118,25 +2073,31 @@ extern "C" {
|
||||
float wd); // weight decay
|
||||
|
||||
// graph allocation in a context
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
|
||||
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
|
||||
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
|
||||
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include <cstring>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cann/aclnn_ops.h"
|
||||
#include "ggml-cann/common.h"
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/common.cuh"
|
||||
|
@ -1,15 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
@ -17,96 +19,6 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
|
||||
#else
|
||||
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#define m512i(p) (__m512i)(p)
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
@ -121,516 +33,10 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
#define __FMA__
|
||||
#endif
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
|
||||
#else
|
||||
|
||||
typedef __fp16 ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 32-bit ARM compatibility
|
||||
|
||||
// vaddlvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
||||
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
||||
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
ggml_fp16_internal_t tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
ggml_fp16_internal_t tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch64)
|
||||
#if defined(__loongarch_asx)
|
||||
#include <lasxintrin.h>
|
||||
#endif
|
||||
#if defined(__loongarch_sx)
|
||||
#include <lsxintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
|
||||
typedef union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} ft_union;
|
||||
|
||||
/* float type data load instructions */
|
||||
static __m128 __lsx_vreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
|
||||
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float exp_scale = 0x1.0p-112f;
|
||||
#else
|
||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||
#endif
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
#else
|
||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||
#endif
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // __F16C__
|
||||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32)
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return ggml_table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_FP32_TO_FP16)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
||||
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
||||
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
||||
@ -656,6 +62,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
||||
#define GGML_HASHSET_FULL ((size_t)-1)
|
||||
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
||||
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
||||
|
||||
@ -745,6 +157,30 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
// computation graph
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -1,4 +1,4 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-kompute.h"
|
||||
|
@ -1,7 +1,7 @@
|
||||
#import "ggml-metal.h"
|
||||
|
||||
#import "ggml-impl.h"
|
||||
#import "ggml-backend-impl.h"
|
||||
#import "ggml.h"
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
@ -885,7 +885,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// create multiple command buffers and enqueue them
|
||||
// then, we encode the graph into the command buffers in parallel
|
||||
|
||||
const int n_nodes = gf->n_nodes;
|
||||
const int n_nodes = gf->n_nodes;
|
||||
const int n_cb = ctx->n_cb;
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
|
||||
#include <math.h>
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include "ggml-rpc.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <cinttypes>
|
||||
|
@ -33,7 +33,7 @@
|
||||
#include <sycl/half_type.hpp>
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-sycl/backend.hpp"
|
||||
|
@ -21,7 +21,7 @@
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-vulkan-shaders.hpp"
|
||||
|
113
ggml/src/ggml.c
113
ggml/src/ggml.c
@ -3,6 +3,7 @@
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-aarch64.h"
|
||||
@ -288,6 +289,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
#define GGML_DEBUG 0
|
||||
#define GGML_GELU_FP16
|
||||
#define GGML_GELU_QUICK_FP16
|
||||
#define GGML_N_TASKS_MAX (-1)
|
||||
|
||||
#define GGML_SOFT_MAX_UNROLL 4
|
||||
#define GGML_VEC_DOT_UNROLL 2
|
||||
@ -1121,21 +1123,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
#define GGML_F32x4_ADD vaddq_f32
|
||||
#define GGML_F32x4_MUL vmulq_f32
|
||||
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = GGML_F32x4_REDUCE_ONE(x[0]); \
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
@ -1162,30 +1164,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
||||
#define GGML_F16x8_ADD vaddq_f16
|
||||
#define GGML_F16x8_MUL vmulq_f16
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
||||
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
||||
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x8
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
||||
@ -1894,6 +1896,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
||||
#endif
|
||||
|
||||
//
|
||||
// ggml object
|
||||
//
|
||||
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
//
|
||||
// ggml context
|
||||
//
|
||||
@ -19408,6 +19427,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
|
||||
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
||||
}
|
||||
|
||||
int ggml_graph_size(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->size;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
|
||||
if (i < 0) {
|
||||
GGML_ASSERT(cgraph->n_nodes + i >= 0);
|
||||
return cgraph->nodes[cgraph->n_nodes + i];
|
||||
}
|
||||
|
||||
GGML_ASSERT(i < cgraph->n_nodes);
|
||||
return cgraph->nodes[i];
|
||||
}
|
||||
|
||||
struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->nodes;
|
||||
}
|
||||
|
||||
int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->n_nodes;
|
||||
}
|
||||
|
||||
void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(cgraph->size > cgraph->n_nodes);
|
||||
cgraph->nodes[cgraph->n_nodes] = tensor;
|
||||
cgraph->n_nodes++;
|
||||
}
|
||||
|
||||
// Android's libc implementation "bionic" does not support setting affinity
|
||||
#if defined(__gnu_linux__)
|
||||
static void set_numa_thread_affinity(int thread_n) {
|
||||
|
614
src/ggml-cpu-impl.h
Normal file
614
src/ggml-cpu-impl.h
Normal file
@ -0,0 +1,614 @@
|
||||
#pragma once
|
||||
|
||||
// GGML CPU internal header
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
//#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
|
||||
#else
|
||||
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#define m512i(p) (__m512i)(p)
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
#define __FMA__
|
||||
#endif
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
|
||||
#else
|
||||
|
||||
typedef __fp16 ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 32-bit ARM compatibility
|
||||
|
||||
// vaddlvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
||||
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
||||
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
ggml_fp16_internal_t tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
ggml_fp16_internal_t tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch64)
|
||||
#if defined(__loongarch_asx)
|
||||
#include <lasxintrin.h>
|
||||
#endif
|
||||
#if defined(__loongarch_sx)
|
||||
#include <lsxintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
|
||||
typedef union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} ft_union;
|
||||
|
||||
/* float type data load instructions */
|
||||
static __m128 __lsx_vreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
|
||||
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float exp_scale = 0x1.0p-112f;
|
||||
#else
|
||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||
#endif
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
#else
|
||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||
#endif
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // __F16C__
|
||||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32)
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return ggml_table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_FP32_TO_FP16)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user