From 49a8dd6732760bba127898c4bf77cb5535873b29 Mon Sep 17 00:00:00 2001
From: Thomas Fitzsimmons
Date: Tue, 3 Jan 2023 00:45:43 -0500
Subject: [PATCH] ggml : reorganize POWER9 ppc64le SIMD code

---
 ggml.c | 165 ++++++++----------------------------------------------------
 1 file changed, 23 insertions(+), 142 deletions(-)

diff --git a/ggml.c b/ggml.c
index 3ae6aef4..497e7341 100644
--- a/ggml.c
+++ b/ggml.c
@@ -528,23 +528,21 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 #elif defined(__POWER9_VECTOR__)
 
-// TODO: uncomment this when it works
-//#define GGML_SIMD
+#define GGML_SIMD
 
 // F32 POWER9
 
 #define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
+#define GGML_F32_EPR  4
 
-// TODO: not tested !!
-#define GGML_F32x4         __vector float
-#define GGML_F32x4_ZERO    (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
-#define GGML_F32x4_SET1(x) (__vector float){x, x, x, x}
-#define GGML_F32x4_LOAD    vec_vsx_ld
-#define GGML_F32x4_STORE   vec_vsx_st
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
 #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD     vec_add
-#define GGML_F32x4_MUL     vec_mul
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
 #define GGML_F32x4_REDUCE(res, x)              \
 {                                              \
     for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
@@ -573,8 +571,20 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
 
 // F16 POWER9
-// TODO: implement here
-// ...
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
+  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+  vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_F16_VEC_STORE(p, r, i) \
+  if (i & 0x1)                      \
+    vec_xst(vec_pack_to_short_fp32(r[i], r[i - 1]), 0, p - GGML_F16_EPR)
 
 #elif defined(__wasm_simd128__)
 
@@ -777,76 +787,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     for (int i = np; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    //       being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-
-    vector float sum0 = vec_splats (0.0f);
-    vector float sum1 = vec_splats (0.0f);
-    vector float sum2 = vec_splats (0.0f);
-    vector float sum3 = vec_splats (0.0f);
-    vector float sum4 = vec_splats (0.0f);
-    vector float sum5 = vec_splats (0.0f);
-    vector float sum6 = vec_splats (0.0f);
-    vector float sum7 = vec_splats (0.0f);
-
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned.
-        vector unsigned short x0 = vec_xl(j + 0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_ld(j + 0, y);
-        vector unsigned short y1 = vec_ld(j + 16, y);
-        vector unsigned short y2 = vec_ld(j + 32, y);
-        vector unsigned short y3 = vec_ld(j + 48, y);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        sum0 = vec_madd(fx0l, fy0l, sum0);
-        sum1 = vec_madd(fx0h, fy0h, sum1);
-        sum2 = vec_madd(fx1l, fy1l, sum2);
-        sum3 = vec_madd(fx1h, fy1h, sum3);
-        sum4 = vec_madd(fx2l, fy2l, sum4);
-        sum5 = vec_madd(fx2h, fy2h, sum5);
-        sum6 = vec_madd(fx3l, fy3l, sum6);
-        sum7 = vec_madd(fx3h, fy3h, sum7);
-    }
-
-    sum0 = vec_add(sum0, sum1);
-    sum2 = vec_add(sum2, sum3);
-    sum4 = vec_add(sum4, sum5);
-    sum6 = vec_add(sum6, sum7);
-
-    sum0 = vec_add(sum0, sum2);
-    sum4 = vec_add(sum4, sum6);
-
-    sum0 = vec_add(sum0, sum4);
-
-    sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
-         + vec_extract(sum0, 2) + vec_extract(sum0, 3);
-
-    for (int i = n32; i < n; ++i) {
-        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
-    }
 #else
     for (int i = 0; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
@@ -911,65 +851,6 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
         GGML_ASSERT(false);
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    //       being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned!
-        vector unsigned short x0 = vec_xl(j + 0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_xl(j + 0, y);
-        vector unsigned short y1 = vec_xl(j + 16, y);
-        vector unsigned short y2 = vec_xl(j + 32, y);
-        vector unsigned short y3 = vec_xl(j + 48, y);
-
-        vector float v4 = vec_splats(v);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        fy0l = vec_madd(fx0l, v4, fy0l);
-        fy0h = vec_madd(fx0h, v4, fy0h);
-        fy1l = vec_madd(fx1l, v4, fy1l);
-        fy1h = vec_madd(fx1h, v4, fy1h);
-        fy2l = vec_madd(fx2l, v4, fy2l);
-        fy2h = vec_madd(fx2h, v4, fy2h);
-        fy3l = vec_madd(fx3l, v4, fy3l);
-        fy3h = vec_madd(fx3h, v4, fy3h);
-
-        y0 = vec_pack_to_short_fp32(fy0h, fy0l);
-        y1 = vec_pack_to_short_fp32(fy1h, fy1l);
-        y2 = vec_pack_to_short_fp32(fy2h, fy2l);
-        y3 = vec_pack_to_short_fp32(fy3h, fy3l);
-
-        vec_xst(y0, j + 0, y);
-        vec_xst(y1, j + 16, y);
-        vec_xst(y2, j + 32, y);
-        vec_xst(y3, j + 48, y);
-    }
-
-    for (int i = n32; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
 #else
     for (int i = 0; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
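
Note on the new F16 macros: GGML_F16_EPR is 4 (four floats per vector
register), but a single vec_xl of ggml_fp16_t data brings in 8 halfwords, so
the macros pair consecutive iterations.  An even i widens one half of the
freshly loaded vector (vec_extract_fp32_from_shortl), an odd i reloads the
same 16-byte block from p - GGML_F16_EPR and widens the other half
(vec_extract_fp32_from_shorth), and only an odd i stores, packing r[i - 1]
and r[i] back into 8 halfwords with vec_pack_to_short_fp32.  The standalone
sketch below is not part of the patch (its file name and test values are
invented for illustration); it exercises the same intrinsics so the
widen/narrow round trip can be checked on a POWER9 machine with
"gcc -mcpu=power9 f16-roundtrip.c && ./a.out":

// f16-roundtrip.c
#include <altivec.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    // Eight IEEE half-precision values: 1.0 through 8.0.
    unsigned short f16[8] = {0x3C00, 0x4000, 0x4200, 0x4400,
                             0x4500, 0x4600, 0x4700, 0x4800};

    // Use vec_xl, not vec_ld, so the load address need not be 16-byte aligned.
    vector unsigned short v = vec_xl(0, f16);

    // Widen each half of the vector to four single-precision floats, as
    // GGML_F16_VEC_LOAD does for even (shortl) and odd (shorth) values of i.
    vector float lo = vec_extract_fp32_from_shortl(v);
    vector float hi = vec_extract_fp32_from_shorth(v);

    for (int k = 0; k < 4; ++k)
        printf("lo[%d] = %f  hi[%d] = %f\n",
               k, vec_extract(lo, k), k, vec_extract(hi, k));

    // Narrow back to eight halfwords and store, as GGML_F16_VEC_STORE does
    // with r[i] and r[i - 1] on an odd i.
    unsigned short out[8];
    vec_xst(vec_pack_to_short_fp32(hi, lo), 0, out);
    printf("round trip %s\n", memcmp(f16, out, sizeof(f16)) == 0 ? "ok" : "FAILED");

    return 0;
}

Pairing the iterations this way lets GGML_F16_VEC stay a plain vector float,
which is what allows ggml_vec_dot_f16 and ggml_vec_mad_f16 to use the generic
GGML_SIMD loops in place of the hand-unrolled bodies removed above.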