ggml : riscv: add xtheadvector support (llama/13720)

* ggml : riscv: add xtheadvector support

* ggml : clean up some macro usage
This commit is contained in:
xctan 2025-05-27 21:21:36 +08:00 committed by Georgi Gerganov
parent 2e7a1e3e43
commit 15ae9dc2a4
6 changed files with 455 additions and 35 deletions

View File

@ -129,6 +129,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON) option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON) option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)

View File

@ -357,8 +357,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected") message(STATUS "RISC-V detected")
if (GGML_RVV) if (GGML_RVV)
if (GGML_RV_ZFH) if (GGML_XTHEADVECTOR)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d) list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
elseif (GGML_RV_ZFH)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
else() else()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif() endif()

View File

@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
} }
} }
return; return;
#elif defined(__riscv_v_intrinsic) #elif defined __riscv_v
if (__riscv_vlenb() >= QK4_0) { if (__riscv_vlenb() >= QK4_0) {
const size_t vl = QK4_0; const size_t vl = QK4_0;
@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
} }
return; return;
} }
#elif defined(__riscv_v_intrinsic) #elif defined __riscv_v
if (__riscv_vlenb() >= QK4_0) { if (__riscv_vlenb() >= QK4_0) {
const size_t vl = QK4_0; const size_t vl = QK4_0;

View File

@ -320,21 +320,17 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#ifdef __wasm_simd128__ #ifdef __wasm_simd128__
#include <wasm_simd128.h> #include <wasm_simd128.h>
#else #endif
#ifdef __POWER9_VECTOR__ #ifdef __POWER9_VECTOR__
#include <altivec.h> #include <altivec.h>
#else #endif
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h> #include <intrin.h>
#else #elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#if !defined(__riscv)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic #ifdef __riscv_v_intrinsic
#include <riscv_vector.h> #include <riscv_vector.h>

View File

@ -883,7 +883,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif #endif
} }
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_v)
size_t vl = QK8_0; size_t vl = QK8_0;
@ -1221,7 +1221,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif #endif
} }
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_v)
size_t vl = QK8_1; size_t vl = QK8_1;
@ -2384,7 +2384,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
} }
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_v)
size_t vl = qk / 2; size_t vl = qk / 2;
for (; ib < nb; ++ib) { for (; ib < nb; ++ib) {
@ -2774,7 +2774,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
} }
sumf = hsum_float_8(acc) + summs; sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_v)
size_t vl = qk / 2; size_t vl = qk / 2;
for (; ib < nb; ++ib) { for (; ib < nb; ++ib) {
@ -3121,7 +3121,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
} }
sumf = hsum_float_8(acc); sumf = hsum_float_8(acc);
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_v)
size_t vl; size_t vl;
size_t vlenb = __riscv_vlenb(); size_t vlenb = __riscv_vlenb();
@ -3460,7 +3460,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
} }
sumf = hsum_float_8(acc) + summs; sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_v)
size_t vl; size_t vl;
size_t vlenb = __riscv_vlenb(); size_t vlenb = __riscv_vlenb();
@ -3897,7 +3897,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
} }
sumf = hsum_float_8(accum); sumf = hsum_float_8(accum);
#elif defined(__riscv_v_intrinsic) #elif defined(__riscv_v)
size_t vl = qk; size_t vl = qk;
for (; ib < nb; ++ib) { for (; ib < nb; ++ib) {
@ -5100,14 +5100,111 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf; *s = sumf;
#elif defined __riscv_v_intrinsic #elif defined __riscv_xtheadvector
float sumf = 0;
uint8_t atmp[16];
for (int i = 0; i < nb; ++i) {
const uint8_t * q2 = x[i].qs;
const int8_t * q8 = y[i].qs;
const uint8_t * sc = x[i].scales;
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
uint8_t *patmp = atmp;
int vsums;
int tmp;
__asm__ __volatile__(
"th.vsetvli zero, %[vl16], e8, m1\n\t"
"th.vmv.v.x v8, zero\n\t"
"th.vlb.v v1, (%[sc])\n\t"
"th.vand.vi v0, v1, 0xF\n\t"
"th.vsrl.vi v1, v1, 4\n\t"
"th.vsb.v v0, (%[scale])\n\t"
"th.vwaddu.vx v16, v1, zero\n\t"
"th.vsetvli zero, %[vl16], e16, m2\n\t"
"th.vlh.v v2, (%[bsums])\n\t"
"th.vwmul.vv v4, v16, v2\n\t"
"th.vsetvli zero, %[vl16], e32, m4\n\t"
"th.vredsum.vs v8, v4, v8\n\t"
"th.vmv.x.s %[vsums], v8"
: [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
: [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
, [vl16] "r" (16)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
sumf += dmin * vsums;
int isum = 0;
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vlb.v v0, (%[q2])\n\t"
"th.vsrl.vi v2, v0, 2\n\t"
"th.vsrl.vi v4, v0, 4\n\t"
"th.vsrl.vi v6, v0, 6\n\t"
"th.vand.vi v0, v0, 0x3\n\t"
"th.vand.vi v2, v2, 0x3\n\t"
"th.vand.vi v4, v4, 0x3\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v8, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"th.vsetvli zero, %[vl16], e16, m2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[tmp], 8\n\t"
"th.vsetvli zero, %[tmp], e32, m2\n\t"
"th.vlbu.v v12, (%[scale])\n\t"
"th.vmul.vv v10, v10, v12\n\t"
"th.vredsum.vs v0, v10, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [isum] "+&r" (isum)
: [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
, [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q2 += 32; q8 += 128; patmp += 8;
}
sumf += dall * isum;
}
*s = sumf;
#elif defined __riscv_v
float sumf = 0;
uint8_t atmp[16];
const int vector_length = __riscv_vlenb() * 8; const int vector_length = __riscv_vlenb() * 8;
float sumf = 0;
uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
uint8_t atmp[16];
switch (vector_length) { switch (vector_length) {
case 256: case 256:
@ -6137,14 +6234,141 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf; *s = sumf;
#elif defined __riscv_v_intrinsic #elif defined __riscv_xtheadvector
uint32_t aux[3];
uint32_t utmp[4]; uint32_t utmp[4];
const int vector_length = __riscv_vlenb() * 8;
float sumf = 0; float sumf = 0;
for (int i = 0; i < nb; ++i) {
const uint8_t * restrict q3 = x[i].qs;
const uint8_t * restrict qh = x[i].hmask;
const int8_t * restrict q8 = y[i].qs;
int8_t * scale = (int8_t *)utmp;
int tmp;
__asm__ __volatile__(
"li %[tmp], 12\n\t"
"th.vsetvli zero, %[tmp], e8, m1\n\t"
"th.vlb.v v0, (%[s6b])\n\t"
"th.vmv.v.v v2, v0\n\t"
"li %[tmp], 2\n\t"
"th.vsetvli zero, %[tmp], e64, m1\n\t"
"th.vmv.v.x v9, %[sh]\n\t"\
"th.vslidedown.vi v1, v0, 1\n\t"
"th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
"th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vid.v v9\n\t"
"th.vmv.x.s %[tmp], v1\n\t"
"th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
"th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
"th.vsrl.vv v4, v1, v9\n\t"
"th.vsrl.vv v2, v0, v8\n\t"
"th.vand.vx v5, v4, %[kmask1]\n\t"
"th.vand.vx v3, v2, %[kmask2]\n\t"
"th.vsll.vi v6, v5, 4\n\t"
"th.vor.vv v7, v6, v3\n\t"
"li %[tmp], 16\n\t"
"th.vsetvli zero, %[tmp], e8, m1\n\t"
"th.vsub.vx v0, v7, %[c]\n\t"
"th.vsb.v v0, (%[scale])"
: [tmp] "=&r" (tmp)
: [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
, [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
uint8_t m = 1;
int isum = 0;
for (int j = 0; j < QK_K; j += 128) {
__asm__ __volatile__(
// fixme: use v0p7 mask layout directly
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vlb.v v8, (%[q3])\n\t"
"th.vsrl.vi v10, v8, 2\n\t"
"th.vsrl.vi v12, v8, 4\n\t"
"th.vsrl.vi v14, v8, 6\n\t"
"th.vand.vi v8, v8, 3\n\t"
"th.vand.vi v10, v10, 3\n\t"
"th.vand.vi v12, v12, 3\n\t"
"th.vlb.v v2, (%[qh])\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v8, v8, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v10, v10, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v12, v12, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v14, v14, -4, v0.t\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v0, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"li %[tmp], 16\n\t"
"th.vsetvli zero, %[tmp], e16, m2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[tmp], 8\n\t"
"th.vsetvli zero, %[tmp], e32, m2\n\t"
"th.vlb.v v12, (%[scale])\n\t"
"th.vmul.vv v10, v10, v12\n\t"
"th.vredsum.vs v0, v10, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
: [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
, [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q3 += 32; q8 += 128; scale += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
sumf += d * isum;
}
*s = sumf;
#elif defined __riscv_v
uint32_t utmp[4];
float sumf = 0;
uint32_t aux[3];
const int vector_length = __riscv_vlenb() * 8;
switch (vector_length) { switch (vector_length) {
case 256: case 256:
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
@ -6331,7 +6555,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
"vslideup.vi v13, v14, 1\n\t" "vslideup.vi v13, v14, 1\n\t"
"vslideup.vi v10, v8, 2\n\t" "vslideup.vi v10, v8, 2\n\t"
"vslideup.vi v11, v13, 2\n\t" "vslideup.vi v11, v13, 2\n\t"
"vsetivli zero, 8, e32, m2\n\t"\ "vsetivli zero, 8, e32, m2\n\t"
"vle8.v v15, (%[scale])\n\t" "vle8.v v15, (%[scale])\n\t"
"vsext.vf4 v12, v15\n\t" "vsext.vf4 v12, v15\n\t"
"vmul.vv v10, v10, v12\n\t" "vmul.vv v10, v10, v12\n\t"
@ -7180,14 +7404,130 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
#elif defined __riscv_v_intrinsic #elif defined __riscv_xtheadvector
const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2]; const uint8_t * mins = (const uint8_t*)&utmp[2];
const int vector_length = __riscv_vlenb() * 8;
float sumf = 0; float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
int tmp, tmp2, sumi;
__asm__ __volatile__(
"li %[t1], 12\n\t"
"th.vsetvli zero, %[t1], e8, m1\n\t"
"th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
"li %[t1], 4\n\t"
"th.vsetvli zero, %[t1], e32, m1\n\t"
"th.vslidedown.vi v2, v1, 2\n\t"
"th.vmv.v.v v3, v2\n\t"
"th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
"li %[t1], 2\n\t"
"th.vsetvli zero, %[t1], e32, m1\n\t"
"th.vmv.v.i v4, 4\n\t"
"th.vand.vx v8, v1, %[kmask1]\n\t"
"th.vslide1up.vx v5, v4, zero\n\t" // {0, 4}
"th.vsrl.vi v6, v1, 6\n\t"
"th.vsrl.vv v7, v2, v5\n\t"
"th.vand.vx v0, v6, %[kmask3]\n\t"
"th.vand.vx v2, v7, %[kmask2]\n\t"
"th.vsll.vi v6, v0, 4\n\t"
"li %[t2], 8\n\t"
"addi %[t1], %[utmp], 4\n\t"
"th.vor.vv v1, v6, v2\n\t"
"th.vssw.v v8, (%[utmp]), %[t2]\n\t"
"th.vssw.v v1, (%[t1]), %[t2]\n\t"
"th.vsetvli zero, zero, e32, m2\n\t" // vl == 8
"th.vlw.v v2, (%[bsums])\n\t"
"th.vsetvli zero, %[t2], e16, m1\n\t"
"th.vnsrl.vi v0, v2, 0\n\t"
"th.vnsrl.vi v1, v2, 16\n\t"
"th.vadd.vv v2, v0, v1\n\t"
"th.vlbu.v v4, (%[mins])\n\t"
"th.vwmul.vv v6, v4, v2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vsetvli zero, %[t2], e32, m2\n\t"
"th.vredsum.vs v0, v6, v0\n\t"
"th.vmv.x.s %[sumi], v0"
: [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
: [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
, [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
sumf -= dmin * sumi;
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
sumi = 0;
const uint8_t * scale = scales;
for (int j = 0; j < QK_K/128; ++j) {
int vl128 = 128, vl64 = 64, vl32 = 32;
__asm__ __volatile__(
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v8, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vlb.v v0, (%[q4])\n\t"
"th.vsrl.vi v4, v0, 4\n\t"
"th.vand.vi v0, v0, 0xF\n\t"
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vwmul.vv v28, v6, v14\n\t"
"th.vwmul.vv v20, v4, v10\n\t"
"th.vwmul.vv v24, v2, v12\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vlbu.v v1, (%[scale])\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vsetvli zero, %[vl32], e16, m4\n\t"
"th.vwredsum.vs v6, v24, v0\n\t"
"th.vwredsum.vs v7, v28, v0\n\t"
"th.vwredsum.vs v4, v16, v0\n\t"
"th.vwredsum.vs v5, v20, v0\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v6, v7, 1\n\t"
"th.vslideup.vi v4, v5, 1\n\t"
"th.vslideup.vi v4, v6, 2\n\t"
"th.vmul.vv v8, v4, v1\n\t"
"th.vredsum.vs v0, v8, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[sumi], %[sumi], %[tmp]"
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q4 += 64; q8 += 128; scale += 4;
}
sumf += d * sumi;
}
*s = sumf;
#elif defined __riscv_v
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
float sumf = 0;
const int vector_length = __riscv_vlenb() * 8;
switch (vector_length) { switch (vector_length) {
case 256: case 256:
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {
@ -8074,7 +8414,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf; *s = sumf;
#elif defined __riscv_v_intrinsic #elif defined __riscv_v
const uint8_t * scales = (const uint8_t*)&utmp[0]; const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2]; const uint8_t * mins = (const uint8_t*)&utmp[2];
@ -9232,11 +9572,92 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
} }
*s = sumf; *s = sumf;
#elif defined __riscv_v_intrinsic #elif defined __riscv_xtheadvector
const int vector_length = __riscv_vlenb() * 8;
float sumf = 0; float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
const int8_t * restrict scale = x[i].scales;
int sum_t = 0;
int t0;
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32
"th.vlb.v v4, (%[qh])\n\t"
"th.vsll.vi v0, v4, 4\n\t"
"th.vsll.vi v2, v4, 2\n\t"
"th.vsrl.vi v6, v4, 2\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
"th.vlb.v v8, (%[q6])\n\t"
"th.vsrl.vi v12, v8, 4\n\t"
"th.vand.vi v8, v8, 0xF\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128
"th.vand.vx v0, v0, %[mask]\n\t"
"th.vor.vv v8, v8, v0\n\t"
"th.vlb.v v0, (%[q8])\n\t"
"th.vsub.vx v8, v8, %[vl32]\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"li %[t0], 16\n\t"
"th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[t0], 4\n\t"
"th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[t0], 8\n\t"
"th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8
"th.vlb.v v4, (%[scale])\n\t"
"th.vmul.vv v2, v4, v10\n\t"
"th.vredsum.vs v0, v2, v0\n\t"
"th.vmv.x.s %[t0], v0\n\t"
"add %[sumi], %[sumi], %[t0]"
: [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
: [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
, [mask] "r" (0x30)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q6 += 64; qh += 32; q8 += 128; scale += 8;
}
sumf += d * sum_t;
}
*s = sumf;
#elif defined __riscv_v
float sumf = 0;
const int vector_length = __riscv_vlenb() * 8;
switch (vector_length) { switch (vector_length) {
case 256: case 256:
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {

View File

@ -386,7 +386,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
return r; return r;
} }
#elif defined(__riscv) && defined(GGML_RV_ZFH) #elif defined(__riscv) && defined(__riscv_zfhmin)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
float f; float f;