mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-04-22 10:18:49 +02:00)

commit 8ca67df291 (parent: fc6d343e76)

ggml : sync/merge cmake,riscv,powerpc, add common.cmake (ggml/0)
@@ -127,6 +127,7 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
+option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
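Note: GGML_RV_ZFH is new in this commit. It is consumed twice further down: the riscv64 branch of the CPU backend switches the -march string to rv64gcv_zfhmin and defines GGML_RV_ZFH, and the FP16 conversion helpers in the final hunk use that define to select an inline-assembly fast path.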
ggml/cmake/common.cmake (new file, 26 lines)
@@ -0,0 +1,26 @@
+function(ggml_get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")
+
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
+
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            list(APPEND C_FLAGS -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS   -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            list(APPEND CXX_FLAGS -Wextra-semi)
+        endif()
+    endif()
+
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()
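Note: ggml_get_flags() returns its results through GF_C_FLAGS and GF_CXX_FLAGS in the caller's scope via PARENT_SCOPE. Callers would invoke it with a compiler id and version, for example ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}), and then append the two lists to their compile options; the call sites themselves are not part of this diff.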
@@ -287,18 +287,26 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             endif()
         endif()
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
     message(STATUS "PowerPC detected")
     if (GGML_NATIVE)
-        execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M)
-        if (${POWER_M} MATCHES "POWER10")
-            list(APPEND ARCH_FLAGS -mcpu=power10)
-        elseif (${POWER_M} MATCHES "POWER9")
-            list(APPEND ARCH_FLAGS -mcpu=power9)
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+            file(READ "/proc/cpuinfo" POWER10_M)
+        elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
+            execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
+        endif()
+
+        string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+        string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
+
+        if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+            list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+        elseif (EXTRACTED_NUMBER EQUAL 9)
+            list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
         elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
             list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
         else()
-            list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native)
+            list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
         endif()
     else()
         if (GGML_CPU_POWERPC_CPUTYPE)
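Note: the old branch grepped /proc/cpuinfo for a POWER marker and recognized only POWER9 and POWER10 by exact match. The rewrite first chooses a probe per platform (reading /proc/cpuinfo on ppc64 Linux, or prtconf on powerpc hosts), then extracts the numeric POWER generation with a regex, so anything reporting POWER10 or newer lands on the -mcpu=power10 flags, and each branch now passes -mpowerpc64 explicitly.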
@@ -318,8 +326,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
         message(STATUS "RISC-V detected")
         if (GGML_RVV)
+            if (GGML_RV_ZFH)
+                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
+            else()
                 list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
             endif()
+        endif()
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
         message(STATUS "s390x detected")
         file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
@@ -891,15 +891,15 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
     }
 #elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
+    size_t vl = QK8_0;

     for (int i = 0; i < nb; i++) {
         // load elements
-        vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
+        vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_0, vl);

-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl);
         vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
-        vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl);
         float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);

         const float d = amax / ((1 << 7) - 1);
@@ -907,14 +907,14 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

         y[i].d = GGML_FP32_TO_FP16(d);

-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+        vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);

         // convert to integer
-        vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+        vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl);
+        vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl);

         // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+        __riscv_vse8_v_i8m2(y[i].qs , vs, vl);
     }

 #elif defined(__POWER9_VECTOR__)
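The change above doubles the register-group multiplier (LMUL) from m4 to m8 and sets vl to the block size directly instead of clamping through vsetvl; this assumes VLEN is at least 128 bits, so one e32/m8 group holds all 32 floats. For orientation, a scalar sketch of what the vector loop computes (hypothetical helper; the fp16 packing of the scale is elided):

#include <math.h>
#include <stdint.h>

#define QK8_0 32

static void quantize_row_q8_0_scalar(const float * x, int8_t * qs, float * d_out, int nb) {
    for (int i = 0; i < nb; i++) {
        float amax = 0.0f;                        // max(|x|) over the block (the vfredmax above)
        for (int j = 0; j < QK8_0; j++) {
            const float v = fabsf(x[i*QK8_0 + j]);
            if (v > amax) amax = v;
        }
        const float d  = amax / ((1 << 7) - 1);   // amax maps to 127
        const float id = d ? 1.0f/d : 0.0f;
        d_out[i] = d;                             // ggml stores GGML_FP32_TO_FP16(d)
        for (int j = 0; j < QK8_0; j++) {
            qs[i*QK8_0 + j] = (int8_t)roundf(x[i*QK8_0 + j] * id);
        }
    }
}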
@@ -1229,15 +1229,15 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
     }
 #elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
+    size_t vl = QK8_1;

     for (int i = 0; i < nb; i++) {
         // load elements
-        vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
+        vfloat32m8_t v_x = __riscv_vle32_v_f32m8(x+i*QK8_1, vl);

-        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m8_t vfabs = __riscv_vfabs_v_f32m8(v_x, vl);
         vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl);
-        vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m8_f32m1(vfabs, tmp, vl);
         float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);

         const float d = amax / ((1 << 7) - 1);
@@ -1245,18 +1245,18 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

         y[i].d = GGML_FP32_TO_FP16(d);

-        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+        vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);

         // convert to integer
-        vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
-        vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+        vint16m4_t vi = __riscv_vfncvt_x_f_w_i16m4(x0, vl);
+        vint8m2_t vs = __riscv_vncvt_x_x_w_i8m2(vi, vl);

         // store result
-        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+        __riscv_vse8_v_i8m2(y[i].qs , vs, vl);

         // compute sum for y[i].s
         vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
-        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
+        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m2_i16m1(vs, tmp2, vl);

         // set y[i].s
         int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
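q8_1 mirrors the q8_0 changes, and the reduction feeding y[i].s now sums an m2 source (vwredsum_vs_i8m2_i16m1). The extra per-block value it computes is sketched below (illustrative helper, not a ggml API):

#include <stdint.h>

#define QK8_1 32

// s = d * sum(qs); storing it lets a q4_1 dot product fold the block
// minimum into one multiply. ggml packs the result with GGML_FP32_TO_FP16.
static float q8_1_block_sum(const int8_t qs[QK8_1], float d) {
    int sum = 0;
    for (int j = 0; j < QK8_1; j++) {
        sum += qs[j];
    }
    return d * (float)sum;
}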
@@ -2391,33 +2391,31 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

     sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
 #elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+    size_t vl = qk / 2;

     for (; ib < nb; ++ib) {
         // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl);

-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl);

         // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);

-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);

         // subtract offset
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
+        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
+        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);

-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl);

         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);

         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
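Besides widening the mf2 operands to m1, the hunk fuses the second widening multiply into a multiply-accumulate (vwmacc), so a single i16-to-i32 reduction replaces the old chained pair. A scalar sketch of the per-block dot product being vectorized (hypothetical helper):

#include <stdint.h>

#define QK8_0 32

// Each byte of qs packs two 4-bit values; both halves carry a -8 offset.
static int q4_0_q8_0_block_dot(const uint8_t qs[QK8_0/2], const int8_t q8[QK8_0]) {
    int sumi = 0;
    for (int j = 0; j < QK8_0/2; j++) {
        const int v0 = (qs[j] & 0x0F) - 8;   // low nibble  -> elements 0..15
        const int v1 = (qs[j] >> 4)   - 8;   // high nibble -> elements 16..31
        sumi += v0 * q8[j] + v1 * q8[j + QK8_0/2];
    }
    return sumi;                             // caller scales by d(x) * d(y)
}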
@@ -2783,29 +2781,27 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

     sumf = hsum_float_8(acc) + summs;
 #elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
+    size_t vl = qk / 2;

     for (; ib < nb; ++ib) {
         // load elements
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
+        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[ib].qs, vl);

-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
+        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
+        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[ib].qs+16, vl);

         // mask and store lower part of x, and then upper part
-        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
+        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
+        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);

-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
+        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
+        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);

-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
+        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
+        vint16m2_t vec_mul2 = __riscv_vwmacc_vv_i16m2(vec_mul1, v1, y1, vl);

         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);

         int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
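Note: q4_1 receives the identical restructuring. Unlike q4_0 there is no -8 offset to subtract; the unsigned nibbles are used as-is and the block minimum is accounted for by the summs term (built from x[ib].m and y[ib].s) outside this hunk.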
@@ -3132,63 +3128,31 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

     sumf = hsum_float_8(acc);
 #elif defined(__riscv_v_intrinsic)
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // These temporary registers are for masking and shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
-
-    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
-    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+    size_t vl;
+    size_t vlenb = __riscv_vlenb();

     for (; ib < nb; ++ib) {
-        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
-
-        // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-
-        // ((qh & (1u << (j + 16))) >> (j + 12));
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
-        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
-        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+        vl = qk / 2;
+        vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl);
+        vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl));
+        vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl));
+        vint8m2_t v0c;
+        if (vlenb == 16) {
+            v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h);
+        } else {
+            v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32);
+            v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l);
+        }

+        vl = qk;
+        vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl);
+        qh = __riscv_vmnand_mm_b4(qh, qh, vl);
+        vint8m2_t v0f = __riscv_vsub_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl);
+        vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl);
+        vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl);
+        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
+        int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);

         sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi;
     }
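The rewrite drops the whole shift-and-narrow pipeline for the 32 high bits in favour of a mask trick: vlm loads x[ib].qh directly as a vector mask, vmnand inverts it, and the masked vsub subtracts 0x10 on exactly the lanes whose high bit is clear; lanes with the bit set already decode to the plain nibble. The vlenb test only handles register layout: with 16-byte registers the two nibble halves occupy the two m1 registers of an m2 group (vcreate), otherwise a slideup packs them into one register first. A scalar sketch of the decoding being vectorized (hypothetical helper):

#include <stdint.h>
#include <string.h>

#define QK5_0 32

// Bit j of qh is the 5th (high) bit of element j; the offset is -16.
static void q5_0_block_decode(const uint8_t qs[QK5_0/2], const uint8_t qh_bytes[4], int8_t out[QK5_0]) {
    uint32_t qh;
    memcpy(&qh, qh_bytes, sizeof(qh));
    for (int j = 0; j < QK5_0/2; j++) {
        const uint8_t xh0 = (uint8_t)(((qh >> j) << 4) & 0x10);   // high bit, low half
        const uint8_t xh1 = (uint8_t)((qh >> (j + 12)) & 0x10);   // high bit, high half
        out[j]           = (int8_t)(((qs[j] & 0x0F) | xh0) - 16);
        out[j + QK5_0/2] = (int8_t)(((qs[j] >> 4)   | xh1) - 16);
    }
}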
@@ -3503,60 +3467,30 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi

     sumf = hsum_float_8(acc) + summs;
 #elif defined(__riscv_v_intrinsic)
-    uint32_t qh;
-
-    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-
-    // temporary registers for shift operations
-    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
-    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+    size_t vl;
+    size_t vlenb = __riscv_vlenb();

     for (; ib < nb; ++ib) {
-        memcpy(&qh, x[ib].qh, sizeof(uint32_t));
-
-        // load qh
-        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
-
-        // ((qh >> (j + 0)) << 4) & 0x10;
-        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
-        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
-        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
-
-        // ((qh >> (j + 12)) ) & 0x10;
-        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
-        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
-
-        // narrowing
-        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
-        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
-
-        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
-        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
-
-        // load
-        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl);
-
-        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl);
-        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl);
-
-        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
-        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
-
-        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
-        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
-
-        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
-        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
-
-        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
-        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
-
-        vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
-
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
-
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
+        vl = qk / 2;
+        vuint8m1_t v0 = __riscv_vle8_v_u8m1(x[ib].qs, vl);
+        vint8m1_t v0l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(v0, 0x0F, vl));
+        vint8m1_t v0h = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(v0, 4, vl));
+        vint8m2_t v0c;
+        if (vlenb == 16) {
+            v0c = __riscv_vcreate_v_i8m1_i8m2(v0l, v0h);
+        } else {
+            v0l = __riscv_vslideup_vx_i8m1(v0l, v0h, 16, 32);
+            v0c = __riscv_vlmul_ext_v_i8m1_i8m2(v0l);
+        }

+        vl = qk;
+        vbool4_t qh = __riscv_vlm_v_b4(x[ib].qh, vl);
+        vint8m2_t v0f = __riscv_vor_vx_i8m2_mu(qh, v0c, v0c, 0x10, vl);
+        vint8m2_t v1 = __riscv_vle8_v_i8m2(y[ib].qs, vl);
+        vint16m4_t mul = __riscv_vwmul_vv_i16m4(v0f, v1, vl);
+        vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, vl);
+        vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
+        int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);

         sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
     }
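Note: q5_1 receives the same mask-based rewrite with one difference: its values are unsigned (0..31, offset by the block minimum m), so the high bit is OR-ed in with a masked vor on the lanes whose qh bit is set, instead of inverting the mask and subtracting 16 as q5_0 does.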
@@ -3970,17 +3904,17 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi

     sumf = hsum_float_8(accum);
 #elif defined(__riscv_v_intrinsic)
-    size_t vl = __riscv_vsetvl_e8m1(qk);
+    size_t vl = qk;

     for (; ib < nb; ++ib) {
         // load elements
-        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[ib].qs, vl);
-        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[ib].qs, vl);
+        vint8m2_t bx_0 = __riscv_vle8_v_i8m2(x[ib].qs, vl);
+        vint8m2_t by_0 = __riscv_vle8_v_i8m2(y[ib].qs, vl);

-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
+        vint16m4_t vw_mul = __riscv_vwmul_vv_i16m4(bx_0, by_0, vl);

         vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
-        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
+        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m4_i32m1(vw_mul, v_zero, vl);

         int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
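Note: the same pattern as the quantize routines above: vl becomes the full 32-element block, the 8-bit operands move from m1 to m2, and the widening multiply and reduction follow along (i16m4 feeding a single i32m1 reduction).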
@@ -5174,12 +5108,16 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 #elif defined __riscv_v_intrinsic

+    const int vector_length = __riscv_vlenb() * 8;
     float sumf = 0;

     uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                             1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+    uint8_t atmp[16];

+    switch (vector_length) {
+    case 256:
     for (int i = 0; i < nb; ++i) {

         const uint8_t * q2 = x[i].qs;
         const int8_t * q8 = y[i].qs;
         const uint8_t * sc = x[i].scales;
@@ -5246,12 +5184,106 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

         isum += __riscv_vmv_x_s_i32m1_i32(isum1);

-        q2+=32; q8+=128; is=8;
+        q2 += 32;
+        q8 += 128;
+        is = 8;
     }

     sumf += dall * isum;
+    }
+    break;
+    case 128:
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * q2 = x[i].qs;
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+        uint8_t *patmp = atmp;
+        int vsums;
+        int tmp;
+        __asm__ __volatile__(
+            "vsetivli zero, 16, e8, m1\n\t"
+            "vmv.v.x v8, zero\n\t"
+            "vle8.v v1, (%[sc])\n\t"
+            "vand.vi v0, v1, 0xF\n\t"
+            "vsrl.vi v1, v1, 4\n\t"
+            "vse8.v v0, (%[scale])\n\t"
+            "vsetivli zero, 16, e16, m2\n\t"
+            "vle16.v v2, (%[bsums])\n\t"
+            "vzext.vf2 v0, v1\n\t"
+            "vwmul.vv v4, v0, v2\n\t"
+            "vsetivli zero, 16, e32, m4\n\t"
+            "vredsum.vs v8, v4, v8\n\t"
+            "vmv.x.s %[vsums], v8"
+            : [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
+            : [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
+            : "memory"
+            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+        );
+        sumf += dmin * vsums;
+        int isum = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __asm__ __volatile__(
+                "vsetvli zero, %[vl32], e8, m2\n\t"
+                "vle8.v v0, (%[q2])\n\t"
+                "vsrl.vi v2, v0, 2\n\t"
+                "vsrl.vi v4, v0, 4\n\t"
+                "vsrl.vi v6, v0, 6\n\t"
+                "vand.vi v0, v0, 0x3\n\t"
+                "vand.vi v2, v2, 0x3\n\t"
+                "vand.vi v4, v4, 0x3\n\t"
+                "vsetvli zero, %[vl128], e8, m8\n\t"
+                "vle8.v v8, (%[q8])\n\t"
+                "vsetvli zero, %[vl64], e8, m4\n\t"
+                "vwmul.vv v16, v0, v8\n\t"
+                "vwmul.vv v24, v4, v12\n\t"
+                "vsetivli zero, 16, e16, m2\n\t"
+                "vmv.v.x v0, zero\n\t"
+                "vwredsum.vs v10, v16, v0\n\t"
+                "vwredsum.vs v9, v18, v0\n\t"
+                "vwredsum.vs v8, v20, v0\n\t"
+                "vwredsum.vs v7, v22, v0\n\t"
+                "vwredsum.vs v11, v24, v0\n\t"
+                "vwredsum.vs v12, v26, v0\n\t"
+                "vwredsum.vs v13, v28, v0\n\t"
+                "vwredsum.vs v14, v30, v0\n\t"
+                "vsetivli zero, 4, e32, m1\n\t"
+                "vslideup.vi v10, v9, 1\n\t"
+                "vslideup.vi v8, v7, 1\n\t"
+                "vslideup.vi v11, v12, 1\n\t"
+                "vslideup.vi v13, v14, 1\n\t"
+                "vslideup.vi v10, v8, 2\n\t"
+                "vslideup.vi v11, v13, 2\n\t"
+                "vsetivli zero, 8, e32, m2\n\t"
+                "vle8.v v15, (%[scale])\n\t"
+                "vzext.vf4 v12, v15\n\t"
+                "vmul.vv v10, v10, v12\n\t"
+                "vredsum.vs v0, v10, v0\n\t"
+                "vmv.x.s %[tmp], v0\n\t"
+                "add %[isum], %[isum], %[tmp]"
+                : [tmp] "=&r" (tmp), [isum] "+&r" (isum)
+                : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
+                , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
+                : "memory"
+                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+            );
+            q2 += 32; q8 += 128; patmp += 8;
+        }
+
+        sumf += dall * isum;
+    }
+    break;
+    default:
+        assert(false && "Unsupported vector length");
+        break;
     }

     *s = sumf;
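q2_K is the first K-quant kernel to gain a vector-length switch: VLEN=256 keeps the existing intrinsic path, while a new VLEN=128 path is written as two hand-scheduled inline-assembly blocks (one unpacks the 4-bit scales and folds in the bsums correction, the other consumes 128 quantized values per iteration). The shared dispatch pattern, as a sketch:

#include <assert.h>
#include <riscv_vector.h>

// __riscv_vlenb() returns the vector register width in bytes, so VLEN in
// bits is vlenb * 8. The same switch now appears in q2_K, q3_K, q4_K and q6_K.
static void dispatch_on_vlen(void) {
    const int vector_length = __riscv_vlenb() * 8;
    switch (vector_length) {
        case 256: /* intrinsics, 32-byte registers */ break;
        case 128: /* inline asm tuned for 16-byte registers */ break;
        default:  assert(false && "Unsupported vector length"); break;
    }
}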
@@ -6116,7 +6148,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     uint32_t aux[3];
     uint32_t utmp[4];

+    const int vector_length = __riscv_vlenb() * 8;
     float sumf = 0;

+    switch (vector_length) {
+    case 256:
+
     for (int i = 0; i < nb; ++i) {

         const uint8_t * GGML_RESTRICT q3 = x[i].qs;
@@ -6208,6 +6244,126 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += d*sum_t;

     }
+    break;
+    case 128:
+    for (int i = 0; i < nb; ++i) {
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict qh = x[i].hmask;
+        const int8_t * restrict q8 = y[i].qs;
+
+        int8_t * scale = (int8_t *)utmp;
+        int tmp;
+        __asm__ __volatile__(
+            "vsetivli zero, 12, e8, m1\n\t"
+            "vle8.v v0, (%[s6b])\n\t"
+            "vmv1r.v v2, v0\n\t"
+            "vsetivli zero, 2, e64, m1\n\t"
+            "vmv.v.x v9, %[sh]\n\t"\
+            "vslidedown.vi v1, v0, 1\n\t"
+            "vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
+            "vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
+            "vsetivli zero, 4, e32, m1\n\t"
+            "vid.v v9\n\t"
+            "vmv.x.s %[tmp], v1\n\t"
+            "vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
+            "vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
+            "vsrl.vv v4, v1, v9\n\t"
+            "vsrl.vv v2, v0, v8\n\t"
+            "vand.vx v5, v4, %[kmask1]\n\t"
+            "vand.vx v3, v2, %[kmask2]\n\t"
+            "vsll.vi v6, v5, 4\n\t"
+            "vor.vv v7, v6, v3\n\t"
+            "vsetivli zero, 16, e8, m1\n\t"
+            "vsub.vx v0, v7, %[c]\n\t"
+            "vse8.v v0, (%[scale])"
+            : [tmp] "=&r" (tmp)
+            : [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
+            , [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
+            : "memory"
+            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+        );
+
+        uint8_t m = 1;
+        int isum = 0;
+        for (int j = 0; j < QK_K; j += 128) {
+            __asm__ __volatile__(
+                "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t"
+                "vle8.v v8, (%[q3])\n\t"
+                "vsrl.vi v10, v8, 2\n\t"
+                "vsrl.vi v12, v8, 4\n\t"
+                "vsrl.vi v14, v8, 6\n\t"
+                "vand.vi v8, v8, 3\n\t"
+                "vand.vi v10, v10, 3\n\t"
+                "vand.vi v12, v12, 3\n\t"
+                "vle8.v v2, (%[qh])\n\t"
+                "vand.vx v4, v2, %[m]\n\t"
+                "slli %[m], %[m], 1\n\t"
+                "vmseq.vx v0, v4, zero\n\t"
+                "vadd.vi v8, v8, -4, v0.t\n\t"
+                "vand.vx v4, v2, %[m]\n\t"
+                "slli %[m], %[m], 1\n\t"
+                "vmseq.vx v0, v4, zero\n\t"
+                "vadd.vi v10, v10, -4, v0.t\n\t"
+                "vand.vx v4, v2, %[m]\n\t"
+                "slli %[m], %[m], 1\n\t"
+                "vmseq.vx v0, v4, zero\n\t"
+                "vadd.vi v12, v12, -4, v0.t\n\t"
+                "vand.vx v4, v2, %[m]\n\t"
+                "slli %[m], %[m], 1\n\t"
+                "vmseq.vx v0, v4, zero\n\t"
+                "vadd.vi v14, v14, -4, v0.t\n\t"
+                "vsetvli zero, %[vl128], e8, m8\n\t"
+                "vle8.v v0, (%[q8])\n\t"
+                "vsetvli zero, %[vl64], e8, m4\n\t"
+                "vwmul.vv v16, v0, v8\n\t"
+                "vwmul.vv v24, v4, v12\n\t"
+                "vsetivli zero, 16, e16, m2\n\t"
+                "vmv.v.x v0, zero\n\t"
+                "vwredsum.vs v10, v16, v0\n\t"
+                "vwredsum.vs v9, v18, v0\n\t"
+                "vwredsum.vs v8, v20, v0\n\t"
+                "vwredsum.vs v7, v22, v0\n\t"
+                "vwredsum.vs v11, v24, v0\n\t"
+                "vwredsum.vs v12, v26, v0\n\t"
+                "vwredsum.vs v13, v28, v0\n\t"
+                "vwredsum.vs v14, v30, v0\n\t"
+                "vsetivli zero, 4, e32, m1\n\t"
+                "vslideup.vi v10, v9, 1\n\t"
+                "vslideup.vi v8, v7, 1\n\t"
+                "vslideup.vi v11, v12, 1\n\t"
+                "vslideup.vi v13, v14, 1\n\t"
+                "vslideup.vi v10, v8, 2\n\t"
+                "vslideup.vi v11, v13, 2\n\t"
+                "vsetivli zero, 8, e32, m2\n\t"\
+                "vle8.v v15, (%[scale])\n\t"
+                "vsext.vf4 v12, v15\n\t"
+                "vmul.vv v10, v10, v12\n\t"
+                "vredsum.vs v0, v10, v0\n\t"
+                "vmv.x.s %[tmp], v0\n\t"
+                "add %[isum], %[isum], %[tmp]"
+                : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
+                : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
+                , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
+                : "memory"
+                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+            );
+            q3 += 32; q8 += 128; scale += 8;
+        }
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        sumf += d * isum;
+    }
+    break;
+    default:
+        assert(false && "Unsupported vector length");
+        break;
+    }

     *s = sumf;
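Note: the 128-bit q3_K path decodes the 2-bit values with shifts plus vand.vi, then applies the hmask high bit through vmseq.vx/vadd.vi under the v0.t mask: lanes whose hmask bit is clear get -4 added, reproducing the scalar adjustment without a separate select. The first assembly block reshuffles the 12 packed scale bytes into 16 signed scales entirely in registers before the main loop.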
@@ -6924,8 +7080,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const uint8_t * scales = (const uint8_t*)&utmp[0];
     const uint8_t * mins = (const uint8_t*)&utmp[2];

+    const int vector_length = __riscv_vlenb() * 8;
     float sumf = 0;

+    switch (vector_length) {
+    case 256:
     for (int i = 0; i < nb; ++i) {

         size_t vl = 8;
@@ -6988,6 +7147,115 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += d*(sum_1 + sum_2);

     }
+    break;
+    case 128:
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+
+        int tmp, tmp2, sumi;
+        __asm__ __volatile__(
+            "vsetivli zero, 12, e8, m1\n\t"
+            "vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
+            "vsetivli zero, 4, e32, m1\n\t"
+            "vslidedown.vi v2, v1, 2\n\t"
+            "vmv1r.v v3, v2\n\t"
+            "vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
+            "vsetivli zero, 2, e32, m1\n\t"
+            "vmv.v.i v4, 4\n\t"
+            "vand.vx v8, v1, %[kmask1]\n\t"
+            "vslide1up.vx v5, v4, zero\n\t" // {0, 4}
+            "vsrl.vi v6, v1, 6\n\t"
+            "vsrl.vv v7, v2, v5\n\t"
+            "vand.vx v0, v6, %[kmask3]\n\t"
+            "vand.vx v2, v7, %[kmask2]\n\t"
+            "vsll.vi v6, v0, 4\n\t"
+            "li %[t2], 8\n\t"
+            "addi %[t1], %[utmp], 4\n\t"
+            "vor.vv v1, v6, v2\n\t"
+            "vsse32.v v8, (%[utmp]), %[t2]\n\t"
+            "vsse32.v v1, (%[t1]), %[t2]\n\t"
+            "vsetivli zero, 8, e16, m1\n\t"
+            "vle32.v v2, (%[bsums])\n\t"
+            "vnsrl.wi v0, v2, 0\n\t"
+            "vnsrl.wi v1, v2, 16\n\t"
+            "vadd.vv v2, v0, v1\n\t"
+            "vle8.v v3, (%[mins])\n\t"
+            "vzext.vf2 v4, v3\n\t"
+            "vwmul.vv v6, v4, v2\n\t"
+            "vmv.v.x v0, zero\n\t"
+            "vsetivli zero, 8, e32, m2\n\t"
+            "vredsum.vs v0, v6, v0\n\t"
+            "vmv.x.s %[sumi], v0"
+            : [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
+            : [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
+            , [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
+            , [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
+            : "memory"
+            , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+            , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+            , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+            , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+        );
+        sumf -= dmin * sumi;
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        sumi = 0;
+        const uint8_t * scale = scales;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            int vl128 = 128, vl64 = 64, vl32 = 32;
+            __asm__ __volatile__(
+                "vsetvli zero, %[vl128], e8, m8\n\t"
+                "vle8.v v8, (%[q8])\n\t"
+                "vsetvli zero, %[vl64], e8, m4\n\t"
+                "vle8.v v0, (%[q4])\n\t"
+                "vsrl.vi v4, v0, 4\n\t"
+                "vand.vi v0, v0, 0xF\n\t"
+                "vsetvli zero, %[vl32], e8, m2\n\t"
+                "vwmul.vv v28, v6, v14\n\t"
+                "vwmul.vv v20, v4, v10\n\t"
+                "vwmul.vv v24, v2, v12\n\t"
+                "vwmul.vv v16, v0, v8\n\t"
+                "vsetivli zero, 4, e32, m1\n\t"
+                "vle8.v v2, (%[scale])\n\t"
+                "vmv.v.x v0, zero\n\t"
+                "vzext.vf4 v1, v2\n\t"
+                "vsetvli zero, %[vl32], e16, m4\n\t"
+                "vwredsum.vs v6, v24, v0\n\t"
+                "vwredsum.vs v7, v28, v0\n\t"
+                "vwredsum.vs v4, v16, v0\n\t"
+                "vwredsum.vs v5, v20, v0\n\t"
+                "vsetivli zero, 4, e32, m1\n\t"
+                "vslideup.vi v6, v7, 1\n\t"
+                "vslideup.vi v4, v5, 1\n\t"
+                "vslideup.vi v4, v6, 2\n\t"
+                "vmul.vv v8, v4, v1\n\t"
+                "vredsum.vs v0, v8, v0\n\t"
+                "vmv.x.s %[tmp], v0\n\t"
+                "add %[sumi], %[sumi], %[tmp]"
+                : [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
+                : [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
+                , [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
+                : "memory"
+                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+            );
+
+            q4 += 64; q8 += 128; scale += 4;
+        }
+
+        sumf += d * sumi;
+    }
+    break;
+    default:
+        assert(false && "Unsupported vector length");
+        break;
+    }

     *s = sumf;
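Note: the 128-bit q4_K path unpacks the eight scale/min pairs with vector shifts and two strided 32-bit stores (vsse32) into utmp, then builds the dmin correction by splitting the 16-bit bsums into even/odd halves with vnsrl.wi, adding adjacent pairs, and taking a widening product against the mins. The main loop is the same widening-multiply and reduction ladder used by the q2_K path above.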
@@ -7722,9 +7990,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
     const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;

-    vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
-    vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
-    vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+    vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
+    vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
+    vint16m1_t q8sums = __riscv_vadd_vv_i16m1(q8sums_0, q8sums_1, vl);

     memcpy(utmp, x[i].scales, 12);
     utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -7733,11 +8001,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
     utmp[2] = uaux;
     utmp[0] &= kmask1;

-    vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
-    vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
-    vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+    vuint8mf2_t mins8 = __riscv_vle8_v_u8mf2(mins, vl);
+    vint16m1_t v_mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
+    vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, v_mins, vl);

-    vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+    vint32m1_t sumi = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
     sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);

     vl = 32;
@@ -7746,43 +8014,42 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

     uint8_t m = 1;
     vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
-    vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
+    vuint8m2_t vqh = __riscv_vle8_v_u8m2(hm, vl);

     for (int j = 0; j < QK_K/64; ++j) {
         // load Q5 and Q8
-        vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
-        vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
-        vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
+        vuint8m2_t q5_x = __riscv_vle8_v_u8m2(q5, vl);
+        vint8m2_t q8_y1 = __riscv_vle8_v_i8m2(q8, vl);
+        vint8m2_t q8_y2 = __riscv_vle8_v_i8m2(q8+32, vl);

         // compute mask for addition
-        vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
-        vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
-        vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-        vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
+        vint8m2_t q5_a = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vx_u8m2(q5_x, 0x0F, vl));
+        vuint8m2_t qh_m1 = __riscv_vand_vx_u8m2(vqh, m, vl);
+        vbool4_t vmask_1 = __riscv_vmsne_vx_u8m2_b4(qh_m1, 0, vl);
+        vint8m2_t q5_m1 = __riscv_vadd_vx_i8m2_mu(vmask_1, q5_a, q5_a, 16, vl);
         m <<= 1;

-        vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
-        vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
-        vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-        vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
+        vint8m2_t q5_l = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vsrl_vx_u8m2(q5_x, 0x04, vl));
+        vuint8m2_t qh_m2 = __riscv_vand_vx_u8m2(vqh, m, vl);
+        vbool4_t vmask_2 = __riscv_vmsne_vx_u8m2_b4(qh_m2, 0, vl);
+        vint8m2_t q5_m2 = __riscv_vadd_vx_i8m2_mu(vmask_2, q5_l, q5_l, 16, vl);
         m <<= 1;

-        vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
-        vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
+        vint16m4_t v0 = __riscv_vwmul_vv_i16m4(q5_m1, q8_y1, vl);
+        vint16m4_t v1 = __riscv_vwmul_vv_i16m4(q5_m2, q8_y2, vl);

-        vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
-        vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
+        vint32m8_t vs1 = __riscv_vwmul_vx_i32m8(v0, scales[is++], vl);
+        vint32m8_t vs2 = __riscv_vwmul_vx_i32m8(v1, scales[is++], vl);

-        vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
-        vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
+        vint32m1_t vacc1 = __riscv_vredsum_vs_i32m8_i32m1(vs1, vzero, vl);
+        vint32m1_t vacc2 = __riscv_vredsum_vs_i32m8_i32m1(vs2, vacc1, vl);

-        aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
+        aux32 += __riscv_vmv_x_s_i32m1_i32(vacc2);
         q5 += 32; q8 += 64;

     }

-    vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
-    sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
+    sums += aux32 * d;

 }
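Note: besides doubling the register groups (u8m1 to u8m2 and onward, up to i32m8 for the scaled products), q5_K chains its two reductions: vacc2 now takes vacc1 as its scalar seed, so element 0 of vacc2 already holds both partial sums and one vmv.x.s suffices. The final accumulation also drops the detour through a one-element float vector in favour of plain sums += aux32 * d.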
@@ -8667,7 +8934,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi

 #elif defined __riscv_v_intrinsic

+    const int vector_length = __riscv_vlenb() * 8;
     float sumf = 0;

+    switch (vector_length) {
+    case 256:
+
     for (int i = 0; i < nb; ++i) {

         const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
@@ -8747,6 +9018,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
         sumf += d * sum_t;

     }
+    break;
+    case 128:
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        int sum_t = 0;
+        int t0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __asm__ __volatile__(
+                "vsetvli zero, %[vl32], e8, m2\n\t"
+                "vle8.v v4, (%[qh])\n\t"
+                "vsll.vi v0, v4, 4\n\t"
+                "vsll.vi v2, v4, 2\n\t"
+                "vsrl.vi v6, v4, 2\n\t"
+                "vsetvli zero, %[vl64], e8, m4\n\t"
+                "vle8.v v8, (%[q6])\n\t"
+                "vsrl.vi v12, v8, 4\n\t"
+                "vand.vi v8, v8, 0xF\n\t"
+                "vsetvli zero, %[vl128], e8, m8\n\t"
+                "vand.vx v0, v0, %[mask]\n\t"
+                "vor.vv v8, v8, v0\n\t"
+                "vle8.v v0, (%[q8])\n\t"
+                "vsub.vx v8, v8, %[vl32]\n\t"
+                "vsetvli zero, %[vl64], e8, m4\n\t"
+                "vwmul.vv v16, v0, v8\n\t"
+                "vwmul.vv v24, v4, v12\n\t"
+                "vsetivli zero, 16, e16, m2\n\t"
+                "vmv.v.x v0, zero\n\t"
+                "vwredsum.vs v10, v16, v0\n\t"
+                "vwredsum.vs v9, v18, v0\n\t"
+                "vwredsum.vs v8, v20, v0\n\t"
+                "vwredsum.vs v7, v22, v0\n\t"
+                "vwredsum.vs v11, v24, v0\n\t"
+                "vwredsum.vs v12, v26, v0\n\t"
+                "vwredsum.vs v13, v28, v0\n\t"
+                "vwredsum.vs v14, v30, v0\n\t"
+                "vsetivli zero, 4, e32, m1\n\t"
+                "vslideup.vi v10, v9, 1\n\t"
+                "vslideup.vi v8, v7, 1\n\t"
+                "vslideup.vi v11, v12, 1\n\t"
+                "vslideup.vi v13, v14, 1\n\t"
+                "vslideup.vi v10, v8, 2\n\t"
+                "vslideup.vi v11, v13, 2\n\t"
+                "vsetivli zero, 8, e32, m2\n\t"
+                "vle8.v v2, (%[scale])\n\t"
+                "vsext.vf4 v4, v2\n\t"
+                "vmul.vv v2, v4, v10\n\t"
+                "vredsum.vs v0, v2, v0\n\t"
+                "vmv.x.s %[t0], v0\n\t"
+                "add %[sumi], %[sumi], %[t0]"
+                : [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
+                : [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
+                , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
+                , [mask] "r" (0x30)
+                : "memory"
+                , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+                , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+                , "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+                , "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+            );
+            q6 += 64; qh += 32; q8 += 128; scale += 8;
+        }
+
+        sumf += d * sum_t;
+
+    }
+    break;
+    default:
+        assert(false && "Unsupported vector length");
+        break;
     }

     *s = sumf;
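Note: the 128-bit q6_K loop assembles each 6-bit value as (ql nibble) | (qh bits << 4) masked with 0x30, then applies the -32 offset with vsub.vx, reusing the register that holds the loop constant 32 (%[vl32]) as the subtrahend to save an operand.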
@@ -381,6 +381,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
     return r;
 }

+#elif defined(__riscv) && defined(GGML_RV_ZFH)
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        float f;
+        __asm__(
+            "fmv.h.x %[f], %[h]\n\t"
+            "fcvt.s.h %[f], %[f]"
+            : [f] "=&f" (f)
+            : [h] "r" (h)
+        );
+        return f;
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+        ggml_fp16_t res;
+        __asm__(
+            "fcvt.h.s %[f], %[f]\n\t"
+            "fmv.x.h %[h], %[f]"
+            : [h] "=&r" (res)
+            : [f] "f" (f)
+        );
+        return res;
+    }
+
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
 #else

 // FP16 <-> FP32
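With GGML_RV_ZFH defined (wired up by the CMake hunks above) and a core implementing Zfh or Zfhmin, each conversion becomes two instructions instead of a table lookup or bit manipulation: fmv.h.x plus fcvt.s.h one way, fcvt.h.s plus fmv.x.h back. A hedged usage sketch (assumes these helpers are in scope; the wrapper is illustrative):

// Round-trips exactly for values representable in fp16.
static inline float fp16_round_trip(float f) {
    return GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(f));
}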