mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-08-10 10:37:56 +02:00)
@@ -1,50 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_KOMPUTE_MAX_DEVICES 16

struct ggml_vk_device {
    int index;
    int type; // same as VkPhysicalDeviceType
    size_t heapSize;
    const char * name;
    const char * vendor;
    int subgroupSize;
    uint64_t bufferAlignment;
    uint64_t maxAlloc;
};

struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);

//
// backend API
//

// forward declaration
typedef struct ggml_backend * ggml_backend_t;

GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);

GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);

#ifdef __cplusplus
}
#endif
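For orientation, here is a minimal host-side sketch of how the API declared above might be used: check for Vulkan, enumerate devices that satisfy a memory requirement, then create the Kompute backend for the first one. This is a hedged illustration only; the header name "ggml-kompute.h", the error handling, and the omission of any cleanup of the returned device array are assumptions, not taken from the file above.

// Hypothetical usage sketch for the Kompute backend API declared above.
#include "ggml-kompute.h"
#include <cstdio>

static ggml_backend_t init_first_kompute_device(size_t memory_required) {
    if (!ggml_vk_has_vulkan()) {
        return nullptr;                 // no usable Vulkan loader/driver
    }
    size_t count = 0;
    struct ggml_vk_device * devs = ggml_vk_available_devices(memory_required, &count);
    if (devs == nullptr || count == 0) {
        return nullptr;                 // nothing satisfies the memory requirement
    }
    std::printf("using %s (%s), heap %zu bytes\n",
                devs[0].name, devs[0].vendor, devs[0].heapSize);
    return ggml_backend_kompute_init(devs[0].index);
}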
@@ -1,30 +0,0 @@
file(GLOB SRC_FILES
    get_row_f32.cpp
    get_row_f16.cpp
    get_row_q4_0.cpp
    get_row_q8_0.cpp
    quantize_f32_q8_0.cpp
    quantize_f16_q8_0.cpp
    quantize_float_to_q4_0.cpp
    dup.cpp
)

set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")

if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

ascendc_library(ascendc_kernels STATIC
    ${SRC_FILES}
)

message(STATUS "CANN: compile ascend kernels with SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
@@ -1,19 +0,0 @@
#ifndef ASCENDC_KERNELS_H
#define ASCENDC_KERNELS_H

#include "aclrtlaunch_ascendc_get_row_f32.h"
#include "aclrtlaunch_ascendc_get_row_f16.h"
#include "aclrtlaunch_ascendc_get_row_q8_0.h"
#include "aclrtlaunch_ascendc_get_row_q4_0.h"

#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"

#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"

#endif // ASCENDC_KERNELS_H
@@ -1,234 +0,0 @@
#include "kernel_operator.h"

using namespace AscendC;

#define BUFFER_NUM 2
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the max block dim supported by the dup kernel is 65535

template <typename SRC_T, typename DST_T>
class DupByRows {
  public:
    __aicore__ inline DupByRows() {}
    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
                                size_t *input_nb_ub) {
        /* Dup by rows when src is contiguous on the first dimension and dst is
           contiguous; each kernel processes one row.
        */

        // Input has four dims.
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();

        // param
        num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
        num_elem = input_ne_ub[0];

        // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
        idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
        idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
                  / (input_ne_ub[1]);
        idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
                  - idx_ne2 * input_ne_ub[1];

        // src may not be contiguous in dims [1,2,3], so the stride is decided by ne & nb
        src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
                     + input_nb_ub[1] * idx_ne1;

        // dst is contiguous
        dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));

        src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
                                                                src_stride));
        dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
                                                                dst_stride));

        pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
                                                32 - 1) / 32 * 32);
        pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
                                                32 - 1) / 32 * 32);
    }

    __aicore__ inline void copy_in() {
        LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
        const size_t elem_per_block = 32 / sizeof(SRC_T);
        size_t tail = num_elem % elem_per_block;
        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
        DataCopy(src_local, src_gm, cpy_elements_len);
        src_queue.EnQue(src_local);
    }

    __aicore__ inline void copy_out() {
        LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
#ifdef ASCEND_310P
        const size_t elem_per_block = 32 / sizeof(DST_T);
        size_t tail = num_elem % elem_per_block;
        size_t len = num_elem & ~(elem_per_block - 1);
        if (len > 0) {
            DataCopy(dst_gm, dst_local, len);
        }
        if (tail != 0) {
            for (size_t i = tail; i < elem_per_block; i++) {
                dst_local[len + i].SetValue(0, 0);
            }
            SetAtomicAdd<float>();
            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
            SetAtomicNone();
        }
#else
        DataCopyExtParams dataCopyParams;
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = num_elem * sizeof(DST_T);
        DataCopyPad(dst_gm, dst_local, dataCopyParams);
#endif
        dst_queue.FreeTensor(dst_local);
    }

    __aicore__ inline void dup() {
        // main process: copy one row of data from src to dst.
        copy_in();

        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();

        int32_t BLOCK_NUM = 32 / sizeof(DST_T);
        DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
                                        / BLOCK_NUM * BLOCK_NUM);
        dst_queue.EnQue<DST_T>(dst_local);

        src_queue.FreeTensor(src_local);
        copy_out();
    }

    __aicore__ inline void dup_with_cast() {
        // main process: copy one row of data from src to dst,
        // casting the dtype from src to dst.
        copy_in();

        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();

        Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
        dst_queue.EnQue<DST_T>(dst_local);

        src_queue.FreeTensor(src_local);
        copy_out();
    }

  private:
    TPipe pipe;
    GlobalTensor<SRC_T> src_gm;
    GlobalTensor<DST_T> dst_gm;

    int64_t num_rows;
    int64_t num_elem;
    int64_t idx_ne3;
    int64_t idx_ne2;
    int64_t idx_ne1;
    int64_t src_stride;
    int64_t dst_stride;

    TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
    GM_ADDR src_gm,
    GM_ADDR dst_gm,
    GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm,
    GM_ADDR output_ne_gm,
    GM_ADDR output_nb_gm) {

    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    DupByRows<half, half> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
    GM_ADDR src_gm,
    GM_ADDR dst_gm,
    GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm,
    GM_ADDR output_ne_gm,
    GM_ADDR output_nb_gm) {
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    DupByRows<float, float> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
    GM_ADDR src_gm,
    GM_ADDR dst_gm,
    GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm,
    GM_ADDR output_ne_gm,
    GM_ADDR output_nb_gm) {

    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    DupByRows<float, half> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup_with_cast();
}

extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
    GM_ADDR src_gm,
    GM_ADDR dst_gm,
    GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm,
    GM_ADDR output_ne_gm,
    GM_ADDR output_nb_gm) {

    // copy params from gm to ub.
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    DupByRows<half, float> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup_with_cast();
}
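To make the index arithmetic in DupByRows::init() easier to follow: each AI core (block index) owns one source row, decomposes its block index into (idx_ne1, idx_ne2, idx_ne3), and turns that into a byte offset using the ne/nb arrays. Below is a minimal host-side C++ sketch of that math only; it has no AscendC dependency, and the example shape and strides in main() are hypothetical.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirror of the offset math in DupByRows::init(); dst_elem_size is sizeof(DST_T).
static void dup_offsets(const int64_t ne[4], const size_t nb[4],
                        int64_t block_idx, size_t dst_elem_size,
                        int64_t *src_byte_off, int64_t *dst_byte_off) {
    const int64_t idx_ne3 = block_idx / (ne[1] * ne[2]);
    const int64_t idx_ne2 = (block_idx - idx_ne3 * ne[1] * ne[2]) / ne[1];
    const int64_t idx_ne1 = block_idx - idx_ne3 * ne[1] * ne[2] - idx_ne2 * ne[1];

    *src_byte_off = nb[3] * idx_ne3 + nb[2] * idx_ne2 + nb[1] * idx_ne1; // src may be strided
    *dst_byte_off = block_idx * (int64_t)(ne[0] * dst_elem_size);        // dst is contiguous
}

int main() {
    const int64_t ne[4] = {8, 3, 2, 2};                          // hypothetical f32 tensor shape
    const size_t  nb[4] = {4, 4 * 8, 4 * 8 * 3, 4 * 8 * 3 * 2};  // contiguous byte strides
    int64_t s = 0, d = 0;
    dup_offsets(ne, nb, /*block_idx=*/5, sizeof(float), &s, &d);
    std::printf("row 5: src offset %lld bytes, dst offset %lld bytes\n",
                (long long)s, (long long)d);
}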
@@ -1,197 +0,0 @@
#include "kernel_operator.h"

// optimize me. Use template to avoid copy code.
using namespace AscendC;

#define BUFFER_NUM 2

class GET_ROW_F16 {
  public:
    __aicore__ inline GET_ROW_F16() {}
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                int64_t *output_ne_ub, size_t *output_nb_ub) {
        // TODO: use a template for f16/f32
        int64_t op_block_num = GetBlockNum();
        op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];

            indices_ne[i] = indices_ne_ub[i];
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];

            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }

        // Indices have two dims. n_elements = total number of rows to get.
        // dr = number of rows this block should get.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;

        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }

        input_gm.SetGlobalBuffer((__gm__ half *)input);
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);

        uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
                                            & ~31);
        uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
                                             & ~31);

        local_buffer_elems = input_local_buffer_size / sizeof(half);

        // TODO: handle rows too long to fit in UB.
        // Buffer sizes are rounded up to 32 bytes; that is fine because the data is 32-byte aligned.
        pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
        pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
    }

    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
        size_t origin_len = len;
        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
        const size_t elem_per_block = 32 / sizeof(half);
        size_t tail = len % elem_per_block;
        len = len & ~(elem_per_block - 1);
        if (tail != 0) {
            len += elem_per_block;
        }
        DataCopy(input_local, input_gm[offset], len);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        const size_t elem_per_block = 32 / sizeof(float);
        size_t tail = len % elem_per_block;
        len = len & ~(elem_per_block - 1);
        if (len > 0) {
            DataCopy(output_gm[offset], output_local, len);
        }

        if (tail != 0) {
#ifdef ASCEND_310P
            for (size_t i = tail; i < elem_per_block; i++) {
                output_local[len + i].SetValue(0, 0);
            }
            SetAtomicAdd<float>();
            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
            SetAtomicNone();
#else
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
#endif
        }
        output_queue.FreeTensor(output_local);
    }

    __aicore__ inline void calculate_row(int64_t idx) {
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);

        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);

        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3];

        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3];

        copy_in(input_offset, input_ne[0]);
        LocalTensor<half> input_local = input_queue.DeQue<half>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();

        Cast(output_local, input_local, RoundMode::CAST_NONE,
             local_buffer_elems);
        output_queue.EnQue(output_local);
        copy_out(output_offset, input_ne[0]);

        input_queue.FreeTensor(input_local);
    }

    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            calculate_row(i);
        }
    }

  private:
    int64_t input_ne[4];
    size_t input_stride[4];

    int64_t indices_ne[4];
    size_t indices_stride[4];

    int64_t output_ne[4];
    size_t output_stride[4];

    size_t local_buffer_elems;

    int64_t ir;
    int64_t dr;

    TPipe pipe;
    GlobalTensor<half> input_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    int64_t op_block_idx;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_get_row_f16(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t indices_ne_ub[4];
    size_t indices_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    GET_ROW_F16 op;
    op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
            indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
    op.calculate();
}
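The dr/ir computation above (and in the other get_row and quantize kernels) splits the rows across blocks so that the first (n % blocks) blocks handle one extra row each. A small host-side C++ sketch of that partitioning, with a hypothetical 10-rows-over-4-blocks example:

#include <cstdint>
#include <cstdio>

// Mirror of the row partitioning in GET_ROW_F16::init(): block `idx` handles
// rows [ir, ir + dr). The first (n_rows % n_blocks) blocks take one extra row.
static void partition_rows(uint64_t n_rows, int64_t n_blocks, int64_t idx,
                           int64_t *ir, int64_t *dr) {
    *dr = (int64_t)(n_rows / n_blocks);
    const uint64_t tails = n_rows % n_blocks;
    if ((uint64_t)idx < tails) {
        *dr += 1;
        *ir = *dr * idx;
    } else {
        *ir = *dr * idx + (int64_t)tails;
    }
}

int main() {
    // 10 rows over 4 blocks -> blocks 0,1 get 3 rows; blocks 2,3 get 2 rows.
    for (int64_t b = 0; b < 4; b++) {
        int64_t ir = 0, dr = 0;
        partition_rows(10, 4, b, &ir, &dr);
        std::printf("block %lld: rows [%lld, %lld)\n",
                    (long long)b, (long long)ir, (long long)(ir + dr));
    }
}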
@@ -1,190 +0,0 @@
#include "kernel_operator.h"

// optimize me. Use template to avoid copy code.
using namespace AscendC;

#define BUFFER_NUM 2

class GET_ROW_F32 {
  public:
    __aicore__ inline GET_ROW_F32() {}
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                int64_t *output_ne_ub, size_t *output_nb_ub) {
        int64_t op_block_num = GetBlockNum();
        op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];

            indices_ne[i] = indices_ne_ub[i];
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];

            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }

        // Indices have two dims. n_elements = total number of rows to get.
        // dr = number of rows this block should get.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;

        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }

        input_gm.SetGlobalBuffer((__gm__ float *)input);
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);

        uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
        local_buffer_elems = local_buffer_size / sizeof(float);

        // TODO: handle rows too long to fit in UB.
        // Buffer sizes are rounded up to 32 bytes; that is fine because the data is 32-byte aligned.
        pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
        pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
    }

    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
        const size_t elem_per_block = 32 / sizeof(float);
        size_t tail = len % elem_per_block;
        len = len & ~(elem_per_block - 1);
        if (tail != 0) {
            len += elem_per_block;
        }
        DataCopy(input_local, input_gm[offset], len);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        const size_t elem_per_block = 32 / sizeof(float);
        size_t tail = len % elem_per_block;
        len = len & ~(elem_per_block - 1);
        if (len > 0) {
            DataCopy(output_gm[offset], output_local, len);
        }

        if (tail != 0) {
#ifdef ASCEND_310P
            for (size_t i = tail; i < elem_per_block; i++) {
                output_local[len + i].SetValue(0, 0);
            }
            SetAtomicAdd<float>();
            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
            SetAtomicNone();
#else
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
#endif
        }
        output_queue.FreeTensor(output_local);
    }

    __aicore__ inline void calculate_row(int64_t idx) {
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);

        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);

        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3];

        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3];

        copy_in(input_offset, input_ne[0]);
        LocalTensor<float> input_local = input_queue.DeQue<float>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();

        DataCopy(output_local, input_local, local_buffer_elems);
        output_queue.EnQue(output_local);
        copy_out(output_offset, input_ne[0]);

        input_queue.FreeTensor(input_local);
    }

    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            calculate_row(i);
        }
    }

  private:
    int64_t input_ne[4];
    size_t input_stride[4];

    int64_t indices_ne[4];
    size_t indices_stride[4];

    int64_t output_ne[4];
    size_t output_stride[4];

    size_t local_buffer_elems;

    int64_t ir;
    int64_t dr;

    TPipe pipe;
    GlobalTensor<float> input_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    int64_t op_block_idx;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_get_row_f32(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t indices_ne_ub[4];
    size_t indices_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    GET_ROW_F32 op;
    op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
            indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
    op.calculate();
}
@@ -1,204 +0,0 @@
#include "kernel_operator.h"

// optimize me. Use template to avoid copy code.
using namespace AscendC;
#ifdef ASCEND_310P // 310P does not support 4-bit get_row
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    // Just print an error so that the remaining test cases can continue to run;
    // of course the test case that calls this operator fails.
    printf("Ascend310P not support 4bit get row.\n");
}
#else

#define BUFFER_NUM 2

#define QK4_0 32

class GET_ROW_Q4_0 {
  public:
    __aicore__ inline GET_ROW_Q4_0() {}
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
                                size_t *indices_nb_ub, int64_t *output_ne_ub,
                                size_t *output_nb_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            indices_ne[i] = indices_ne_ub[i];
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
            scale_ne[i] = input_ne_ub[i];
            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }

        // one scale per group.
        scale_ne[0] /= QK4_0;

        input_stride[0] = 1;
        scale_stride[0] = 1;
        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }

        group_size_in_row = input_ne[0] / QK4_0;
        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
                               input_ne[3] / 2;

        // Indices have two dims. n_elements = total number of rows to get.
        // dr = number of rows this block should get.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;

        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }

        input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);

        pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
    }

    __aicore__ inline void copy_in(uint32_t offset) {
        LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
        // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
        DataCopy(input_local, input_gm[offset], QK4_0);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        DataCopy(output_gm[offset], output_local, QK4_0);
        output_queue.FreeTensor(output_local);
    }

    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);

        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);

        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3] +
                                     group * QK4_0;
        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
                                     indices_ne1_idx * scale_stride[2] +
                                     indices_ne2_idx * scale_stride[3] + group;
        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3] +
                                      group * QK4_0;

        copy_in(input_offset);
        LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();

        // TODO: cast more data to speed up.
        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);

        // Only the multiply is per-group (each group has its own scale).
        half scale = scale_gm.GetValue(scale_offset);

        Muls(output_local, output_local, (float)scale, QK4_0);

        input_queue.FreeTensor(input_local);
        cast_queue.FreeTensor(cast_local);
        output_queue.EnQue(output_local);

        copy_out(output_offset);
    }

    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                calculate_group(i, j);
            }
        }
    }

  private:
    int64_t input_ne[4];
    size_t input_stride[4];

    int64_t scale_ne[4];
    size_t scale_stride[4];

    int64_t indices_ne[4];
    size_t indices_stride[4];

    int64_t output_ne[4];
    size_t output_stride[4];

    int64_t ir;
    int64_t dr;

    int64_t group_size_in_row;

    TPipe pipe;
    GlobalTensor<int4b_t> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    int64_t input_ne_ub[4];
    int64_t indices_ne_ub[4];
    size_t indices_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    GET_ROW_Q4_0 op;
    op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
            indices_nb_ub, output_ne_ub, output_nb_ub);
    op.calculate();
}

#endif // #ifdef ASCEND_310P
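As a reference for the arithmetic performed per group above: each 32-element group is cast from 4-bit integers to float and multiplied by that group's fp16 scale, and the scales start at byte offset n_elements / 2 (two 4-bit values per byte). The sketch below mirrors that math in plain C++; treating int4b_t as signed values in the range -8..7 and the two-per-byte packing are assumptions read off the kernel, and the helper works on already-unpacked values for clarity.

#include <cstdint>
#include <cstdio>

#define QK4_0 32

// Reference dequantization matching the kernel above: out[i] = (float)q[i] * scale,
// one fp16 scale per 32-value group, scales stored after all quantized data.
static void dequant_q4_0_group(const int8_t q4[QK4_0], float scale, float out[QK4_0]) {
    for (int i = 0; i < QK4_0; i++) {
        out[i] = (float)q4[i] * scale;   // the Cast + Muls steps in the kernel
    }
}

int main() {
    int8_t q[QK4_0];
    float  out[QK4_0];
    for (int i = 0; i < QK4_0; i++) q[i] = (int8_t)((i % 16) - 8); // values in -8..7
    dequant_q4_0_group(q, 0.25f, out);
    std::printf("q[3]=%d -> %f\n", q[3], out[3]);
}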
@@ -1,191 +0,0 @@
#include "kernel_operator.h"

// optimize me. Use template to avoid copy code.
using namespace AscendC;

#define BUFFER_NUM 2

#define QK8_0 32

class GET_ROW_Q8_0 {
  public:
    __aicore__ inline GET_ROW_Q8_0() {}
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
                                size_t *indices_nb_ub, int64_t *output_ne_ub,
                                size_t *output_nb_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            indices_ne[i] = indices_ne_ub[i];
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
            scale_ne[i] = input_ne_ub[i];
            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }

        // one scale per group.
        scale_ne[0] /= QK8_0;

        input_stride[0] = 1;
        scale_stride[0] = 1;
        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }

        group_size_in_row = input_ne[0] / QK8_0;
        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
                               input_ne[3] * sizeof(int8_t);

        // Indices have two dims. n_elements = total number of rows to get.
        // dr = number of rows this block should get.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;

        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }

        input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);

        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
    }

    __aicore__ inline void copy_in(uint32_t offset) {
        LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
        DataCopy(input_local, input_gm[offset], QK8_0);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        DataCopy(output_gm[offset], output_local, QK8_0);
        output_queue.FreeTensor(output_local);
    }

    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);

        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);

        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3] +
                                     group * QK8_0;
        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
                                     indices_ne1_idx * scale_stride[2] +
                                     indices_ne2_idx * scale_stride[3] + group;
        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3] +
                                      group * QK8_0;

        copy_in(input_offset);
        LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();

        // TODO: cast more data to speed up.
        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);

        // Only the multiply is per-group (each group has its own scale).
        half scale = scale_gm.GetValue(scale_offset);
        Muls(output_local, output_local, (float)scale, QK8_0);

        input_queue.FreeTensor(input_local);
        cast_queue.FreeTensor(cast_local);
        output_queue.EnQue(output_local);

        copy_out(output_offset);
    }

    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                calculate_group(i, j);
            }
        }
    }

  private:
    int64_t input_ne[4];
    size_t input_stride[4];

    int64_t scale_ne[4];
    size_t scale_stride[4];

    int64_t indices_ne[4];
    size_t indices_stride[4];

    int64_t output_ne[4];
    size_t output_stride[4];

    int64_t ir;
    int64_t dr;

    int64_t group_size_in_row;

    TPipe pipe;
    GlobalTensor<int8_t> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    int64_t input_ne_ub[4];
    int64_t indices_ne_ub[4];
    size_t indices_nb_ub[4];
    int64_t output_ne_ub[4];
    size_t output_nb_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);
    copy_to_ub(output_nb_gm, output_nb_ub, 32);

    GET_ROW_Q8_0 op;
    op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
            indices_nb_ub, output_ne_ub, output_nb_ub);
    op.calculate();
}
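The only structural difference from the Q4_0 kernel is where the per-group fp16 scales begin: Q8_0 stores one byte per value, so the scales start right after n_elements bytes, while Q4_0 packs two values per byte and starts them after n_elements / 2 bytes. A tiny sketch of the two offset computations, mirroring the scale_offset expressions in the kernels above (the example shape is hypothetical):

#include <cstdint>
#include <cstdio>

// Byte offset of the first fp16 scale, mirroring the kernels above.
static int64_t scale_offset_q8_0(const int64_t ne[4]) {
    return ne[0] * ne[1] * ne[2] * ne[3] * (int64_t)sizeof(int8_t); // 1 byte per value
}
static int64_t scale_offset_q4_0(const int64_t ne[4]) {
    return ne[0] * ne[1] * ne[2] * ne[3] / 2;                       // 2 values per byte
}

int main() {
    const int64_t ne[4] = {64, 4, 1, 1};
    std::printf("q8_0 scales at byte %lld, q4_0 scales at byte %lld\n",
                (long long)scale_offset_q8_0(ne), (long long)scale_offset_q4_0(ne));
}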
@@ -1,218 +0,0 @@
#include "kernel_operator.h"

using namespace AscendC;
#ifdef ASCEND_310P
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    // Just print an error so that the remaining test cases can continue to run;
    // of course the test case that calls this operator fails.
    printf("Ascend310P not support f16->8bit quantization.\n");
}
#else

#define BUFFER_NUM 2
#define QK8_0 32

class QUANTIZE_F16_Q8_0 {
  public:
    __aicore__ inline QUANTIZE_F16_Q8_0() {}
    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *output_ne_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];

            output_ne[i] = output_ne_ub[i];
        }

        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
        }

        scale_ne = input_ne;
        scale_stride[0] = 1;
        scale_stride[1] = input_ne[0] / QK8_0;
        for (int i = 2; i < 4; i++) {
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }

        // split input tensor by rows.
        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
        dr = nr / op_block_num;

        uint64_t tails = nr % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }

        group_size_in_row = scale_stride[1];
        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
                              output_ne[3] * sizeof(uint8_t);

        input_gm.SetGlobalBuffer((__gm__ half *)input);
        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
                                                 group_size_in_row *
                                                 sizeof(half)));

        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
        pipe.InitBuffer(work_queue, 1, 32);
        pipe.InitBuffer(max_queue, 1, 32);
        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
        pipe.InitBuffer(scale_queue, 1, 32);
        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(float));
    }

    __aicore__ inline void copy_in(uint32_t offset) {
        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
        DataCopy(input_local, input_gm[offset], QK8_0);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset) {
        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
        DataCopy(output_gm[offset], output_local, QK8_0);
        output_queue.FreeTensor(output_local);
    }

    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
        const int64_t i1 =
            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];

        const int64_t input_offset = i1 * input_stride[1] +
                                     i2 * input_stride[2] +
                                     i3 * input_stride[3] + QK8_0 * group;

        const int64_t output_offset = i1 * output_stride[1] +
                                      i2 * output_stride[2] +
                                      i3 * output_stride[3] + QK8_0 * group;

        copy_in(input_offset);
        LocalTensor<half> input_local = input_queue.DeQue<half>();
        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();

        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
        Abs(abs_local, cast_local, QK8_0);
        ReduceMax(max_local, abs_local, work_local, QK8_0);

        pipe_barrier(PIPE_ALL);
        float d = max_local.GetValue(0);
        d = d / ((1 << 7) - 1);
        if (d != 0) {
            Muls(cast_local, cast_local, 1.0f / d, QK8_0);
        }

        Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
        output_queue.EnQue(output_local);
        copy_out(output_offset);

        input_queue.FreeTensor(input_local);
        work_queue.FreeTensor(work_local);
        abs_queue.FreeTensor(abs_local);
        max_queue.FreeTensor(max_local);
        cast_queue.FreeTensor(cast_local);
        return (half)d;
    }

    __aicore__ inline void calculate() {
        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
        uint32_t scale_local_offset = 0;
        uint32_t scale_global_offset = 0;
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                half scale = calculate_group(i, j);
                scale_local.SetValue(scale_local_offset++, scale);
                if (scale_local_offset == 16) {
                    scale_local_offset = 0;
                    // TODO: OPTIMIZE ME
                    pipe_barrier(PIPE_ALL);
                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
                    pipe_barrier(PIPE_ALL);
                    scale_global_offset += 16;
                }
            }
        }

        if (scale_local_offset != 0) {
            pipe_barrier(PIPE_ALL);
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
            DataCopyPad(scale_gm[scale_global_offset], scale_local,
                        dataCopyParams);
            pipe_barrier(PIPE_ALL);
        }
    }

  private:
    int64_t input_ne[4];
    size_t input_stride[4];

    int64_t *scale_ne;
    size_t scale_stride[4];

    int64_t output_ne[4];
    size_t output_stride[4];

    int64_t group_size_in_row;

    int64_t ir;
    int64_t dr;

    TPipe pipe;
    GlobalTensor<half> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int8_t> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, 1> work_queue;
    TQue<QuePosition::VECOUT, 1> max_queue;
    TQue<QuePosition::VECIN, 1> abs_queue;
    TQue<QuePosition::VECOUT, 1> scale_queue;
    TQue<QuePosition::VECOUT, 1> cast_queue;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);

    QUANTIZE_F16_Q8_0 op;
    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
    op.calculate();
}

#endif // #ifdef ASCEND_310P
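The per-group math in calculate_group() above (shared with the f32 variant that follows) is the standard Q8_0 scheme: take the group's maximum absolute value, derive the scale d = max|x| / 127, divide by d and round to int8, and store d as the group's fp16 scale. A plain C++ reference sketch of that arithmetic; it ignores the kernel's intermediate fp16 rounding steps and is an illustration, not the kernel itself.

#include <cmath>
#include <cstdint>
#include <cstdio>

#define QK8_0 32

// Reference Q8_0 quantization for one 32-element group, mirroring calculate_group():
// d = max(|x|) / 127, q[i] = round(x[i] / d); d is returned as the group's scale.
static float quantize_q8_0_group(const float x[QK8_0], int8_t q[QK8_0]) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; i++) {
        amax = std::fmax(amax, std::fabs(x[i]));     // Abs + ReduceMax in the kernel
    }
    const float d = amax / ((1 << 7) - 1);           // 127 = largest positive int8
    for (int i = 0; i < QK8_0; i++) {
        q[i] = (int8_t)(d != 0.0f ? std::round(x[i] / d) : 0.0f); // Muls + CAST_ROUND
    }
    return d;                                        // stored as (half)d by the kernel
}

int main() {
    float  x[QK8_0];
    int8_t q[QK8_0];
    for (int i = 0; i < QK8_0; i++) x[i] = 0.1f * (i - 16);
    const float d = quantize_q8_0_group(x, q);
    std::printf("scale d = %f, q[0] = %d, q[31] = %d\n", d, q[0], q[31]);
}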
@@ -1,216 +0,0 @@
#include "kernel_operator.h"

using namespace AscendC;
#ifdef ASCEND_310P // 310P does not support f32->8bit quantization
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    // Just print an error so that the remaining test cases can continue to run;
    // of course the test case that calls this operator fails.
    printf("Ascend310P not support f32->8bit quantization.\n");
}
#else

#define BUFFER_NUM 2
#define QK8_0 32

class QUANTIZE_F32_Q8_0 {
  public:
    __aicore__ inline QUANTIZE_F32_Q8_0() {}
    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *output_ne_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();

        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];

            output_ne[i] = output_ne_ub[i];
        }

        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
        }

        scale_ne = input_ne;
        scale_stride[0] = 1;
        scale_stride[1] = input_ne[0] / QK8_0;
        for (int i = 2; i < 4; i++) {
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }

        // split input tensor by rows.
        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
        dr = nr / op_block_num;

        uint64_t tails = nr % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }

        group_size_in_row = scale_stride[1];
        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
                              output_ne[3] * sizeof(uint8_t);

        input_gm.SetGlobalBuffer((__gm__ float *)input);
        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
                                                 ir * group_size_in_row *
                                                 sizeof(half)));

        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
        pipe.InitBuffer(work_queue, 1, 32);
        pipe.InitBuffer(max_queue, 1, 32);
        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
        pipe.InitBuffer(scale_queue, 1, 32);
    }

    __aicore__ inline void copy_in(uint32_t offset) {
        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
        DataCopy(input_local, input_gm[offset], QK8_0);
        input_queue.EnQue(input_local);
    }

    __aicore__ inline void copy_out(uint32_t offset) {
        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
        DataCopy(output_gm[offset], output_local, QK8_0);
        output_queue.FreeTensor(output_local);
    }

    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
        const int64_t i1 =
            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];

        const int64_t input_offset = i1 * input_stride[1] +
                                     i2 * input_stride[2] +
                                     i3 * input_stride[3] + QK8_0 * group;

        const int64_t output_offset = i1 * output_stride[1] +
                                      i2 * output_stride[2] +
                                      i3 * output_stride[3] + QK8_0 * group;

        copy_in(input_offset);
        LocalTensor<float> input_local = input_queue.DeQue<float>();
        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();

        Abs(abs_local, input_local, QK8_0);
        ReduceMax(max_local, abs_local, work_local, QK8_0);
        pipe_barrier(PIPE_ALL);
        float d = max_local.GetValue(0);
        d = d / ((1 << 7) - 1);
        if (d != 0) {
            Muls(input_local, input_local, 1.0f / d, QK8_0);
        }

        Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
        output_queue.EnQue(output_local);
        copy_out(output_offset);

        input_queue.FreeTensor(input_local);
        work_queue.FreeTensor(work_local);
        abs_queue.FreeTensor(abs_local);
        max_queue.FreeTensor(max_local);
        cast_queue.FreeTensor(cast_local);

        return (half)d;
    }

    __aicore__ inline void calculate() {
        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
        uint32_t scale_local_offset = 0;
        uint32_t scale_global_offset = 0;
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                half scale = calculate_group(i, j);
                scale_local.SetValue(scale_local_offset++, scale);
                if (scale_local_offset == 16) {
                    scale_local_offset = 0;
                    // TODO: OPTIMIZE ME
                    pipe_barrier(PIPE_ALL);
                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
                    pipe_barrier(PIPE_ALL);
                    scale_global_offset += 16;
                }
            }
        }

        if (scale_local_offset != 0) {
            pipe_barrier(PIPE_ALL);
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
            DataCopyPad(scale_gm[scale_global_offset], scale_local,
                        dataCopyParams);
            pipe_barrier(PIPE_ALL);
        }
    }

  private:
    int64_t input_ne[4];
    size_t input_stride[4];

    int64_t *scale_ne;
    size_t scale_stride[4];

    int64_t output_ne[4];
    size_t output_stride[4];

    int64_t group_size_in_row;

    int64_t ir;
    int64_t dr;

    TPipe pipe;
    GlobalTensor<float> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int8_t> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, 1> work_queue;
    TQue<QuePosition::VECOUT, 1> max_queue;
    TQue<QuePosition::VECIN, 1> abs_queue;
    TQue<QuePosition::VECIN, 1> cast_queue;
    TQue<QuePosition::VECOUT, 1> scale_queue;
};

template <typename T>
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
        *ub_ptr = *gm_ptr;
    }
}

extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    int64_t output_ne_ub[4];

    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    copy_to_ub(output_ne_gm, output_ne_ub, 32);

    QUANTIZE_F32_Q8_0 op;
    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
    op.calculate();
}

#endif // #ifdef ASCEND_310P
@ -1,295 +0,0 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P // 310P not support float->4bit quantization
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
|
||||
printf("Ascend310P not support f32->4bit quantization.\n");
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
    // Only print an error message here so that subsequent test cases can keep running.
    // Of course, the test case that calls this operator will fail.
    printf("Ascend310P does not support f16->4bit quantization.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define Group_Size 32
|
||||
|
||||
template <typename SRC_T>
|
||||
class QUANTIZE_FLOAT_TO_Q4_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
// TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
|
||||
// permute=[0,0,0,0]):
|
||||
// [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
// input stride of data elements
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
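        // Dividing each byte stride nb[i] by nb[0] (the element size) turns
        // the ggml byte strides into strides measured in elements of SRC_T.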
|
||||
|
||||
// output stride of data elements
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
        // scales are saved one by one after the data: [group1_scale, group2_scale, ...]
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / Group_Size;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t) / 2;
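        // Worked example (illustrative): for ne = [256, 4, 4, 4] with
        // Group_Size = 32, the packed 4-bit data occupies 256*4*4*4/2 = 8192
        // bytes, so the per-group half scales start at byte offset 8192 and
        // there are 256/32 = 8 scales per row.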
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
|
||||
group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM,
|
||||
Group_Size * sizeof(int8_t) / 2);
|
||||
        pipe.InitBuffer(cast_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
|
||||
pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
|
||||
pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
|
||||
DataCopy(input_local, input_gm[offset], Group_Size);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
        // Reinterpret Group_Size (32) int4b_t values as Group_Size / 2 int8_t values,
        // and use DataCopyPad to avoid the 32-byte alignment requirement of DataCopy.
|
||||
LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
|
||||
LocalTensor<int8_t> output_int8_local =
|
||||
output_local.ReinterpretCast<int8_t>();
|
||||
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
|
||||
DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
|
||||
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
|
||||
LocalTensor<float> input_local) {
|
||||
DataCopy(cast_local, input_local, Group_Size);
|
||||
}
|
||||
|
||||
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
|
||||
LocalTensor<half> input_local) {
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + Group_Size * group;
|
||||
|
||||
        // output_offset indexes output_gm, whose datatype is int8_t; dividing
        // by 2 is needed because int4b_t values are packed two per byte.
|
||||
const int64_t output_offset = (i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] +
|
||||
Group_Size * group) / 2;
|
||||
copy_in(input_offset);
|
||||
|
||||
LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
|
||||
LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
|
||||
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<float> min_local = min_queue.AllocTensor<float>();
|
||||
LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<half> half_local = half_queue.AllocTensor<half>();
|
||||
|
||||
input_to_cast(cast_local, input_local);
|
||||
|
||||
ReduceMax(max_local, cast_local, work_local, Group_Size);
|
||||
ReduceMin(min_local, cast_local, work_local, Group_Size);
|
||||
const float max_value = max_local.GetValue(0);
|
||||
const float min_value = min_local.GetValue(0);
|
||||
float d = max_value;
|
||||
if (min_value < 0 && (-1 * min_value) > max_value) {
|
||||
d = min_value;
|
||||
}
|
||||
|
||||
d = d / (-8);
|
||||
if (d != 0) {
|
||||
Muls(cast_local, cast_local, 1.0f / d, Group_Size);
|
||||
}
|
||||
|
||||
// range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
|
||||
float scalar = 8.5f;
|
||||
Adds(cast_local, cast_local, scalar, Group_Size);
|
||||
Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
|
||||
scalar = 15.0f;
|
||||
Mins(cast_local, cast_local, scalar, Group_Size);
|
||||
scalar = -8.0f;
|
||||
Adds(cast_local, cast_local, scalar, Group_Size);
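        // Worked example (illustrative): if the dominant value in the group is
        // max = 6.0, then d = -6.0/8 = -0.75 and x/d maps 6.0 -> -8; after the
        // +8.5 / floor / min(15) / -8 steps it remains -8, and dequantization
        // recovers q*d = (-8)*(-0.75) = 6.0.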
|
||||
|
||||
// float->half->int4b
|
||||
Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
|
||||
Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
|
||||
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
min_queue.FreeTensor(min_local);
|
||||
int8_queue.FreeTensor(int8_local);
|
||||
half_queue.FreeTensor(half_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
// Copy Group_Size/2 length data each time.
|
||||
if (scale_local_offset == Group_Size / 2) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local,
|
||||
Group_Size / 2);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += Group_Size / 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
scale_queue.FreeTensor(scale_local);
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<SRC_T> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_FLOAT_TO_Q4_0<half> op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_FLOAT_TO_Q4_0<float> op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
|
@ -1,166 +0,0 @@
|
||||
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
|
||||
|
||||
if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
ggml_add_backend_library(ggml-kompute
|
||||
ggml-kompute.cpp
|
||||
../../include/ggml-kompute.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
|
||||
target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
|
||||
function(compile_shader)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SOURCES)
|
||||
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
foreach(source ${compile_shader_SOURCES})
|
||||
get_filename_component(filename ${source} NAME)
|
||||
set(spv_file ${filename}.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${spv_file}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
|
||||
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
COMMENT "Compiling ${source} to ${spv_file}"
|
||||
)
|
||||
|
||||
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
|
||||
set(FILE_NAME "shader${RAW_FILE_NAME}")
|
||||
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
|
||||
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
|
||||
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
|
||||
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
|
||||
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
|
||||
if(CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
|
||||
)
|
||||
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
|
||||
message(STATUS "Kompute found")
|
||||
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
|
||||
add_subdirectory(kompute)
|
||||
|
||||
# Compile our shaders
|
||||
compile_shader(SOURCES
|
||||
kompute-shaders/op_scale.comp
|
||||
kompute-shaders/op_scale_8.comp
|
||||
kompute-shaders/op_add.comp
|
||||
kompute-shaders/op_addrow.comp
|
||||
kompute-shaders/op_mul.comp
|
||||
kompute-shaders/op_silu.comp
|
||||
kompute-shaders/op_relu.comp
|
||||
kompute-shaders/op_gelu.comp
|
||||
kompute-shaders/op_softmax.comp
|
||||
kompute-shaders/op_norm.comp
|
||||
kompute-shaders/op_rmsnorm.comp
|
||||
kompute-shaders/op_diagmask.comp
|
||||
kompute-shaders/op_mul_mat_mat_f32.comp
|
||||
kompute-shaders/op_mul_mat_f16.comp
|
||||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q4_k.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f32.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
kompute-shaders/op_getrows_q4_0.comp
|
||||
kompute-shaders/op_getrows_q4_1.comp
|
||||
kompute-shaders/op_getrows_q6_k.comp
|
||||
kompute-shaders/op_rope_norm_f16.comp
|
||||
kompute-shaders/op_rope_norm_f32.comp
|
||||
kompute-shaders/op_rope_neox_f16.comp
|
||||
kompute-shaders/op_rope_neox_f32.comp
|
||||
kompute-shaders/op_cpy_f16_f16.comp
|
||||
kompute-shaders/op_cpy_f16_f32.comp
|
||||
kompute-shaders/op_cpy_f32_f16.comp
|
||||
kompute-shaders/op_cpy_f32_f32.comp
|
||||
)
|
||||
|
||||
# Create a custom target for our generated shaders
|
||||
add_custom_target(generated_shaders DEPENDS
|
||||
shaderop_scale.h
|
||||
shaderop_scale_8.h
|
||||
shaderop_add.h
|
||||
shaderop_addrow.h
|
||||
shaderop_mul.h
|
||||
shaderop_silu.h
|
||||
shaderop_relu.h
|
||||
shaderop_gelu.h
|
||||
shaderop_softmax.h
|
||||
shaderop_norm.h
|
||||
shaderop_rmsnorm.h
|
||||
shaderop_diagmask.h
|
||||
shaderop_mul_mat_mat_f32.h
|
||||
shaderop_mul_mat_f16.h
|
||||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q4_k.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f32.h
|
||||
shaderop_getrows_f16.h
|
||||
shaderop_getrows_q4_0.h
|
||||
shaderop_getrows_q4_1.h
|
||||
shaderop_getrows_q6_k.h
|
||||
shaderop_rope_norm_f16.h
|
||||
shaderop_rope_norm_f32.h
|
||||
shaderop_rope_neox_f16.h
|
||||
shaderop_rope_neox_f32.h
|
||||
shaderop_cpy_f16_f16.h
|
||||
shaderop_cpy_f16_f32.h
|
||||
shaderop_cpy_f32_f16.h
|
||||
shaderop_cpy_f32_f32.h
|
||||
)
|
||||
|
||||
# Create a custom command that depends on the generated_shaders
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
DEPENDS generated_shaders
|
||||
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
|
||||
)
|
||||
|
||||
# Add the stamp to the main sources to ensure dependency tracking
|
||||
target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
File diff suppressed because it is too large
@ -1,112 +0,0 @@
|
||||
#extension GL_EXT_shader_16bit_storage: require
|
||||
#extension GL_EXT_shader_8bit_storage: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
|
||||
#extension GL_EXT_control_flow_attributes: enable
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
#define QK4_0 32
|
||||
#define QK4_1 32
|
||||
|
||||
#define GELU_COEF_A 0.044715
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
|
||||
#define TWOPI_F 6.283185307179586f
|
||||
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
|
||||
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
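// Example (illustrative): with buf = {0x34, 0x12, ...} and idx = 0,
// u8BufToU16(buf, 0) yields 0x1234, i.e. the bytes are reassembled
// little-endian; u8BufToFloat16 then reinterprets those 16 bits as a half.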
|
||||
|
||||
#define sizeof_block_q4_0 0x12
|
||||
struct block_q4_0 {
|
||||
float16_t d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.f * xb.d;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
|
||||
}
|
||||
return reg;
|
||||
}
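// Note on dequantize_q4_0: each 4-bit quant q is stored biased by +8, so the
// reference dequantization is x = d * (q - 8). Here md = -8*d supplies the
// -8*d term, while d1/d2 together with the nibble masks extract the low/high
// nibbles of each byte pair without explicit shifts.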
|
||||
|
||||
#define sizeof_block_q4_1 0x14
|
||||
struct block_q4_1 {
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb.m;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
|
||||
reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_k 144
|
||||
struct block_q4_k {
|
||||
float16_t d;
|
||||
float16_t dmin;
|
||||
uint8_t scales[K_SCALE_SIZE];
|
||||
uint8_t qs[QK_K/2];
|
||||
};
|
||||
|
||||
#define sizeof_block_q6_k 210
|
||||
struct block_q6_k {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
float16_t d; // super-block scale
|
||||
};
|
||||
mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
|
||||
const float16_t d_all = xb.d;
|
||||
|
||||
const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
const uint qhIndex = 32*(il/8) + 16*(il&1);
|
||||
float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
|
||||
const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
|
||||
const float16_t ml = float16_t(d_all * sc * 32.f);
|
||||
const float16_t dl = float16_t(d_all * sc * coef);
|
||||
mat4 reg;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
|
||||
: ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
|
||||
reg[i/4][i%4] = dl * q - ml;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
|
||||
#define QK8_0 32
|
||||
// struct block_q8_0 {
|
||||
// float16_t d; // delta
|
||||
// int8_t qs[QK8_0]; // quants
|
||||
// };
|
||||
#define sizeof_block_q8_0 34
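// 34 bytes = 2 bytes for the float16 delta d + 32 int8 quants (QK8_0).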
|
@ -1,58 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
//int offs; // TODO: needed for GGML_OP_ACC, see metal code
|
||||
} pcs;
|
||||
|
||||
// general-purpose kernel for addition of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
|
||||
// cons: not very efficient
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
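    // Reducing the src1 indices modulo ne1x implements broadcasting: when a
    // dimension of B has extent 1, the modulo pins that index to 0, so the
    // single element is reused across the whole corresponding dimension of A.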
|
||||
|
||||
int offs = 0; // TMP (see above)
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
@ -1,25 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
uint row;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
|
||||
}
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint n_past;
|
||||
int ne00;
|
||||
int ne01;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i02 = gl_WorkGroupID.z;
|
||||
const uint i01 = gl_WorkGroupID.y;
|
||||
const uint i00 = gl_WorkGroupID.x;
|
||||
|
||||
const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
|
||||
|
||||
if (i00 > pcs.n_past + i01) {
|
||||
out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
|
||||
} else {
|
||||
out_[index + pcs.outOff] = in_[index + pcs.inOff];
|
||||
}
|
||||
}
|
@ -1,22 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
const float y = in_[i + pcs.inOff];
|
||||
out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
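        // The expression above is the tanh approximation of GELU:
        // 0.5*y*(1 + tanh(sqrt(2/pi)*(y + 0.044715*y^3))), with the tanh
        // argument clamped to [-15, 15] for numerical safety.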
|
||||
}
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
int z = 0;
|
||||
for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
|
||||
const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
|
||||
const mat4 result = dequantize_block(inIndex, ind%NL);
|
||||
for (uint j = 0; j < 4; ++j) {
|
||||
for (uint k = 0; k < 4; ++k) {
|
||||
const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
|
||||
out_[outIndex] = result[j][k];
|
||||
++z;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
@ -1,38 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_0 get_unaligned_block_q4_0(uint index) {
|
||||
block_q4_0 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
|
||||
fres.qs[it] = inA[index+2+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_0 block = get_unaligned_block_q4_0(index);
|
||||
return dequantize_q4_0(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
@ -1,39 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
||||
block_q4_1 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
fres.m = u8BufToFloat16(inA, index+2);
|
||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
||||
fres.qs[it] = inA[index+4+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_1 block = get_unaligned_block_q4_1(index);
|
||||
return dequantize_q4_1(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
@ -1,44 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 16
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q6_k get_unaligned_block_q6_k(uint index) {
|
||||
block_q6_k fres;
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
|
||||
fres.ql[it] = inA[index + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
|
||||
fres.qh[it] = inA[index + QK_K/2 + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
|
||||
fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
|
||||
}
|
||||
fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q6_k block = get_unaligned_block_q6_k(index);
|
||||
return dequantize_q6_k(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
@ -1,52 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
@ -1,69 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb10;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
void main() {
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint rb = gl_WorkGroupID.y*N_F16_F32;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
|
||||
|
||||
const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
|
||||
|
||||
for (uint row = 0; row < N_F16_F32; ++row) {
|
||||
uint r1 = rb + row;
|
||||
if (r1 >= pcs.ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf = 0;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sumf += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,51 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
// device subgroup size
|
||||
layout (local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout(binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
}
|
||||
pcs;
|
||||
|
||||
|
||||
void main() {
|
||||
uvec3 gid = gl_WorkGroupID;
|
||||
|
||||
uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
|
||||
uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
|
||||
|
||||
const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
|
||||
const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
||||
float sum = 0.0f;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sum += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sum);
|
||||
if (subgroupElect()) {
|
||||
out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
@ -1,33 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_0
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_0 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||
}
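// Why "sumy * -8.f": each q4_0 weight dequantizes to d*(q - 8), so the dot
// product with y expands to d*(sum(q*y) - 8*sum(y)); acc[0]+acc[1] accumulate
// sum(q*y) (the /256, /16, /4096 factors undo the nibble bit positions) and
// sumy collects sum(y).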
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
@ -1,35 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_1
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_1 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float m = float(u8BufToFloat16(inA, index+2));
|
||||
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (acc[0] + acc[1]) + sumy * m;
|
||||
}
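// q4_1 stores x = d*q + m, so the dot product with y splits into
// d*sum(q*y) + m*sum(y); acc[] accumulates sum(q*y) per nibble position and
// sumy*m adds the offset term.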
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
@ -1,140 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define N_DST 4
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_k
|
||||
|
||||
layout(local_size_x = 4) in;
|
||||
layout(local_size_y = 8) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint16_t kmask1 = uint16_t(0x3f3f);
|
||||
const uint16_t kmask2 = uint16_t(0x0f0f);
|
||||
const uint16_t kmask3 = uint16_t(0xc0c0);
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/8; // 0...3
|
||||
const uint it = gl_SubgroupInvocationID%8; // 0...7
|
||||
const uint iq = it/4; // 0 or 1
|
||||
const uint ir = it%4; // 0...3
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = r0 * N_DST;
|
||||
const uint ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
|
||||
|
||||
const uint xblk = offset0 + pcs.inAOff;
|
||||
const uint y = (offset1 / 4) + pcs.inBOff;
|
||||
|
||||
float yl[16];
|
||||
float yh[16];
|
||||
float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
|
||||
float all_sum = 0.f;
|
||||
|
||||
uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 4) {
|
||||
const uint blk_idx = ib + xblk;
|
||||
|
||||
float sumy[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
|
||||
yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
|
||||
yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
|
||||
yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
|
||||
|
||||
uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
|
||||
uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
|
||||
uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
|
||||
uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
|
||||
uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
|
||||
|
||||
uint16_t sc16[4];
|
||||
sc16[0] = sc_0 & kmask1;
|
||||
sc16[1] = sc_2 & kmask1;
|
||||
sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
|
||||
sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
|
||||
|
||||
float acc1[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
float acc2[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
|
||||
uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
|
||||
acc1[0] += yl[i+0] * (q1 & 0x000F);
|
||||
acc1[1] += yl[i+1] * (q1 & 0x0F00);
|
||||
acc1[2] += yl[i+8] * (q1 & 0x00F0);
|
||||
acc1[3] += yl[i+9] * (q1 & 0xF000);
|
||||
acc2[0] += yh[i+0] * (q2 & 0x000F);
|
||||
acc2[1] += yh[i+1] * (q2 & 0x0F00);
|
||||
acc2[2] += yh[i+8] * (q2 & 0x00F0);
|
||||
acc2[3] += yh[i+9] * (q2 & 0xF000);
|
||||
}
|
||||
|
||||
uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
|
||||
uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
|
||||
uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
|
||||
uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
|
||||
uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
|
||||
uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
|
||||
uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
|
||||
uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
|
||||
|
||||
float dall = float(inA[blk_idx + row_idx].d);
|
||||
float dmin = float(inA[blk_idx + row_idx].dmin);
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
|
||||
(acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
|
||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
|
||||
(acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
|
||||
dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
|
||||
}
|
||||
|
||||
y4 += 4 * QK_K;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,106 +0,0 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y_id = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb13;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint8_t kmask1 = uint8_t(0x03);
|
||||
const uint8_t kmask2 = uint8_t(0x0C);
|
||||
const uint8_t kmask3 = uint8_t(0x30);
|
||||
const uint8_t kmask4 = uint8_t(0xC0);
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
|
||||
const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
// bits of invocation ID for gl_SubgroupSize=32:
|
||||
// x x x x x
|
||||
// 4 3 2 1 0
|
||||
// ( tid ) ix
|
||||
// ip ( il )
|
||||
|
||||
const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
|
||||
const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
|
||||
const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
|
||||
const uint ip = tid/8; // first or second half of block (0 or 1)
|
||||
const uint il = tid%8; // each half has 8 parts, one per scale
|
||||
const uint n = 4; // 4 scales at a time (and 4 sums)
|
||||
const uint l0 = n*il; // offset into half-block, 0..28
|
||||
const uint is = 8*ip + l0/16; // 0, 1, 8, 9
|
||||
|
||||
const uint y_offset = 128*ip + l0;
|
||||
const uint q_offset_l = 64*ip + l0;
|
||||
const uint q_offset_h = 32*ip + l0;
|
||||
|
||||
for (uint i = ix; i < nb; i += block_stride) {
|
||||
|
||||
const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
|
||||
const uint qlIndex = q_offset_l;
|
||||
const uint q2Index = qlIndex + QK_K/8;
|
||||
const uint qhIndex = q_offset_h;
|
||||
const uint y = yy + i * QK_K + y_offset;
|
||||
|
||||
float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
for (uint l = 0; l < n; ++l) {
|
||||
const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
|
||||
const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
|
||||
const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
|
||||
|
||||
sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
|
||||
sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
|
||||
sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
|
||||
sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
|
||||
}
|
||||
|
||||
float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
|
||||
}
|
||||
|
||||
const float tot = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
@ -1,73 +0,0 @@
#version 450

#include "common.comp"

#include "op_mul_mv_q_n_pre.comp"

#define SIZE_OF_D 2

#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32

#define NB_Q8_0 8
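// q8_0 block layout: an fp16 scale d (SIZE_OF_D = 2 bytes) followed by QK8_0 = 32 int8 quants;
// each subgroup invocation accumulates NB_Q8_0 quants of a block per step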

void main() {
    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
    if (gl_SubgroupInvocationID > 31)
        return;

    const int nr = N_DST;
    const int nsg = N_SIMDGROUP;
    const int nw = N_SIMDWIDTH;

    const int nb = pcs.ne00/QK8_0;
    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint im = gl_WorkGroupID.z;

    const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;

    const uint i12 = im%pcs.ne12;
    const uint i13 = im/pcs.ne12;

    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);

    const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB

    float yl[NB_Q8_0];
    float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};

    const uint ix = gl_SubgroupInvocationID.x/4;
    const uint il = gl_SubgroupInvocationID.x%4;

    uint yb = y + ix * QK8_0 + NB_Q8_0*il;

    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
    for (uint ib = ix; ib < nb; ib += nw/4) {
        for (int i = 0; i < NB_Q8_0; ++i) {
            yl[i] = inB[yb + i];
        }

        for (int row = 0; row < nr; row++) {
            const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
            float sumq = 0.f;
            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
                sumq += qs_iq * yl[iq];
            }
            const float16_t d = u8BufToFloat16(inA, x + block_offset);
            sumf[row] += sumq*d;
        }

        yb += NB_Q8_0 * nw;
    }

    for (int row = 0; row < nr; ++row) {
        const float tot = subgroupAdd(sumf[row]);
        if (subgroupElect() && first_row + row < pcs.ne01) {
            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
        }
    }
}
@ -1,52 +0,0 @@
void main() {
    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
    if (gl_SubgroupInvocationID > 31)
        return;

    const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);

    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint im = gl_WorkGroupID.z;

    const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;

    const uint i12 = im%pcs.ne12;
    const uint i13 = im/pcs.ne12;

    // pointers to src0 rows
    uint ax[N_ROWS];
    for (int row = 0; row < N_ROWS; ++row) {
        const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);

        ax[row] = offset0 + pcs.inAOff;
    }

    const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;

    float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};

    const uint ix = gl_SubgroupInvocationID/2;
    const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
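    // pairs of invocations share one quant block: ix selects the block within each stride of 16,
    // il offsets into the block so the two invocations of a pair cover different quants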

    uint yb = y + ix * BLOCKS_IN_QUANT + il;

    //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
    //               gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
    //               gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);

    for (uint ib = ix; ib < nb; ib += 16) {
        for (int row = 0; row < N_ROWS; row++) {
            sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
        }

        yb += BLOCKS_IN_QUANT * 16;
    }

    for (int row = 0; row < N_ROWS; ++row) {
        const float tot = subgroupAdd(sumf[row]);
        if (first_row + row < pcs.ne01 && subgroupElect()) {
            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
        }
    }
}
@ -1,28 +0,0 @@
layout(local_size_x_id = 0) in;
layout(local_size_y = 8) in;
layout(local_size_z = 1) in;

layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };

layout (push_constant) uniform parameter {
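    // ggml naming: neXX = number of elements per dimension, nbXX = stride in bytes;
    // r2/r3 are the broadcast ratios between src0 and src1 on dims 2 and 3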
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    int ne10;
    int ne12;
    int ne0;
    int ne1;
    uint nb01;
    uint nb02;
    uint nb03;
    uint nb11;
    uint nb12;
    uint nb13;
    uint r2;
    uint r3;
} pcs;
@ -1,84 +0,0 @@
#version 450

#include "common.comp"

layout(local_size_x = 256) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    uint ne00;
    uint nb01;
    float eps;
} pcs;

shared float sum[gl_WorkGroupSize.x];
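// one workgroup normalizes one row (nb01 = row stride in bytes, ne00 = row length);
// sum[] backs a shared-memory tree reduction across the 256 invocations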

void main() {
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
    // MEAN
    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float mean = sum[0];

    // recenter
    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] - mean;
    }

    // VARIANCE
    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float variance = sum[0];

    const float scale = 1.0f/sqrt(variance + pcs.eps);
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] *= scale;
    }
}
@ -1,21 +0,0 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
} pcs;

void main() {
    const uint baseIndex = gl_WorkGroupID.x * 4;

    for (uint x = 0; x < 4; x++) {
        const uint i = baseIndex + x;
        out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
    }
}
@ -1,53 +0,0 @@
#version 450

#include "common.comp"

layout(local_size_x = 512) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    uint ne00;
    uint nb01;
    float eps;
} pcs;

shared float sum[gl_WorkGroupSize.x];
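// RMS norm: one workgroup per row computes mean(x^2) via a shared-memory reduction,
// then writes x * 1/sqrt(mean(x^2) + eps)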

void main() {
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_

    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();

    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);

    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] * scale;
    }
}
@ -1,52 +0,0 @@
#version 450

#include "rope_common.comp"

layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };

void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    float theta_base = float(inB[pcs.inBOff + i2]);
    float inv_ndims = -1.f/pcs.n_dims;

    float cos_theta;
    float sin_theta;

    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
        if (i0 < pcs.n_dims) {
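            // NEOX-style RoPE: element ic is rotated together with element ic + n_dims/2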
            uint ic = i0/2;

            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);

            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;

            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);

            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_

            const float x0 = float(inA[src]);
            const float x1 = float(inA[src+pcs.n_dims/2]);

            out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
        } else {
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_

            out_[dst_data] = inA[src];
            out_[dst_data+1] = inA[src+1];
        }
    }
}
@ -1,52 +0,0 @@
#version 450

#include "rope_common.comp"

layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };

void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    float theta_base = float(inB[pcs.inBOff + i2]);
    float inv_ndims = -1.f/pcs.n_dims;

    float cos_theta;
    float sin_theta;

    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
        if (i0 < pcs.n_dims) {
            uint ic = i0/2;

            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);

            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;

            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);

            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            const float x0 = inA[src];
            const float x1 = inA[src+pcs.n_dims/2];

            out_[dst_data] = x0*cos_theta - x1*sin_theta;
            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
        } else {
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            out_[dst_data] = inA[src];
            out_[dst_data+1] = inA[src+1];
        }
    }
}
@ -1,52 +0,0 @@
#version 450

#include "rope_common.comp"

layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };

void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    float theta_base = float(inB[pcs.inBOff + i2]);
    float inv_ndims = -1.f/pcs.n_dims;

    float cos_theta;
    float sin_theta;

    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
        if (i0 < pcs.n_dims) {
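            // "normal" RoPE: adjacent elements (i0, i0+1) are rotated as a pair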
            uint ic = i0/2;

            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);

            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;

            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);

            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_

            const float x0 = float(inA[src]);
            const float x1 = float(inA[src+1]);

            out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
        } else {
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_

            out_[dst_data] = inA[src];
            out_[dst_data+1] = inA[src+1];
        }
    }
}
@ -1,52 +0,0 @@
#version 450

#include "rope_common.comp"

layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };

void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    float theta_base = float(inB[pcs.inBOff + i2]);
    float inv_ndims = -1.f/pcs.n_dims;

    float cos_theta;
    float sin_theta;

    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
        if (i0 < pcs.n_dims) {
            uint ic = i0/2;

            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);

            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;

            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);

            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            const float x0 = inA[src];
            const float x1 = inA[src+1];

            out_[dst_data] = x0*cos_theta - x1*sin_theta;
            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
        } else {
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            out_[dst_data] = inA[src];
            out_[dst_data+1] = inA[src+1];
        }
    }
}
@ -1,19 +0,0 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    float scale;
} pcs;

void main() {
    const uint i = gl_WorkGroupID.x;
    out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
}
@ -1,23 +0,0 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    float scale;
} pcs;

void main() {
    const uint baseIndex = gl_WorkGroupID.x * 8;

    for (uint x = 0; x < 8; x++) {
        const uint i = baseIndex + x;
        out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
    }
}
@ -1,22 +0,0 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
} pcs;

void main() {
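    // SiLU: out = x * sigmoid(x) = x / (1 + exp(-x)), applied to 4 elements per workgroup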
    const uint baseIndex = gl_WorkGroupID.x * 4;

    for (uint x = 0; x < 4; x++) {
        const uint i = baseIndex + x;
        const float y = in_[i + pcs.inOff];
        out_[i + pcs.outOff] = y / (1.0 + exp(-y));
    }
}
@ -1,72 +0,0 @@
// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)

#version 450

#include "common.comp"

layout(local_size_x_id = 0) in;

layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    float scale;
    float max_bias;
    float m0;
    float m1;
    uint n_head_log2;
    int mask;
} pcs;

void main() {
    if (gl_SubgroupInvocationID > 31)
        return;

    const uint i03 = gl_WorkGroupID.z;
    const uint i02 = gl_WorkGroupID.y;
    const uint i01 = gl_WorkGroupID.x;

    const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
    const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
    const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
    const uint pdst = extra_off + pcs.outOff; // Based from out_

    float slope = 1.0f;

    // ALiBi
    if (pcs.max_bias > 0.0f) {
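        // ALiBi slopes: the first n_head_log2 heads use powers of m0 (m0^1, m0^2, ...),
        // the remaining heads use odd powers of m1 (m1^1, m1^3, ...)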
        int64_t h = i02;

        float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
        int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;

        slope = pow(base, float(exp));
    }

    // parallel max
    float localMax = uintBitsToFloat(0xFF800000);
    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
    }
    float max_ = subgroupMax(localMax);

    // parallel sum
    float localSum = 0.0f;
    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
        localSum += exp_psrc0;
        out_[pdst + i00] = exp_psrc0;
    }

    const float sum = subgroupAdd(localSum);
    for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
        out_[pdst + i00] /= sum;
    }
}
@ -1,71 +0,0 @@
#include "common.comp"

#define GGML_ROPE_TYPE_NEOX 2

// TODO: use a local size of 32 or more (Metal uses 1024)
layout(local_size_x = 1) in;

layout (push_constant) uniform parameter {
    uint inAOff;
    uint inBOff;
    uint inCOff;
    uint outOff;
    int n_dims;
    int mode;
    int n_ctx_orig;
    float freq_base;
    float freq_scale;
    bool has_freq_factors;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;
    int ne0;
    uint nb0;
    uint nb1;
    uint nb2;
    uint nb3;
} pcs;

float rope_yarn_ramp(const float low, const float high, const float i0) {
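    // linear ramp over the pair index i0/2: 1 at or below `low`, 0 at or above `high`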
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - min(1.0f, max(0.0f, y));
}

// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
void rope_yarn(
    float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
    out float cos_theta, out float sin_theta
) {
    // Get n-d rotational scaling corrected for extrapolation
    float theta_interp = freq_scale * theta_extrap;
    float theta = theta_interp;
    if (ext_factor != 0.0f) {
        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

        // Get n-d magnitude scaling corrected for interpolation
        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
    }
    cos_theta = cos(theta) * mscale;
    sin_theta = sin(theta) * mscale;
}

// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
    return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base));
}

void rope_yarn_corr_dims(
    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2]
) {
    // start and end correction dims
    dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
}