mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-04 16:30:58 +02:00
Compare commits
19 Commits
grammar-de
...
fix-bench
Author | SHA1 | Date | |
---|---|---|---|
09a6325de5 | |||
39c4fc59dd | |||
9b14418863 | |||
6ddc727fac | |||
acb5278cc8 | |||
0839209cab | |||
b39809668a | |||
3e9edc6845 | |||
bfc73f1fa2 | |||
f00c9bba33 | |||
b55b505690 | |||
2818de21ff | |||
aed5d40607 | |||
afa5477d1c | |||
01fcd42431 | |||
f990610776 | |||
64cb45fd79 | |||
ace6c12ec6 | |||
cac75be05b |
6
.github/workflows/build.yml
vendored
6
.github/workflows/build.yml
vendored
@ -428,15 +428,15 @@ jobs:
|
|||||||
|
|
||||||
- name: Publish package
|
- name: Publish package
|
||||||
if: ${{ github.ref == 'refs/heads/master' }}
|
if: ${{ github.ref == 'refs/heads/master' }}
|
||||||
uses: gradle/gradle-build-action@v2
|
uses: gradle/gradle-build-action@v2.4.2
|
||||||
with:
|
with:
|
||||||
arguments: publish
|
arguments: publish
|
||||||
build-root-directory: bindings/java
|
build-root-directory: bindings/java
|
||||||
env:
|
env:
|
||||||
MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
|
MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
|
||||||
MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
|
MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
|
||||||
# MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
|
PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
|
||||||
# MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
|
PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
|
||||||
|
|
||||||
quantize:
|
quantize:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -11,6 +11,7 @@ build/
|
|||||||
build-em/
|
build-em/
|
||||||
build-debug/
|
build-debug/
|
||||||
build-release/
|
build-release/
|
||||||
|
build-rwdi/
|
||||||
build-static/
|
build-static/
|
||||||
build-cublas/
|
build-cublas/
|
||||||
build-no-accel/
|
build-no-accel/
|
||||||
|
@ -321,6 +321,53 @@ else()
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# POSIX conformance
|
||||||
|
#
|
||||||
|
|
||||||
|
# clock_gettime came in POSIX.1b (1993)
|
||||||
|
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||||
|
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||||
|
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||||
|
add_compile_definitions(_XOPEN_SOURCE=600)
|
||||||
|
|
||||||
|
# Somehow in OpenBSD whenever POSIX conformance is specified
|
||||||
|
# some string functions rely on locale_t availability,
|
||||||
|
# which was introduced in POSIX.1-2008, forcing us to go higher
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
||||||
|
remove_definitions(-D_XOPEN_SOURCE=600)
|
||||||
|
add_compile_definitions(_XOPEN_SOURCE=700)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Data types, macros and functions related to controlling CPU affinity
|
||||||
|
# are available on Linux through GNU extensions in libc
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||||
|
add_compile_definitions(_GNU_SOURCE)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||||
|
# and on macOS its availability depends on enabling Darwin extensions
|
||||||
|
# similarly on DragonFly, enabling BSD extensions is necessary
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
|
||||||
|
add_compile_definitions(_DARWIN_C_SOURCE)
|
||||||
|
endif()
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
|
||||||
|
add_compile_definitions(_DARWIN_C_SOURCE)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# alloca is a non-standard interface that is not visible on BSDs when
|
||||||
|
# POSIX conformance is specified, but not all of them provide a clean way
|
||||||
|
# to enable it in such cases
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||||
|
add_compile_definitions(__BSD_VISIBLE)
|
||||||
|
endif()
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
||||||
|
add_compile_definitions(_NETBSD_SOURCE)
|
||||||
|
endif()
|
||||||
|
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
||||||
|
add_compile_definitions(_BSD_SOURCE)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (WHISPER_PERF)
|
if (WHISPER_PERF)
|
||||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
|
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
|
||||||
endif()
|
endif()
|
||||||
|
47
Makefile
47
Makefile
@ -42,18 +42,55 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
|||||||
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
|
|
||||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/37
|
# clock_gettime came in POSIX.1b (1993)
|
||||||
ifneq ($(wildcard /usr/include/musl/*),)
|
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||||
CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
|
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||||
CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
|
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||||
|
CFLAGS += -D_XOPEN_SOURCE=600
|
||||||
|
CXXFLAGS += -D_XOPEN_SOURCE=600
|
||||||
|
|
||||||
|
# Somehow in OpenBSD whenever POSIX conformance is specified
|
||||||
|
# some string functions rely on locale_t availability,
|
||||||
|
# which was introduced in POSIX.1-2008, forcing us to go higher
|
||||||
|
ifeq ($(UNAME_S),OpenBSD)
|
||||||
|
CFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||||
|
CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Data types, macros and functions related to controlling CPU affinity
|
||||||
|
# are available on Linux through GNU extensions in libc
|
||||||
|
ifeq ($(UNAME_S),Linux)
|
||||||
|
CFLAGS += -D_GNU_SOURCE
|
||||||
|
CXXFLAGS += -D_GNU_SOURCE
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||||
# and on macOS its availability depends on enabling Darwin extensions
|
# and on macOS its availability depends on enabling Darwin extensions
|
||||||
|
# similarly on DragonFly, enabling BSD extensions is necessary
|
||||||
ifeq ($(UNAME_S),Darwin)
|
ifeq ($(UNAME_S),Darwin)
|
||||||
CFLAGS += -D_DARWIN_C_SOURCE
|
CFLAGS += -D_DARWIN_C_SOURCE
|
||||||
CXXFLAGS += -D_DARWIN_C_SOURCE
|
CXXFLAGS += -D_DARWIN_C_SOURCE
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(UNAME_S),DragonFly)
|
||||||
|
CFLAGS += -D__BSD_VISIBLE
|
||||||
|
CXXFLAGS += -D__BSD_VISIBLE
|
||||||
|
endif
|
||||||
|
|
||||||
|
# alloca is a non-standard interface that is not visible on BSDs when
|
||||||
|
# POSIX conformance is specified, but not all of them provide a clean way
|
||||||
|
# to enable it in such cases
|
||||||
|
ifeq ($(UNAME_S),FreeBSD)
|
||||||
|
CFLAGS += -D__BSD_VISIBLE
|
||||||
|
CXXFLAGS += -D__BSD_VISIBLE
|
||||||
|
endif
|
||||||
|
ifeq ($(UNAME_S),NetBSD)
|
||||||
|
CFLAGS += -D_NETBSD_SOURCE
|
||||||
|
CXXFLAGS += -D_NETBSD_SOURCE
|
||||||
|
endif
|
||||||
|
ifeq ($(UNAME_S),OpenBSD)
|
||||||
|
CFLAGS += -D_BSD_SOURCE
|
||||||
|
CXXFLAGS += -D_BSD_SOURCE
|
||||||
|
endif
|
||||||
|
|
||||||
# OS specific
|
# OS specific
|
||||||
# TODO: support Windows
|
# TODO: support Windows
|
||||||
@ -67,7 +104,7 @@ endif
|
|||||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||||
ifeq ($(UNAME_S),Darwin)
|
ifeq ($(UNAME_S),Darwin)
|
||||||
CPUINFO_CMD := sysctl machdep.cpu.features
|
CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
|
||||||
else ifeq ($(UNAME_S),Linux)
|
else ifeq ($(UNAME_S),Linux)
|
||||||
CPUINFO_CMD := cat /proc/cpuinfo
|
CPUINFO_CMD := cat /proc/cpuinfo
|
||||||
else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
|
else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
|
||||||
|
@ -2,6 +2,7 @@ plugins {
|
|||||||
id 'java'
|
id 'java'
|
||||||
id 'java-library'
|
id 'java-library'
|
||||||
id 'maven-publish'
|
id 'maven-publish'
|
||||||
|
id 'signing'
|
||||||
}
|
}
|
||||||
|
|
||||||
archivesBaseName = 'whispercpp'
|
archivesBaseName = 'whispercpp'
|
||||||
@ -109,4 +110,23 @@ publishing {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
repositories {
|
||||||
|
maven {
|
||||||
|
def releasesRepoUrl = 'https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/'
|
||||||
|
def snapshotsRepoUrl = 'https://s01.oss.sonatype.org/content/repositories/snapshots/'
|
||||||
|
url = version.endsWith('-SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
|
||||||
|
credentials {
|
||||||
|
username = System.getenv("MAVEN_USERNAME")
|
||||||
|
password = System.getenv("MAVEN_PASSWORD")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
signing {
|
||||||
|
def signingKey = System.getenv("PGP_SECRET")
|
||||||
|
def signingPassword = System.getenv("PGP_PASSPHRASE")
|
||||||
|
useInMemoryPgpKeys(signingKey, signingPassword)
|
||||||
|
sign publishing.publications.mavenJava
|
||||||
}
|
}
|
||||||
|
@ -6,8 +6,8 @@
|
|||||||
// ref: https://github.com/ggerganov/whisper.cpp/issues/171
|
// ref: https://github.com/ggerganov/whisper.cpp/issues/171
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "common.h"
|
|
||||||
#include "common-sdl.h"
|
#include "common-sdl.h"
|
||||||
|
#include "common.h"
|
||||||
#include "whisper.h"
|
#include "whisper.h"
|
||||||
|
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sam_print_usage(int argc, char ** argv, const sam_params & params) {
|
void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
|
||||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "options:\n");
|
fprintf(stderr, "options:\n");
|
||||||
|
@ -324,7 +324,7 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
|
|||||||
commandset_list.push_back(cs);
|
commandset_list.push_back(cs);
|
||||||
return json{{"index",index}};
|
return json{{"index",index}};
|
||||||
}
|
}
|
||||||
json seek(struct whisper_context * ctx, audio_async &audio, json params) {
|
json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
|
||||||
// whisper_state has the pertinent offsets, but there also seem to be a large
|
// whisper_state has the pertinent offsets, but there also seem to be a large
|
||||||
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
|
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
|
||||||
// I'll give this another pass once everything else is implemented,
|
// I'll give this another pass once everything else is implemented,
|
||||||
@ -412,7 +412,7 @@ void process_loop(struct whisper_context * ctx, audio_async &audio, const whispe
|
|||||||
jobqueue.pop_front();
|
jobqueue.pop_front();
|
||||||
// send response
|
// send response
|
||||||
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
|
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||||
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
|
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
|
||||||
std::cout.flush();
|
std::cout.flush();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
|
|||||||
|
|
||||||
return speaker;
|
return speaker;
|
||||||
}
|
}
|
||||||
void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
|
void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
|
||||||
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
|
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
|
||||||
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
|
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
|
||||||
if (progress >= *progress_prev + progress_step) {
|
if (progress >= *progress_prev + progress_step) {
|
||||||
@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
|
||||||
std::ofstream fout(fname);
|
std::ofstream fout(fname);
|
||||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||||
|
|
||||||
|
@ -3,8 +3,8 @@
|
|||||||
// A very quick-n-dirty implementation serving mainly as a proof of concept.
|
// A very quick-n-dirty implementation serving mainly as a proof of concept.
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "common.h"
|
|
||||||
#include "common-sdl.h"
|
#include "common-sdl.h"
|
||||||
|
#include "common.h"
|
||||||
#include "whisper.h"
|
#include "whisper.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
@ -1,11 +1,3 @@
|
|||||||
// Defines fileno on msys:
|
|
||||||
#ifndef _GNU_SOURCE
|
|
||||||
#define _GNU_SOURCE
|
|
||||||
#include <cstddef>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cstdio>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "llama-util.h"
|
#include "llama-util.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -1164,7 +1156,7 @@ static bool llama_eval_internal(
|
|||||||
const llama_token * tokens,
|
const llama_token * tokens,
|
||||||
const int n_tokens,
|
const int n_tokens,
|
||||||
const int n_past,
|
const int n_past,
|
||||||
const int n_threads) {
|
int n_threads) {
|
||||||
|
|
||||||
// enforce that the first token is BOS
|
// enforce that the first token is BOS
|
||||||
if (n_past == 0 && tokens[0] != llama_token_bos()) {
|
if (n_past == 0 && tokens[0] != llama_token_bos()) {
|
||||||
@ -1190,6 +1182,8 @@ static bool llama_eval_internal(
|
|||||||
const int n_vocab = hparams.n_vocab;
|
const int n_vocab = hparams.n_vocab;
|
||||||
const int n_rot = hparams.n_embd/hparams.n_head;
|
const int n_rot = hparams.n_embd/hparams.n_head;
|
||||||
|
|
||||||
|
const float eps = 5e-6f; // TODO: take from hparams
|
||||||
|
|
||||||
auto & mem_per_token = lctx.mem_per_token;
|
auto & mem_per_token = lctx.mem_per_token;
|
||||||
auto & buf_compute = lctx.buf_compute;
|
auto & buf_compute = lctx.buf_compute;
|
||||||
|
|
||||||
@ -1204,7 +1198,7 @@ static bool llama_eval_internal(
|
|||||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||||
ggml_cgraph gf = {};
|
ggml_cgraph gf = {};
|
||||||
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||||
|
|
||||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
ggml_set_name(embd, "embd");
|
ggml_set_name(embd, "embd");
|
||||||
@ -1221,7 +1215,7 @@ static bool llama_eval_internal(
|
|||||||
|
|
||||||
// norm
|
// norm
|
||||||
{
|
{
|
||||||
cur = ggml_rms_norm(ctx0, inpL);
|
cur = ggml_rms_norm(ctx0, inpL, eps);
|
||||||
|
|
||||||
// cur = cur*attention_norm(broadcasted)
|
// cur = cur*attention_norm(broadcasted)
|
||||||
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
|
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
|
||||||
@ -1329,7 +1323,7 @@ static bool llama_eval_internal(
|
|||||||
{
|
{
|
||||||
// norm
|
// norm
|
||||||
{
|
{
|
||||||
cur = ggml_rms_norm(ctx0, inpFF);
|
cur = ggml_rms_norm(ctx0, inpFF, eps);
|
||||||
|
|
||||||
// cur = cur*ffn_norm(broadcasted)
|
// cur = cur*ffn_norm(broadcasted)
|
||||||
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
||||||
@ -1367,7 +1361,7 @@ static bool llama_eval_internal(
|
|||||||
// norm
|
// norm
|
||||||
{
|
{
|
||||||
|
|
||||||
inpL = ggml_rms_norm(ctx0, inpL);
|
inpL = ggml_rms_norm(ctx0, inpL, eps);
|
||||||
|
|
||||||
// inpL = inpL*norm(broadcasted)
|
// inpL = inpL*norm(broadcasted)
|
||||||
inpL = ggml_mul(ctx0, inpL, model.norm);
|
inpL = ggml_mul(ctx0, inpL, model.norm);
|
||||||
@ -1385,7 +1379,7 @@ static bool llama_eval_internal(
|
|||||||
|
|
||||||
// run the computation
|
// run the computation
|
||||||
ggml_build_forward_expand (&gf, inpL);
|
ggml_build_forward_expand (&gf, inpL);
|
||||||
ggml_graph_compute (ctx0, &gf);
|
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||||
|
|
||||||
#ifdef GGML_PERF
|
#ifdef GGML_PERF
|
||||||
// print timing information per ggml operation (for debugging purposes)
|
// print timing information per ggml operation (for debugging purposes)
|
||||||
@ -2488,8 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph gf = ggml_build_forward(r);
|
struct ggml_cgraph gf = ggml_build_forward(r);
|
||||||
gf.n_threads = n_threads;
|
ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
|
||||||
ggml_graph_compute(lora_ctx, &gf);
|
|
||||||
|
|
||||||
// we won't need these tensors again, reset the context to save memory
|
// we won't need these tensors again, reset the context to save memory
|
||||||
ggml_free(lora_ctx);
|
ggml_free(lora_ctx);
|
||||||
@ -2635,7 +2628,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|||||||
|
|
||||||
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
||||||
ggml_cgraph gf{};
|
ggml_cgraph gf{};
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||||
kout3d->data = out;
|
kout3d->data = out;
|
||||||
@ -2655,7 +2647,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|||||||
|
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
||||||
ggml_graph_compute(cpy_ctx, &gf);
|
ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
}
|
}
|
||||||
@ -2743,7 +2735,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|||||||
|
|
||||||
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
||||||
ggml_cgraph gf{};
|
ggml_cgraph gf{};
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||||
kin3d->data = (void *) inp;
|
kin3d->data = (void *) inp;
|
||||||
@ -2763,7 +2754,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|||||||
|
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
||||||
ggml_graph_compute(cpy_ctx, &gf);
|
ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
// Talk with AI
|
// Talk with AI
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "common.h"
|
|
||||||
#include "common-sdl.h"
|
#include "common-sdl.h"
|
||||||
|
#include "common.h"
|
||||||
#include "whisper.h"
|
#include "whisper.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -649,7 +649,10 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
text_to_speak = ::replace(text_to_speak, "\"", "");
|
text_to_speak = ::replace(text_to_speak, "\"", "");
|
||||||
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
|
int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
|
||||||
|
if (ret != 0) {
|
||||||
|
fprintf(stderr, "%s: failed to speak\n", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
audio.clear();
|
audio.clear();
|
||||||
|
|
||||||
|
@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
// create the ggml context
|
// create the ggml context
|
||||||
{
|
{
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
.mem_size = ctx_size,
|
/*.mem_size =*/ ctx_size,
|
||||||
.mem_buffer = NULL,
|
/*.mem_buffer =*/ NULL,
|
||||||
.no_alloc = false,
|
/*.no_alloc =*/ false,
|
||||||
};
|
};
|
||||||
|
|
||||||
model.ctx = ggml_init(params);
|
model.ctx = ggml_init(params);
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
// Talk with AI
|
// Talk with AI
|
||||||
//
|
//
|
||||||
|
|
||||||
#include "common.h"
|
|
||||||
#include "common-sdl.h"
|
#include "common-sdl.h"
|
||||||
|
#include "common.h"
|
||||||
#include "whisper.h"
|
#include "whisper.h"
|
||||||
#include "gpt-2.h"
|
#include "gpt-2.h"
|
||||||
|
|
||||||
@ -349,7 +349,10 @@ int main(int argc, char ** argv) {
|
|||||||
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
|
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
|
||||||
|
|
||||||
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
|
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
|
||||||
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
|
int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
|
||||||
|
if (ret != 0) {
|
||||||
|
fprintf(stderr, "%s: system() failed!\n", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
audio.clear();
|
audio.clear();
|
||||||
|
|
||||||
|
2
examples/whisper.android/.idea/compiler.xml
generated
2
examples/whisper.android/.idea/compiler.xml
generated
@ -1,6 +1,6 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="CompilerConfiguration">
|
<component name="CompilerConfiguration">
|
||||||
<bytecodeTargetLevel target="11" />
|
<bytecodeTargetLevel target="17" />
|
||||||
</component>
|
</component>
|
||||||
</project>
|
</project>
|
4
examples/whisper.android/.idea/gradle.xml
generated
4
examples/whisper.android/.idea/gradle.xml
generated
@ -4,15 +4,15 @@
|
|||||||
<component name="GradleSettings">
|
<component name="GradleSettings">
|
||||||
<option name="linkedExternalProjectsSettings">
|
<option name="linkedExternalProjectsSettings">
|
||||||
<GradleProjectSettings>
|
<GradleProjectSettings>
|
||||||
<option name="testRunner" value="GRADLE" />
|
|
||||||
<option name="distributionType" value="DEFAULT_WRAPPED" />
|
|
||||||
<option name="externalProjectPath" value="$PROJECT_DIR$" />
|
<option name="externalProjectPath" value="$PROJECT_DIR$" />
|
||||||
|
<option name="gradleJvm" value="#GRADLE_LOCAL_JAVA_HOME" />
|
||||||
<option name="modules">
|
<option name="modules">
|
||||||
<set>
|
<set>
|
||||||
<option value="$PROJECT_DIR$" />
|
<option value="$PROJECT_DIR$" />
|
||||||
<option value="$PROJECT_DIR$/app" />
|
<option value="$PROJECT_DIR$/app" />
|
||||||
</set>
|
</set>
|
||||||
</option>
|
</option>
|
||||||
|
<option name="resolveExternalAnnotations" value="false" />
|
||||||
</GradleProjectSettings>
|
</GradleProjectSettings>
|
||||||
</option>
|
</option>
|
||||||
</component>
|
</component>
|
||||||
|
2
examples/whisper.android/.idea/misc.xml
generated
2
examples/whisper.android/.idea/misc.xml
generated
@ -1,7 +1,7 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ExternalStorageConfigurationManager" enabled="true" />
|
<component name="ExternalStorageConfigurationManager" enabled="true" />
|
||||||
<component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="Android Studio default JDK" project-jdk-type="JavaSDK">
|
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="jbr-17" project-jdk-type="JavaSDK">
|
||||||
<output url="file://$PROJECT_DIR$/build/classes" />
|
<output url="file://$PROJECT_DIR$/build/classes" />
|
||||||
</component>
|
</component>
|
||||||
<component name="ProjectType">
|
<component name="ProjectType">
|
||||||
|
@ -5,12 +5,12 @@ plugins {
|
|||||||
|
|
||||||
android {
|
android {
|
||||||
namespace 'com.whispercppdemo'
|
namespace 'com.whispercppdemo'
|
||||||
compileSdk 33
|
compileSdk 34
|
||||||
|
|
||||||
defaultConfig {
|
defaultConfig {
|
||||||
applicationId "com.whispercppdemo"
|
applicationId "com.whispercppdemo"
|
||||||
minSdk 26
|
minSdk 26
|
||||||
targetSdk 32
|
targetSdk 34
|
||||||
versionCode 1
|
versionCode 1
|
||||||
versionName "1.0"
|
versionName "1.0"
|
||||||
|
|
||||||
@ -31,19 +31,19 @@ android {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
compileOptions {
|
compileOptions {
|
||||||
sourceCompatibility JavaVersion.VERSION_1_8
|
sourceCompatibility JavaVersion.VERSION_17
|
||||||
targetCompatibility JavaVersion.VERSION_1_8
|
targetCompatibility JavaVersion.VERSION_17
|
||||||
}
|
}
|
||||||
kotlinOptions {
|
kotlinOptions {
|
||||||
jvmTarget = '1.8'
|
jvmTarget = '17'
|
||||||
}
|
}
|
||||||
buildFeatures {
|
buildFeatures {
|
||||||
compose true
|
compose true
|
||||||
}
|
}
|
||||||
composeOptions {
|
composeOptions {
|
||||||
kotlinCompilerExtensionVersion '1.3.1'
|
kotlinCompilerExtensionVersion '1.5.0'
|
||||||
}
|
}
|
||||||
ndkVersion "25.1.8937393"
|
ndkVersion "25.2.9519653"
|
||||||
externalNativeBuild {
|
externalNativeBuild {
|
||||||
cmake {
|
cmake {
|
||||||
path = file("src/main/jni/whisper/CMakeLists.txt")
|
path = file("src/main/jni/whisper/CMakeLists.txt")
|
||||||
@ -57,19 +57,19 @@ android {
|
|||||||
}
|
}
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation 'androidx.activity:activity-compose:1.6.1'
|
implementation 'androidx.activity:activity-compose:1.7.2'
|
||||||
implementation 'androidx.compose.material:material-icons-core:1.3.1'
|
implementation 'androidx.compose.material:material-icons-core:1.5.0'
|
||||||
implementation 'androidx.compose.material3:material3:1.0.1'
|
implementation 'androidx.compose.material3:material3:1.1.1'
|
||||||
implementation "androidx.compose.ui:ui:1.3.2"
|
implementation "androidx.compose.ui:ui:1.5.0"
|
||||||
implementation "androidx.compose.ui:ui-tooling-preview:1.3.2"
|
implementation "androidx.compose.ui:ui-tooling-preview:1.5.0"
|
||||||
implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.5.1'
|
implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.6.1'
|
||||||
implementation "com.google.accompanist:accompanist-permissions:0.28.0"
|
implementation "com.google.accompanist:accompanist-permissions:0.28.0"
|
||||||
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.6.4'
|
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.2'
|
||||||
|
|
||||||
testImplementation 'junit:junit:4.13.2'
|
testImplementation 'junit:junit:4.13.2'
|
||||||
androidTestImplementation 'androidx.test.ext:junit:1.1.4'
|
androidTestImplementation 'androidx.test.ext:junit:1.1.5'
|
||||||
androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.0'
|
androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1'
|
||||||
androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.3.2"
|
androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.5.0"
|
||||||
debugImplementation "androidx.compose.ui:ui-tooling:1.3.2"
|
debugImplementation "androidx.compose.ui:ui-tooling:1.5.0"
|
||||||
debugImplementation "androidx.compose.ui:ui-test-manifest:1.3.2"
|
debugImplementation "androidx.compose.ui:ui-test-manifest:1.5.0"
|
||||||
}
|
}
|
@ -66,7 +66,7 @@ private fun MainScreen(
|
|||||||
|
|
||||||
@Composable
|
@Composable
|
||||||
private fun MessageLog(log: String) {
|
private fun MessageLog(log: String) {
|
||||||
SelectionContainer() {
|
SelectionContainer {
|
||||||
Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
|
Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -47,7 +47,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private suspend fun printSystemInfo() {
|
private suspend fun printSystemInfo() {
|
||||||
printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
|
printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()))
|
||||||
}
|
}
|
||||||
|
|
||||||
private suspend fun loadData() {
|
private suspend fun loadData() {
|
||||||
|
@ -13,7 +13,7 @@ import androidx.compose.runtime.SideEffect
|
|||||||
import androidx.compose.ui.graphics.toArgb
|
import androidx.compose.ui.graphics.toArgb
|
||||||
import androidx.compose.ui.platform.LocalContext
|
import androidx.compose.ui.platform.LocalContext
|
||||||
import androidx.compose.ui.platform.LocalView
|
import androidx.compose.ui.platform.LocalView
|
||||||
import androidx.core.view.ViewCompat
|
import androidx.core.view.WindowCompat
|
||||||
|
|
||||||
private val DarkColorScheme = darkColorScheme(
|
private val DarkColorScheme = darkColorScheme(
|
||||||
primary = Purple80,
|
primary = Purple80,
|
||||||
@ -55,8 +55,9 @@ fun WhisperCppDemoTheme(
|
|||||||
val view = LocalView.current
|
val view = LocalView.current
|
||||||
if (!view.isInEditMode) {
|
if (!view.isInEditMode) {
|
||||||
SideEffect {
|
SideEffect {
|
||||||
(view.context as Activity).window.statusBarColor = colorScheme.primary.toArgb()
|
val window = (view.context as Activity).window
|
||||||
ViewCompat.getWindowInsetsController(view)?.isAppearanceLightStatusBars = darkTheme
|
window.statusBarColor = colorScheme.primary.toArgb()
|
||||||
|
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,7 +18,9 @@ class WhisperContext private constructor(private var ptr: Long) {
|
|||||||
|
|
||||||
suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) {
|
suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) {
|
||||||
require(ptr != 0L)
|
require(ptr != 0L)
|
||||||
WhisperLib.fullTranscribe(ptr, data)
|
val numThreads = WhisperCpuConfig.preferredThreadCount
|
||||||
|
Log.d(LOG_TAG, "Selecting $numThreads threads")
|
||||||
|
WhisperLib.fullTranscribe(ptr, numThreads, data)
|
||||||
val textCount = WhisperLib.getTextSegmentCount(ptr)
|
val textCount = WhisperLib.getTextSegmentCount(ptr)
|
||||||
return@withContext buildString {
|
return@withContext buildString {
|
||||||
for (i in 0 until textCount) {
|
for (i in 0 until textCount) {
|
||||||
@ -126,7 +128,7 @@ private class WhisperLib {
|
|||||||
external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
|
external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
|
||||||
external fun initContext(modelPath: String): Long
|
external fun initContext(modelPath: String): Long
|
||||||
external fun freeContext(contextPtr: Long)
|
external fun freeContext(contextPtr: Long)
|
||||||
external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
|
external fun fullTranscribe(contextPtr: Long, numThreads: Int, audioData: FloatArray)
|
||||||
external fun getTextSegmentCount(contextPtr: Long): Int
|
external fun getTextSegmentCount(contextPtr: Long): Int
|
||||||
external fun getTextSegment(contextPtr: Long, index: Int): String
|
external fun getTextSegment(contextPtr: Long, index: Int): String
|
||||||
external fun getSystemInfo(): String
|
external fun getSystemInfo(): String
|
||||||
|
@ -0,0 +1,73 @@
|
|||||||
|
package com.whispercppdemo.whisper
|
||||||
|
|
||||||
|
import android.util.Log
|
||||||
|
import java.io.BufferedReader
|
||||||
|
import java.io.FileReader
|
||||||
|
|
||||||
|
object WhisperCpuConfig {
|
||||||
|
val preferredThreadCount: Int
|
||||||
|
// Always use at least 2 threads:
|
||||||
|
get() = CpuInfo.getHighPerfCpuCount().coerceAtLeast(2)
|
||||||
|
}
|
||||||
|
|
||||||
|
private class CpuInfo(private val lines: List<String>) {
|
||||||
|
private fun getHighPerfCpuCount(): Int = try {
|
||||||
|
getHighPerfCpuCountByFrequencies()
|
||||||
|
} catch (e: Exception) {
|
||||||
|
Log.d(LOG_TAG, "Couldn't read CPU frequencies", e)
|
||||||
|
getHighPerfCpuCountByVariant()
|
||||||
|
}
|
||||||
|
|
||||||
|
private fun getHighPerfCpuCountByFrequencies(): Int =
|
||||||
|
getCpuValues(property = "processor") { getMaxCpuFrequency(it.toInt()) }
|
||||||
|
.also { Log.d(LOG_TAG, "Binned cpu frequencies (frequency, count): ${it.binnedValues()}") }
|
||||||
|
.countDroppingMin()
|
||||||
|
|
||||||
|
private fun getHighPerfCpuCountByVariant(): Int =
|
||||||
|
getCpuValues(property = "CPU variant") { it.substringAfter("0x").toInt(radix = 16) }
|
||||||
|
.also { Log.d(LOG_TAG, "Binned cpu variants (variant, count): ${it.binnedValues()}") }
|
||||||
|
.countKeepingMin()
|
||||||
|
|
||||||
|
private fun List<Int>.binnedValues() = groupingBy { it }.eachCount()
|
||||||
|
|
||||||
|
private fun getCpuValues(property: String, mapper: (String) -> Int) = lines
|
||||||
|
.asSequence()
|
||||||
|
.filter { it.startsWith(property) }
|
||||||
|
.map { mapper(it.substringAfter(':').trim()) }
|
||||||
|
.sorted()
|
||||||
|
.toList()
|
||||||
|
|
||||||
|
|
||||||
|
private fun List<Int>.countDroppingMin(): Int {
|
||||||
|
val min = min()
|
||||||
|
return count { it > min }
|
||||||
|
}
|
||||||
|
|
||||||
|
private fun List<Int>.countKeepingMin(): Int {
|
||||||
|
val min = min()
|
||||||
|
return count { it == min }
|
||||||
|
}
|
||||||
|
|
||||||
|
companion object {
|
||||||
|
private const val LOG_TAG = "WhisperCpuConfig"
|
||||||
|
|
||||||
|
fun getHighPerfCpuCount(): Int = try {
|
||||||
|
readCpuInfo().getHighPerfCpuCount()
|
||||||
|
} catch (e: Exception) {
|
||||||
|
Log.d(LOG_TAG, "Couldn't read CPU info", e)
|
||||||
|
// Our best guess -- just return the # of CPUs minus 4.
|
||||||
|
(Runtime.getRuntime().availableProcessors() - 4).coerceAtLeast(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
private fun readCpuInfo() = CpuInfo(
|
||||||
|
BufferedReader(FileReader("/proc/cpuinfo"))
|
||||||
|
.useLines { it.toList() }
|
||||||
|
)
|
||||||
|
|
||||||
|
private fun getMaxCpuFrequency(cpuIndex: Int): Int {
|
||||||
|
val path = "/sys/devices/system/cpu/cpu${cpuIndex}/cpufreq/cpuinfo_max_freq"
|
||||||
|
val maxFreq = BufferedReader(FileReader(path)).use { it.readLine() }
|
||||||
|
return maxFreq.toInt()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -163,16 +163,12 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(
|
|||||||
|
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
|
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
|
||||||
JNIEnv *env, jobject thiz, jlong context_ptr, jfloatArray audio_data) {
|
JNIEnv *env, jobject thiz, jlong context_ptr, jint num_threads, jfloatArray audio_data) {
|
||||||
UNUSED(thiz);
|
UNUSED(thiz);
|
||||||
struct whisper_context *context = (struct whisper_context *) context_ptr;
|
struct whisper_context *context = (struct whisper_context *) context_ptr;
|
||||||
jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL);
|
jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL);
|
||||||
const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data);
|
const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data);
|
||||||
|
|
||||||
// Leave 2 processors free (i.e. the high-efficiency cores).
|
|
||||||
int max_threads = max(1, min(8, get_nprocs() - 2));
|
|
||||||
LOGI("Selecting %d threads", max_threads);
|
|
||||||
|
|
||||||
// The below adapted from the Objective-C iOS sample
|
// The below adapted from the Objective-C iOS sample
|
||||||
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||||
params.print_realtime = true;
|
params.print_realtime = true;
|
||||||
@ -181,7 +177,7 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
|
|||||||
params.print_special = false;
|
params.print_special = false;
|
||||||
params.translate = false;
|
params.translate = false;
|
||||||
params.language = "en";
|
params.language = "en";
|
||||||
params.n_threads = max_threads;
|
params.n_threads = num_threads;
|
||||||
params.offset_ms = 0;
|
params.offset_ms = 0;
|
||||||
params.no_context = true;
|
params.no_context = true;
|
||||||
params.single_segment = false;
|
params.single_segment = false;
|
||||||
|
@ -1,10 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="utf-8"?>
|
|
||||||
<resources>
|
|
||||||
<color name="purple_200">#FFBB86FC</color>
|
|
||||||
<color name="purple_500">#FF6200EE</color>
|
|
||||||
<color name="purple_700">#FF3700B3</color>
|
|
||||||
<color name="teal_200">#FF03DAC5</color>
|
|
||||||
<color name="teal_700">#FF018786</color>
|
|
||||||
<color name="black">#FF000000</color>
|
|
||||||
<color name="white">#FFFFFFFF</color>
|
|
||||||
</resources>
|
|
@ -1,6 +1,6 @@
|
|||||||
// Top-level build file where you can add configuration options common to all sub-projects/modules.
|
// Top-level build file where you can add configuration options common to all sub-projects/modules.
|
||||||
plugins {
|
plugins {
|
||||||
id 'com.android.application' version '7.3.1' apply false
|
id 'com.android.application' version '8.1.1' apply false
|
||||||
id 'com.android.library' version '7.3.1' apply false
|
id 'com.android.library' version '8.1.1' apply false
|
||||||
id 'org.jetbrains.kotlin.android' version '1.7.10' apply false
|
id 'org.jetbrains.kotlin.android' version '1.9.0' apply false
|
||||||
}
|
}
|
@ -1,6 +1,6 @@
|
|||||||
#Wed Dec 14 10:37:24 EST 2022
|
#Wed Dec 14 10:37:24 EST 2022
|
||||||
distributionBase=GRADLE_USER_HOME
|
distributionBase=GRADLE_USER_HOME
|
||||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
|
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
|
||||||
distributionPath=wrapper/dists
|
distributionPath=wrapper/dists
|
||||||
zipStorePath=wrapper/dists
|
zipStorePath=wrapper/dists
|
||||||
zipStoreBase=GRADLE_USER_HOME
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
|
32
ggml-cuda.cu
32
ggml-cuda.cu
@ -4086,7 +4086,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
|
|||||||
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
|
||||||
|
const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
|
||||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
const int half_n_dims = ncols/4;
|
const int half_n_dims = ncols/4;
|
||||||
|
|
||||||
@ -4098,8 +4099,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
|
|||||||
const int i = row*ncols + col;
|
const int i = row*ncols + col;
|
||||||
|
|
||||||
const float col_theta_scale = powf(theta_scale, col);
|
const float col_theta_scale = powf(theta_scale, col);
|
||||||
|
const float p = p0 + p_delta*(row/p_delta_rows);
|
||||||
|
|
||||||
const float theta = p*col_theta_scale;
|
const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
|
||||||
const float sin_theta = sinf(theta);
|
const float sin_theta = sinf(theta);
|
||||||
const float cos_theta = cosf(theta);
|
const float cos_theta = cosf(theta);
|
||||||
|
|
||||||
@ -4109,7 +4111,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
|
|||||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||||
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
|
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
|
||||||
|
|
||||||
const float block_theta = block_p*col_theta_scale;
|
const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
|
||||||
const float sin_block_theta = sinf(block_theta);
|
const float sin_block_theta = sinf(block_theta);
|
||||||
const float cos_block_theta = cosf(block_theta);
|
const float cos_block_theta = cosf(block_theta);
|
||||||
|
|
||||||
@ -4984,12 +4986,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
|
|||||||
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
||||||
GGML_ASSERT(nrows % 4 == 0);
|
const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
|
||||||
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
GGML_ASSERT(ncols % 4 == 0);
|
||||||
const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
|
const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
|
||||||
|
const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
|
||||||
const dim3 block_nums(num_blocks_x, nrows, 1);
|
const dim3 block_nums(num_blocks_x, nrows, 1);
|
||||||
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
|
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
|
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
|
||||||
@ -5723,22 +5726,18 @@ inline void ggml_cuda_op_rope(
|
|||||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
||||||
|
|
||||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||||
|
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
||||||
|
|
||||||
const bool is_neox = mode & 2;
|
const bool is_neox = mode & 2;
|
||||||
const bool is_glm = mode & 4;
|
const bool is_glm = mode & 4;
|
||||||
|
|
||||||
// compute
|
// compute
|
||||||
if (is_glm) {
|
if (is_glm) {
|
||||||
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
|
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, n_ctx, cudaStream_main);
|
||||||
const float id_p = min(p, n_ctx - 2.f);
|
|
||||||
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
|
||||||
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
|
||||||
} else if (is_neox) {
|
} else if (is_neox) {
|
||||||
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
||||||
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
|
||||||
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||||
} else {
|
} else {
|
||||||
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
|
||||||
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6400,10 +6399,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
|
|||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||||
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
|
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
|
||||||
|
|
||||||
const int mode = ((int32_t *) dst->op_params)[2];
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, true);
|
||||||
const bool is_glm = mode & 4;
|
|
||||||
|
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
@ -327,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||||||
|
|
||||||
void * ggml_metal_host_malloc(size_t n) {
|
void * ggml_metal_host_malloc(size_t n) {
|
||||||
void * data = NULL;
|
void * data = NULL;
|
||||||
const int result = posix_memalign((void **) &data, getpagesize(), n);
|
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
metal_printf("%s: error: posix_memalign failed\n", __func__);
|
metal_printf("%s: error: posix_memalign failed\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -401,7 +401,7 @@ bool ggml_metal_add_buffer(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const size_t size_page = getpagesize();
|
const size_t size_page = sysconf(_SC_PAGESIZE);
|
||||||
|
|
||||||
size_t size_aligned = size;
|
size_t size_aligned = size;
|
||||||
if ((size_aligned % size_page) != 0) {
|
if ((size_aligned % size_page) != 0) {
|
||||||
@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute(
|
|||||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
||||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
case GGML_OP_CPY:
|
case GGML_OP_CPY:
|
||||||
|
@ -220,14 +220,10 @@ kernel void kernel_norm(
|
|||||||
}
|
}
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
}
|
}
|
||||||
//// broadcast
|
const float mean = sum[0] / ne00;
|
||||||
//if (tpitg == 0) {
|
|
||||||
// sum[0] /= ne00;
|
|
||||||
//}
|
|
||||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
const float mean = sum[0];
|
|
||||||
|
|
||||||
// recenter and VARIANCE
|
// recenter and VARIANCE
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
device float * y = dst + tgpig*ne00;
|
device float * y = dst + tgpig*ne00;
|
||||||
sum[tpitg] = 0.0f;
|
sum[tpitg] = 0.0f;
|
||||||
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
@ -235,12 +231,6 @@ kernel void kernel_norm(
|
|||||||
sum[tpitg] += y[i00] * y[i00];
|
sum[tpitg] += y[i00] * y[i00];
|
||||||
}
|
}
|
||||||
|
|
||||||
//// VARIANCE
|
|
||||||
//// parallel sum
|
|
||||||
//sum[tpitg] = 0.0f;
|
|
||||||
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
||||||
// sum[tpitg] += y[i00] * y[i00];
|
|
||||||
//}
|
|
||||||
// reduce
|
// reduce
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
for (uint i = ntg/2; i > 0; i /= 2) {
|
for (uint i = ntg/2; i > 0; i /= 2) {
|
||||||
@ -249,12 +239,7 @@ kernel void kernel_norm(
|
|||||||
}
|
}
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
}
|
}
|
||||||
//// broadcast
|
const float variance = sum[0] / ne00;
|
||||||
//if (tpitg == 0) {
|
|
||||||
// sum[0] /= ne00;
|
|
||||||
//}
|
|
||||||
//threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
const float variance = sum[0];
|
|
||||||
|
|
||||||
const float scale = 1.0f/sqrt(variance + eps);
|
const float scale = 1.0f/sqrt(variance + eps);
|
||||||
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
@ -262,7 +247,6 @@ kernel void kernel_norm(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
kernel void kernel_rms_norm(
|
kernel void kernel_rms_norm(
|
||||||
device const void * src0,
|
device const void * src0,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
kernel void kernel_alibi_f32(
|
kernel void kernel_alibi_f32(
|
||||||
@ -699,25 +682,27 @@ kernel void kernel_rope(
|
|||||||
constant int & mode,
|
constant int & mode,
|
||||||
constant float & freq_base,
|
constant float & freq_base,
|
||||||
constant float & freq_scale,
|
constant float & freq_scale,
|
||||||
uint3 tpig[[thread_position_in_grid]]) {
|
uint tiitg[[thread_index_in_threadgroup]],
|
||||||
const int64_t i3 = tpig[2];
|
uint3 tptg[[threads_per_threadgroup]],
|
||||||
const int64_t i2 = tpig[1];
|
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||||
const int64_t i1 = tpig[0];
|
const int64_t i3 = tgpig[2];
|
||||||
|
const int64_t i2 = tgpig[1];
|
||||||
|
const int64_t i1 = tgpig[0];
|
||||||
|
|
||||||
const bool is_neox = mode & 2;
|
const bool is_neox = mode & 2;
|
||||||
const float theta_scale = pow(freq_base, -2.0f/n_dims);
|
|
||||||
|
|
||||||
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
||||||
|
|
||||||
float theta = freq_scale * (float)p;
|
const float theta_0 = freq_scale * (float)p;
|
||||||
|
const float inv_ndims = -1.f/n_dims;
|
||||||
|
|
||||||
if (!is_neox) {
|
if (!is_neox) {
|
||||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
|
||||||
|
|
||||||
|
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
|
||||||
const float cos_theta = cos(theta);
|
const float cos_theta = cos(theta);
|
||||||
const float sin_theta = sin(theta);
|
const float sin_theta = sin(theta);
|
||||||
|
|
||||||
theta *= theta_scale;
|
|
||||||
|
|
||||||
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||||
|
|
||||||
@ -729,12 +714,12 @@ kernel void kernel_rope(
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
|
||||||
|
|
||||||
|
const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
|
||||||
const float cos_theta = cos(theta);
|
const float cos_theta = cos(theta);
|
||||||
const float sin_theta = sin(theta);
|
const float sin_theta = sin(theta);
|
||||||
|
|
||||||
theta *= theta_scale;
|
|
||||||
|
|
||||||
const int64_t i0 = ib*n_dims + ic/2;
|
const int64_t i0 = ib*n_dims + ic/2;
|
||||||
|
|
||||||
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
||||||
|
47
ggml.c
47
ggml.c
@ -1,4 +1,3 @@
|
|||||||
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
|
|
||||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
|
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
@ -107,6 +106,9 @@ typedef void * thread_ret_t;
|
|||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
#include <hbwmalloc.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||||
@ -196,9 +198,15 @@ typedef void * thread_ret_t;
|
|||||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
||||||
#else
|
#else
|
||||||
inline static void * ggml_aligned_malloc(size_t size) {
|
inline static void * ggml_aligned_malloc(size_t size) {
|
||||||
|
if (size == 0) {
|
||||||
|
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
void * aligned_memory = NULL;
|
void * aligned_memory = NULL;
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_CPU_HBM
|
||||||
int result = posix_memalign(&aligned_memory, getpagesize(), size);
|
int result = hbw_posix_memalign(&aligned_memory, 16, size);
|
||||||
|
#elif GGML_USE_METAL
|
||||||
|
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
|
||||||
#else
|
#else
|
||||||
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
||||||
#endif
|
#endif
|
||||||
@ -219,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||||||
return aligned_memory;
|
return aligned_memory;
|
||||||
}
|
}
|
||||||
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
|
||||||
|
#else
|
||||||
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#define UNUSED GGML_UNUSED
|
#define UNUSED GGML_UNUSED
|
||||||
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
||||||
@ -4572,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// allow to call ggml_init with 0 size
|
||||||
|
if (params.mem_size == 0) {
|
||||||
|
params.mem_size = GGML_MEM_ALIGN;
|
||||||
|
}
|
||||||
|
|
||||||
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
|
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
|
||||||
|
|
||||||
*ctx = (struct ggml_context) {
|
*ctx = (struct ggml_context) {
|
||||||
@ -4774,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|||||||
|
|
||||||
size_t obj_alloc_size = 0;
|
size_t obj_alloc_size = 0;
|
||||||
|
|
||||||
if (view_src == NULL && ctx->no_alloc == false) {
|
if (view_src == NULL && !ctx->no_alloc) {
|
||||||
if (ctx->scratch.data != NULL) {
|
if (ctx->scratch.data != NULL) {
|
||||||
// allocate tensor data in the scratch buffer
|
// allocate tensor data in the scratch buffer
|
||||||
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
|
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
|
||||||
@ -5475,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (inplace) {
|
if (inplace) {
|
||||||
GGML_ASSERT(is_node == false);
|
GGML_ASSERT(!is_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
@ -5518,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (inplace) {
|
if (inplace) {
|
||||||
GGML_ASSERT(is_node == false);
|
GGML_ASSERT(!is_node);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||||
@ -17266,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||||||
} else {
|
} else {
|
||||||
// wait for other threads to finish
|
// wait for other threads to finish
|
||||||
const int last = node_n;
|
const int last = node_n;
|
||||||
do {
|
while (true) {
|
||||||
//sched_yield();
|
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
||||||
|
// depending on the workload and the operating system.
|
||||||
|
// since it is not clear what is the best approach, it should potentially become user-configurable
|
||||||
|
// ref: https://github.com/ggerganov/ggml/issues/291
|
||||||
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||||
|
sched_yield();
|
||||||
|
#endif
|
||||||
|
|
||||||
node_n = atomic_load(&state->shared->node_n);
|
node_n = atomic_load(&state->shared->node_n);
|
||||||
} while (node_n == last);
|
if (node_n != last) break;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if we should stop
|
// check if we should stop
|
||||||
@ -19962,7 +19987,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
|
|
||||||
struct ggml_tensor * data = NULL;
|
struct ggml_tensor * data = NULL;
|
||||||
|
|
||||||
if (params.no_alloc == false) {
|
if (!params.no_alloc) {
|
||||||
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
|
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
|
||||||
|
|
||||||
ok = ok && data != NULL;
|
ok = ok && data != NULL;
|
||||||
@ -20003,7 +20028,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
}
|
}
|
||||||
|
|
||||||
// point the data member to the appropriate location in the binary blob using the tensor infos
|
// point the data member to the appropriate location in the binary blob using the tensor infos
|
||||||
if (params.no_alloc == false) {
|
if (!params.no_alloc) {
|
||||||
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
|
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
|
||||||
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
|
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,28 @@ function get_script_path() {
|
|||||||
models_path="$(get_script_path)"
|
models_path="$(get_script_path)"
|
||||||
|
|
||||||
# Whisper models
|
# Whisper models
|
||||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small.en-tdrz" "small" "medium.en" "medium" "large-v1" "large" )
|
models=(
|
||||||
|
"tiny.en"
|
||||||
|
"tiny"
|
||||||
|
"tiny-q5_1"
|
||||||
|
"tiny.en-q5_1"
|
||||||
|
"base.en"
|
||||||
|
"base"
|
||||||
|
"base-q5_1"
|
||||||
|
"base.en-q5_1"
|
||||||
|
"small.en"
|
||||||
|
"small.en-tdrz"
|
||||||
|
"small"
|
||||||
|
"small-q5_1"
|
||||||
|
"small.en-q5_1"
|
||||||
|
"medium"
|
||||||
|
"medium.en"
|
||||||
|
"medium-q5_0"
|
||||||
|
"medium.en-q5_0"
|
||||||
|
"large-v1"
|
||||||
|
"large"
|
||||||
|
"large-q5_0"
|
||||||
|
)
|
||||||
|
|
||||||
# list available models
|
# list available models
|
||||||
function list_models {
|
function list_models {
|
||||||
|
207
whisper.cpp
207
whisper.cpp
@ -18,6 +18,7 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -117,6 +118,21 @@ static void byteswap_tensor(ggml_tensor * tensor) {
|
|||||||
#define WHISPER_USE_SCRATCH
|
#define WHISPER_USE_SCRATCH
|
||||||
#define WHISPER_MAX_SCRATCH_BUFFERS 16
|
#define WHISPER_MAX_SCRATCH_BUFFERS 16
|
||||||
|
|
||||||
|
//
|
||||||
|
// ggml helpers
|
||||||
|
//
|
||||||
|
|
||||||
|
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||||
|
|
||||||
|
if (plan.work_size > 0) {
|
||||||
|
buf.resize(plan.work_size);
|
||||||
|
plan.work_data = buf.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_graph_compute(graph, &plan);
|
||||||
|
}
|
||||||
|
|
||||||
// available whisper models
|
// available whisper models
|
||||||
enum e_model {
|
enum e_model {
|
||||||
MODEL_UNKNOWN,
|
MODEL_UNKNOWN,
|
||||||
@ -537,6 +553,7 @@ struct whisper_kv_cache {
|
|||||||
|
|
||||||
struct ggml_context * ctx;
|
struct ggml_context * ctx;
|
||||||
|
|
||||||
|
// buf points to the memory allocated for both ggml_tensor 'k' and 'v' (see kv_cache_init)
|
||||||
std::vector<uint8_t> buf;
|
std::vector<uint8_t> buf;
|
||||||
|
|
||||||
int n; // number of tokens currently in the cache
|
int n; // number of tokens currently in the cache
|
||||||
@ -602,7 +619,7 @@ struct whisper_sequence {
|
|||||||
|
|
||||||
// TAGS: WHISPER_DECODER_INIT
|
// TAGS: WHISPER_DECODER_INIT
|
||||||
struct whisper_decoder {
|
struct whisper_decoder {
|
||||||
// each decoders keeps its own KV-cache
|
// each decoder keeps its own KV-cache
|
||||||
whisper_kv_cache kv_self;
|
whisper_kv_cache kv_self;
|
||||||
|
|
||||||
// the currently generated sequence of tokens
|
// the currently generated sequence of tokens
|
||||||
@ -622,6 +639,24 @@ struct whisper_decoder {
|
|||||||
std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
|
std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// use a custom pair struct instead of std::pair (reason: std::pair is very slow)
|
||||||
|
template<typename A, typename B>
|
||||||
|
struct whisper_pair {
|
||||||
|
A first;
|
||||||
|
B second;
|
||||||
|
|
||||||
|
// Define a constructor that takes two arguments.
|
||||||
|
whisper_pair(const A& a, const B& b) : first(a), second(b) {}
|
||||||
|
// Define a constructor that takes no argument.
|
||||||
|
whisper_pair() : first(A()), second(B()) {}
|
||||||
|
};
|
||||||
|
|
||||||
|
// beam-search helpers
|
||||||
|
struct kv_buf {
|
||||||
|
std::vector<uint8_t> k;
|
||||||
|
std::vector<uint8_t> v;
|
||||||
|
};
|
||||||
|
|
||||||
struct whisper_state {
|
struct whisper_state {
|
||||||
int64_t t_sample_us = 0;
|
int64_t t_sample_us = 0;
|
||||||
int64_t t_encode_us = 0;
|
int64_t t_encode_us = 0;
|
||||||
@ -641,8 +676,12 @@ struct whisper_state {
|
|||||||
|
|
||||||
whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
|
whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
|
||||||
|
|
||||||
|
// buffer for swapping KV caches between decoders during beam-search
|
||||||
|
std::vector<kv_buf> kv_swap_bufs;
|
||||||
|
|
||||||
// memory buffers used by encode / decode contexts
|
// memory buffers used by encode / decode contexts
|
||||||
std::vector<uint8_t> buf_compute;
|
std::vector<uint8_t> buf_compute;
|
||||||
|
std::vector<uint8_t> buf_work;
|
||||||
std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
|
std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
|
||||||
|
|
||||||
int buf_last = 0;
|
int buf_last = 0;
|
||||||
@ -655,7 +694,7 @@ struct whisper_state {
|
|||||||
std::vector<whisper_token> prompt_past;
|
std::vector<whisper_token> prompt_past;
|
||||||
|
|
||||||
// work container used to avoid memory allocations
|
// work container used to avoid memory allocations
|
||||||
std::vector<std::pair<double, whisper_vocab::id>> logits_id;
|
std::vector<whisper_pair<double, whisper_vocab::id>> logits_id;
|
||||||
|
|
||||||
mutable std::mt19937 rng; // used for sampling at t > 0.0
|
mutable std::mt19937 rng; // used for sampling at t > 0.0
|
||||||
|
|
||||||
@ -1808,7 +1847,7 @@ static bool whisper_encode_internal(
|
|||||||
struct ggml_cgraph gf = {};
|
struct ggml_cgraph gf = {};
|
||||||
|
|
||||||
ggml_build_forward_expand(&gf, cur);
|
ggml_build_forward_expand(&gf, cur);
|
||||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
|
||||||
|
|
||||||
//ggml_graph_print(&gf);
|
//ggml_graph_print(&gf);
|
||||||
}
|
}
|
||||||
@ -1893,7 +1932,7 @@ static bool whisper_encode_internal(
|
|||||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
|
||||||
//ggml_graph_print(&gf);
|
//ggml_graph_print(&gf);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2307,7 +2346,7 @@ static bool whisper_decode_internal(
|
|||||||
// run the computation
|
// run the computation
|
||||||
{
|
{
|
||||||
ggml_build_forward_expand(&gf, logits);
|
ggml_build_forward_expand(&gf, logits);
|
||||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
// extract logits for all N tokens
|
// extract logits for all N tokens
|
||||||
@ -3975,17 +4014,21 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
|
|||||||
|
|
||||||
auto & logits_id = state.logits_id;
|
auto & logits_id = state.logits_id;
|
||||||
|
|
||||||
logits_id.clear();
|
logits_id.resize(n_logits);
|
||||||
for (int i = 0; i < n_logits; ++i) {
|
for (int i = 0; i < n_logits; ++i) {
|
||||||
logits_id.push_back({ logits[i], i });
|
logits_id[i].first = logits[i];
|
||||||
|
logits_id[i].second = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
|
||||||
std::partial_sort(
|
std::partial_sort(
|
||||||
logits_id.begin(),
|
logits_id.begin(),
|
||||||
logits_id.begin() + k, logits_id.end(),
|
logits_id.begin() + k, logits_id.end(),
|
||||||
[](const std::pair<double, whisper_token> & a, const std::pair<double, whisper_token> & b) {
|
[](const pair_type & a, const pair_type & b) {
|
||||||
return a.first > b.first;
|
return a.first > b.first;
|
||||||
});
|
});
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<whisper_token_data> result;
|
std::vector<whisper_token_data> result;
|
||||||
result.reserve(k);
|
result.reserve(k);
|
||||||
@ -4080,6 +4123,115 @@ static void whisper_sequence_score(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool whisper_kv_swap_fast(
|
||||||
|
std::vector<int> & view,
|
||||||
|
whisper_decoder src[],
|
||||||
|
std::vector<kv_buf> & kv_swap_bufs,
|
||||||
|
const int & n_decoders) {
|
||||||
|
WHISPER_PRINT_DEBUG("%s: n_decoders %d\n", __func__, n_decoders);
|
||||||
|
|
||||||
|
// (decoder->buffer->decoder or decoder->buffer + decoder->decoder)
|
||||||
|
std::set<int> two_copy; // decoder indices that require two copies to safely modify KV caches
|
||||||
|
|
||||||
|
// (buffer->decoder or decoder->decoder)
|
||||||
|
std::set<int> one_copy; // decoder indices that require one copy to safely modify KV caches
|
||||||
|
|
||||||
|
// (decoder<->decoder)
|
||||||
|
std::set<int> p_swap_set; // decoder indices able to swap KV-cache pointers
|
||||||
|
std::vector<whisper_pair<int, int>> p_swap_vec;
|
||||||
|
p_swap_vec.reserve(n_decoders);
|
||||||
|
|
||||||
|
// see https://github.com/ggerganov/whisper.cpp/wiki
|
||||||
|
for (int i = 0; i < n_decoders; i++) {
|
||||||
|
// zero-copy (no modification)
|
||||||
|
if (i == view[i] || view[i] < 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_one_copy = true;
|
||||||
|
// since we modify data sequentially, we only consider decoder indices after current index
|
||||||
|
for (int j = i + 1; j < n_decoders; j++) {
|
||||||
|
if (i == view[j]) {
|
||||||
|
// detect symmetric diagram
|
||||||
|
if (j == view[i]) {
|
||||||
|
p_swap_set.insert(i);
|
||||||
|
p_swap_set.insert(j);
|
||||||
|
p_swap_vec.emplace_back(i, j);
|
||||||
|
} else {
|
||||||
|
two_copy.insert(i);
|
||||||
|
is_one_copy = false;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (is_one_copy) {
|
||||||
|
one_copy.insert(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
kv_swap_bufs.resize(n_decoders);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_decoders; i++) {
|
||||||
|
kv_swap_bufs[i].k.resize(ggml_nbytes(src[i].kv_self.k));
|
||||||
|
kv_swap_bufs[i].v.resize(ggml_nbytes(src[i].kv_self.v));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto & i : two_copy) {
|
||||||
|
// make a copy of KV caches
|
||||||
|
WHISPER_PRINT_DEBUG("%s: store KV cache into swap: idx %d\n", __func__, i);
|
||||||
|
memcpy(kv_swap_bufs[i].k.data(), src[i].kv_self.k->data, kv_swap_bufs[i].k.size());
|
||||||
|
memcpy(kv_swap_bufs[i].v.data(), src[i].kv_self.v->data, kv_swap_bufs[i].v.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// since two-copy decoder KV caches are protected by kv_swap_bufs, modify them first
|
||||||
|
for (auto & i : two_copy) {
|
||||||
|
// skip the decoder indices that require pointer swapping
|
||||||
|
if (p_swap_set.find(i) != p_swap_set.end()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (two_copy.find(view[i]) != two_copy.end()) {
|
||||||
|
// modify KV caches of decoder using data from kv_swap_bufs
|
||||||
|
WHISPER_PRINT_DEBUG("%s: two-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
|
||||||
|
memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
|
||||||
|
memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
|
||||||
|
} else {
|
||||||
|
// modify KV caches of decoder using data from the corresponding decoder's KV caches directly
|
||||||
|
WHISPER_PRINT_DEBUG("%s: two-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i);
|
||||||
|
memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
|
||||||
|
memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// then modify one-copy decoder KV caches
|
||||||
|
for (auto & i : one_copy) {
|
||||||
|
// skip the decoder indices that require pointer swapping
|
||||||
|
if (p_swap_set.find(i) != p_swap_set.end()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (two_copy.find(view[i]) != two_copy.end()) {
|
||||||
|
// modify KV caches of decoder using data from kv_swap_bufs
|
||||||
|
WHISPER_PRINT_DEBUG("%s: one-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
|
||||||
|
memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
|
||||||
|
memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
|
||||||
|
} else {
|
||||||
|
// modify KV caches of decoder using data from the corresponding decoder's KV caches directly
|
||||||
|
WHISPER_PRINT_DEBUG("%s: one-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i);
|
||||||
|
memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
|
||||||
|
memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// swap the pointers
|
||||||
|
for (auto & i : p_swap_vec) {
|
||||||
|
WHISPER_PRINT_DEBUG("%s: swap pointers: %d <-> %d\n", __func__, i.first, i.second);
|
||||||
|
std::swap(src[i.first].kv_self, src[i.second].kv_self);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
int whisper_full_with_state(
|
int whisper_full_with_state(
|
||||||
struct whisper_context * ctx,
|
struct whisper_context * ctx,
|
||||||
struct whisper_state * state,
|
struct whisper_state * state,
|
||||||
@ -4243,14 +4395,6 @@ int whisper_full_with_state(
|
|||||||
std::vector<whisper_token> prompt;
|
std::vector<whisper_token> prompt;
|
||||||
prompt.reserve(whisper_n_text_ctx(ctx));
|
prompt.reserve(whisper_n_text_ctx(ctx));
|
||||||
|
|
||||||
// beam-search helpers
|
|
||||||
struct kv_buf {
|
|
||||||
std::vector<uint8_t> k;
|
|
||||||
std::vector<uint8_t> v;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<kv_buf> kv_bufs;
|
|
||||||
|
|
||||||
struct beam_candidate {
|
struct beam_candidate {
|
||||||
int decoder_idx;
|
int decoder_idx;
|
||||||
int seek_delta;
|
int seek_delta;
|
||||||
@ -4399,23 +4543,7 @@ int whisper_full_with_state(
|
|||||||
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
|
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
// store the KV caches of all decoders when doing beam-search
|
|
||||||
if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
|
if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
|
||||||
kv_bufs.resize(n_decoders_cur);
|
|
||||||
for (int j = 0; j < n_decoders_cur; ++j) {
|
|
||||||
auto & decoder = state->decoders[j];
|
|
||||||
|
|
||||||
if (decoder.completed || decoder.failed) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
|
|
||||||
kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
|
|
||||||
|
|
||||||
memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
|
|
||||||
memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
beam_candidates.clear();
|
beam_candidates.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4463,6 +4591,7 @@ int whisper_full_with_state(
|
|||||||
});
|
});
|
||||||
|
|
||||||
uint32_t cur_c = 0;
|
uint32_t cur_c = 0;
|
||||||
|
std::vector<int> decoder_idx(n_decoders_cur, -1);
|
||||||
|
|
||||||
for (int j = 0; j < n_decoders_cur; ++j) {
|
for (int j = 0; j < n_decoders_cur; ++j) {
|
||||||
auto & decoder = state->decoders[j];
|
auto & decoder = state->decoders[j];
|
||||||
@ -4481,12 +4610,13 @@ int whisper_full_with_state(
|
|||||||
decoder.seek_delta = cur.seek_delta;
|
decoder.seek_delta = cur.seek_delta;
|
||||||
decoder.has_ts = cur.has_ts;
|
decoder.has_ts = cur.has_ts;
|
||||||
|
|
||||||
memcpy(decoder.kv_self.k->data, kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size());
|
decoder_idx[j] = cur.decoder_idx;
|
||||||
memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size());
|
|
||||||
|
|
||||||
WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
|
WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
|
||||||
__func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
|
__func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// update KV caches
|
||||||
|
whisper_kv_swap_fast(decoder_idx, state->decoders, state->kv_swap_bufs, n_decoders_cur);
|
||||||
}
|
}
|
||||||
|
|
||||||
// update the decoder state
|
// update the decoder state
|
||||||
@ -5111,7 +5241,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|||||||
// b: N*N*sizeof(float)
|
// b: N*N*sizeof(float)
|
||||||
// c: N*N*sizeof(float)
|
// c: N*N*sizeof(float)
|
||||||
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
||||||
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
|
std::vector<uint8_t> buf (3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
|
||||||
|
std::vector<uint8_t> work(1llu*N_max*N_max*sizeof(float) + 1*ggml_tensor_overhead());
|
||||||
|
|
||||||
// put a bunch of random data in the buffer
|
// put a bunch of random data in the buffer
|
||||||
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
||||||
@ -5166,12 +5297,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|||||||
double tsum = 0.0;
|
double tsum = 0.0;
|
||||||
|
|
||||||
// heat-up
|
// heat-up
|
||||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
ggml_graph_compute_helper(work, &gf, n_threads);
|
||||||
|
|
||||||
for (int i = 0; i < n_max; ++i) {
|
for (int i = 0; i < n_max; ++i) {
|
||||||
const int64_t t0 = ggml_time_us();
|
const int64_t t0 = ggml_time_us();
|
||||||
|
|
||||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
ggml_graph_compute_helper(work, &gf, n_threads);
|
||||||
|
|
||||||
const int64_t t1 = ggml_time_us();
|
const int64_t t1 = ggml_time_us();
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user