mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-07-04 00:11:12 +02:00)

Compare commits: 19 commits, grammar-de...fix-bench
Commits (SHA1):

09a6325de5
39c4fc59dd
9b14418863
6ddc727fac
acb5278cc8
0839209cab
b39809668a
3e9edc6845
bfc73f1fa2
f00c9bba33
b55b505690
2818de21ff
aed5d40607
afa5477d1c
01fcd42431
f990610776
64cb45fd79
ace6c12ec6
cac75be05b
.github/workflows/build.yml (vendored): 6 lines changed
@@ -428,15 +428,15 @@ jobs:
       - name: Publish package
         if: ${{ github.ref == 'refs/heads/master' }}
-        uses: gradle/gradle-build-action@v2
+        uses: gradle/gradle-build-action@v2.4.2
         with:
           arguments: publish
           build-root-directory: bindings/java
         env:
           MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
           MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
          # MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
          # MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
           PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
           PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}

   quantize:
     runs-on: ubuntu-latest
.gitignore (vendored): 1 line changed

@@ -11,6 +11,7 @@ build/
build-em/
build-debug/
build-release/
build-rwdi/
build-static/
build-cublas/
build-no-accel/
@@ -321,6 +321,53 @@ else()
     endif()
 endif()
+
+#
+# POSIX conformance
+#
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+add_compile_definitions(_XOPEN_SOURCE=600)
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    remove_definitions(-D_XOPEN_SOURCE=600)
+    add_compile_definitions(_XOPEN_SOURCE=700)
+endif()
+
+# Data types, macros and functions related to controlling CPU affinity
+# are available on Linux through GNU extensions in libc
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    add_compile_definitions(_GNU_SOURCE)
+endif()
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
+    add_compile_definitions(_DARWIN_C_SOURCE)
+endif()
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
+    add_compile_definitions(__BSD_VISIBLE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
+    add_compile_definitions(_NETBSD_SOURCE)
+endif()
+if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
+    add_compile_definitions(_BSD_SOURCE)
+endif()

 if (WHISPER_PERF)
     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()
Makefile: 47 lines changed
@@ -42,18 +42,55 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS  =

# ref: https://github.com/ggerganov/whisper.cpp/issues/37
ifneq ($(wildcard /usr/include/musl/*),)
	CFLAGS   += -D_POSIX_SOURCE -D_GNU_SOURCE
	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
CFLAGS   += -D_XOPEN_SOURCE=600
CXXFLAGS += -D_XOPEN_SOURCE=600

# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
ifeq ($(UNAME_S),OpenBSD)
	CFLAGS   += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
	CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
endif

# Data types, macros and functions related to controlling CPU affinity
# are available on Linux through GNU extensions in libc
ifeq ($(UNAME_S),Linux)
	CFLAGS   += -D_GNU_SOURCE
	CXXFLAGS += -D_GNU_SOURCE
endif

# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
ifeq ($(UNAME_S),Darwin)
	CFLAGS   += -D_DARWIN_C_SOURCE
	CXXFLAGS += -D_DARWIN_C_SOURCE
endif
ifeq ($(UNAME_S),DragonFly)
	CFLAGS   += -D__BSD_VISIBLE
	CXXFLAGS += -D__BSD_VISIBLE
endif

# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
ifeq ($(UNAME_S),FreeBSD)
	CFLAGS   += -D__BSD_VISIBLE
	CXXFLAGS += -D__BSD_VISIBLE
endif
ifeq ($(UNAME_S),NetBSD)
	CFLAGS   += -D_NETBSD_SOURCE
	CXXFLAGS += -D_NETBSD_SOURCE
endif
ifeq ($(UNAME_S),OpenBSD)
	CFLAGS   += -D_BSD_SOURCE
	CXXFLAGS += -D_BSD_SOURCE
endif

# OS specific
# TODO: support Windows
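The feature-test macros above gate real symbols. As a quick illustration (a hand-written sketch, not part of this diff), the following program only compiles cleanly against a strict libc when something like -D_XOPEN_SOURCE=600 is in effect, since clock_gettime, posix_memalign and M_PI are all hidden behind it:

    // illustrative sketch: build with g++ -std=c++11 -D_XOPEN_SOURCE=600 demo.cpp
    #include <cmath>   // M_PI (XSI extension)
    #include <cstdio>
    #include <cstdlib> // posix_memalign (POSIX.1-2001)
    #include <ctime>   // clock_gettime, CLOCK_MONOTONIC (POSIX.1b / POSIX.1-2001)

    int main() {
        timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);

        void * p = nullptr;
        if (posix_memalign(&p, 64, 1024) != 0) {
            return 1;
        }

        std::printf("t = %lld.%09ld, pi = %f\n", (long long) ts.tv_sec, ts.tv_nsec, M_PI);
        free(p);
        return 0;
    }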
@@ -67,7 +104,7 @@ endif
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
	ifeq ($(UNAME_S),Darwin)
-		CPUINFO_CMD := sysctl machdep.cpu.features
+		CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
	else ifeq ($(UNAME_S),Linux)
		CPUINFO_CMD := cat /proc/cpuinfo
	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
@@ -2,6 +2,7 @@ plugins {
     id 'java'
     id 'java-library'
     id 'maven-publish'
+    id 'signing'
 }

 archivesBaseName = 'whispercpp'

@@ -109,4 +110,23 @@ publishing {
             }
         }
     }
+
+    repositories {
+        maven {
+            def releasesRepoUrl = 'https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/'
+            def snapshotsRepoUrl = 'https://s01.oss.sonatype.org/content/repositories/snapshots/'
+            url = version.endsWith('-SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
+            credentials {
+                username = System.getenv("MAVEN_USERNAME")
+                password = System.getenv("MAVEN_PASSWORD")
+            }
+        }
+    }
 }
+
+signing {
+    def signingKey = System.getenv("PGP_SECRET")
+    def signingPassword = System.getenv("PGP_PASSPHRASE")
+    useInMemoryPgpKeys(signingKey, signingPassword)
+    sign publishing.publications.mavenJava
+}
@@ -6,8 +6,8 @@
 // ref: https://github.com/ggerganov/whisper.cpp/issues/171
 //

-#include "common.h"
 #include "common-sdl.h"
+#include "common.h"
 #include "whisper.h"

 #include <sstream>
@@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
     return true;
 }

-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
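The sam_print_usage change is the standard trick for silencing -Wunused-parameter without touching the signature: keep the parameter, comment out its name. A minimal sketch (not from the diff):

    #include <cstdio>

    // argc stays in the signature (so function-pointer types and callers are
    // unaffected), but without a name the compiler no longer flags it as unused
    void print_usage(int /*argc*/, char ** argv) {
        std::fprintf(stderr, "usage: %s [options]\n", argv[0]);
    }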
@@ -324,12 +324,12 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
     commandset_list.push_back(cs);
     return json{{"index",index}};
 }
-json seek(struct whisper_context * ctx, audio_async &audio, json params) {
+json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
     // whisper_state has the pertinent offsets, but there also seem to be a large
     // number of scratch buffers that would prevent rewinding context in a manner similar to llama
     // I'll give this a another pass once everything else is implemented,
     // but for now, it's unsupported
-    throw json{
+    throw json {
         {"code", -32601},
         {"message", "Seeking is not yet supported."}
     };
@@ -412,7 +412,7 @@ void process_loop(struct whisper_context * ctx, audio_async &audio, const whispe
     jobqueue.pop_front();
     // send response
     std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
-    fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
+    fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
     std::cout.flush();

 }
@@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s

     return speaker;
 }
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
     int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
     int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
     if (progress >= *progress_prev + progress_step) {
@@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
     return true;
 }

-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
     std::ofstream fout(fname);
     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@@ -3,8 +3,8 @@
 // A very quick-n-dirty implementation serving mainly as a proof of concept.
 //

-#include "common.h"
 #include "common-sdl.h"
+#include "common.h"
 #include "whisper.h"

 #include <cassert>
@@ -1,11 +1,3 @@
-// Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#include <cstddef>
-#include <cstdint>
-#include <cstdio>
-#endif
-
 #include "llama-util.h"
 #include "llama.h"

@@ -1164,7 +1156,7 @@ static bool llama_eval_internal(
         const llama_token * tokens,
         const int   n_tokens,
         const int   n_past,
-        const int   n_threads) {
+              int   n_threads) {

     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1190,6 +1182,8 @@ static bool llama_eval_internal(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;

+    const float eps = 5e-6f; // TODO: take from hparams
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;

@@ -1204,7 +1198,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
@@ -1221,7 +1215,7 @@ static bool llama_eval_internal(

         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, eps);

             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
@@ -1329,7 +1323,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, eps);

                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
@@ -1367,7 +1361,7 @@ static bool llama_eval_internal(
     // norm
     {

-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, eps);

         // inpL = inpL*norm(broadcasted)
         inpL = ggml_mul(ctx0, inpL, model.norm);
@@ -1384,8 +1378,8 @@ static bool llama_eval_internal(
     //inpL = ggml_soft_max_inplace(ctx0, inpL);

     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    ggml_build_forward_expand (&gf, inpL);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);

 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -2488,8 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }

         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+        ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);

         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -2635,7 +2628,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -2655,7 +2647,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);

         ggml_free(cpy_ctx);
     }
@@ -2743,7 +2735,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;

         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -2763,7 +2754,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);

         ggml_free(cpy_ctx);
     }
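All of the llama.cpp edits above follow one pattern: the thread count moves off the ggml_cgraph struct (gf.n_threads is gone) and into the compute call itself. A minimal sketch of the new convention, assuming a ggml context and an output tensor already exist:

    #include "ggml.h"

    static void run_graph(struct ggml_context * ctx, struct ggml_tensor * out, int n_threads) {
        struct ggml_cgraph gf = {};

        ggml_build_forward_expand(&gf, out);

        // old API: gf.n_threads = n_threads; ggml_graph_compute(ctx, &gf);
        ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
    }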
@@ -1,8 +1,8 @@
 // Talk with AI
 //

-#include "common.h"
 #include "common-sdl.h"
+#include "common.h"
 #include "whisper.h"
 #include "llama.h"

@@ -649,7 +649,10 @@ int main(int argc, char ** argv) {
                 }

                 text_to_speak = ::replace(text_to_speak, "\"", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                if (ret != 0) {
+                    fprintf(stderr, "%s: failed to speak\n", __func__);
+                }

                 audio.clear();
@@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     // create the ggml context
     {
         struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
         };

         model.ctx = ggml_init(params);
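This file is compiled as C++, and designated initializers like .mem_size = ctx_size are a C99 feature that C++ only standardized in C++20 (some pre-C++20 compilers reject them outright), hence the switch to positional initializers with the field name preserved in a comment. In isolation, with an illustrative type:

    struct point { int x; int y; };

    // point p = { .x = 1, .y = 2 };      // C99: fine; pre-C++20 compilers may reject
    point p = { /*.x =*/ 1, /*.y =*/ 2 }; // positional, field names kept as comments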
@@ -1,8 +1,8 @@
 // Talk with AI
 //

-#include "common.h"
 #include "common-sdl.h"
+#include "common.h"
 #include "whisper.h"
 #include "gpt-2.h"

@@ -349,7 +349,10 @@ int main(int argc, char ** argv) {
             gpt2_set_prompt(ctx_gpt, prompt_base.c_str());

             text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-            system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            if (ret != 0) {
+                fprintf(stderr, "%s: system() failed!\n", __func__);
+            }

             audio.clear();
examples/whisper.android/.idea/compiler.xml (generated): 2 lines changed

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="CompilerConfiguration">
-    <bytecodeTargetLevel target="11" />
+    <bytecodeTargetLevel target="17" />
   </component>
 </project>
examples/whisper.android/.idea/gradle.xml (generated): 4 lines changed

@@ -4,15 +4,15 @@
   <component name="GradleSettings">
     <option name="linkedExternalProjectsSettings">
       <GradleProjectSettings>
         <option name="testRunner" value="GRADLE" />
         <option name="distributionType" value="DEFAULT_WRAPPED" />
         <option name="externalProjectPath" value="$PROJECT_DIR$" />
         <option name="gradleJvm" value="#GRADLE_LOCAL_JAVA_HOME" />
         <option name="modules">
           <set>
             <option value="$PROJECT_DIR$" />
             <option value="$PROJECT_DIR$/app" />
           </set>
         </option>
         <option name="resolveExternalAnnotations" value="false" />
       </GradleProjectSettings>
     </option>
   </component>
examples/whisper.android/.idea/misc.xml (generated): 2 lines changed

@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="ExternalStorageConfigurationManager" enabled="true" />
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="Android Studio default JDK" project-jdk-type="JavaSDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="jbr-17" project-jdk-type="JavaSDK">
     <output url="file://$PROJECT_DIR$/build/classes" />
   </component>
   <component name="ProjectType">
@@ -5,12 +5,12 @@ plugins {

 android {
     namespace 'com.whispercppdemo'
-    compileSdk 33
+    compileSdk 34

     defaultConfig {
         applicationId "com.whispercppdemo"
         minSdk 26
-        targetSdk 32
+        targetSdk 34
         versionCode 1
         versionName "1.0"

@@ -31,19 +31,19 @@ android {
         }
     }
     compileOptions {
-        sourceCompatibility JavaVersion.VERSION_1_8
-        targetCompatibility JavaVersion.VERSION_1_8
+        sourceCompatibility JavaVersion.VERSION_17
+        targetCompatibility JavaVersion.VERSION_17
     }
     kotlinOptions {
-        jvmTarget = '1.8'
+        jvmTarget = '17'
     }
     buildFeatures {
         compose true
     }
     composeOptions {
-        kotlinCompilerExtensionVersion '1.3.1'
+        kotlinCompilerExtensionVersion '1.5.0'
     }
-    ndkVersion "25.1.8937393"
+    ndkVersion "25.2.9519653"
     externalNativeBuild {
         cmake {
             path = file("src/main/jni/whisper/CMakeLists.txt")
@@ -57,19 +57,19 @@ android {
 }

 dependencies {
-    implementation 'androidx.activity:activity-compose:1.6.1'
-    implementation 'androidx.compose.material:material-icons-core:1.3.1'
-    implementation 'androidx.compose.material3:material3:1.0.1'
-    implementation "androidx.compose.ui:ui:1.3.2"
-    implementation "androidx.compose.ui:ui-tooling-preview:1.3.2"
-    implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.5.1'
+    implementation 'androidx.activity:activity-compose:1.7.2'
+    implementation 'androidx.compose.material:material-icons-core:1.5.0'
+    implementation 'androidx.compose.material3:material3:1.1.1'
+    implementation "androidx.compose.ui:ui:1.5.0"
+    implementation "androidx.compose.ui:ui-tooling-preview:1.5.0"
+    implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.6.1'
     implementation "com.google.accompanist:accompanist-permissions:0.28.0"
-    implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.6.4'
+    implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.2'

     testImplementation 'junit:junit:4.13.2'
-    androidTestImplementation 'androidx.test.ext:junit:1.1.4'
-    androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.0'
-    androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.3.2"
-    debugImplementation "androidx.compose.ui:ui-tooling:1.3.2"
-    debugImplementation "androidx.compose.ui:ui-test-manifest:1.3.2"
+    androidTestImplementation 'androidx.test.ext:junit:1.1.5'
+    androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1'
+    androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.5.0"
+    debugImplementation "androidx.compose.ui:ui-tooling:1.5.0"
+    debugImplementation "androidx.compose.ui:ui-test-manifest:1.5.0"
 }
@@ -66,7 +66,7 @@ private fun MainScreen(

 @Composable
 private fun MessageLog(log: String) {
-    SelectionContainer() {
+    SelectionContainer {
         Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
     }
 }
@@ -47,7 +47,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
     }

     private suspend fun printSystemInfo() {
-        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
+        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()))
     }

     private suspend fun loadData() {
@@ -13,7 +13,7 @@ import androidx.compose.runtime.SideEffect
 import androidx.compose.ui.graphics.toArgb
 import androidx.compose.ui.platform.LocalContext
 import androidx.compose.ui.platform.LocalView
-import androidx.core.view.ViewCompat
+import androidx.core.view.WindowCompat

 private val DarkColorScheme = darkColorScheme(
     primary = Purple80,
@@ -55,8 +55,9 @@ fun WhisperCppDemoTheme(
     val view = LocalView.current
     if (!view.isInEditMode) {
         SideEffect {
-            (view.context as Activity).window.statusBarColor = colorScheme.primary.toArgb()
-            ViewCompat.getWindowInsetsController(view)?.isAppearanceLightStatusBars = darkTheme
+            val window = (view.context as Activity).window
+            window.statusBarColor = colorScheme.primary.toArgb()
+            WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
         }
     }
@@ -18,7 +18,9 @@ class WhisperContext private constructor(private var ptr: Long) {

     suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) {
         require(ptr != 0L)
-        WhisperLib.fullTranscribe(ptr, data)
+        val numThreads = WhisperCpuConfig.preferredThreadCount
+        Log.d(LOG_TAG, "Selecting $numThreads threads")
+        WhisperLib.fullTranscribe(ptr, numThreads, data)
         val textCount = WhisperLib.getTextSegmentCount(ptr)
         return@withContext buildString {
             for (i in 0 until textCount) {
@@ -126,7 +128,7 @@ private class WhisperLib {
         external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
         external fun initContext(modelPath: String): Long
         external fun freeContext(contextPtr: Long)
-        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
+        external fun fullTranscribe(contextPtr: Long, numThreads: Int, audioData: FloatArray)
         external fun getTextSegmentCount(contextPtr: Long): Int
         external fun getTextSegment(contextPtr: Long, index: Int): String
         external fun getSystemInfo(): String
@@ -0,0 +1,73 @@ (new file)
package com.whispercppdemo.whisper

import android.util.Log
import java.io.BufferedReader
import java.io.FileReader

object WhisperCpuConfig {
    val preferredThreadCount: Int
        // Always use at least 2 threads:
        get() = CpuInfo.getHighPerfCpuCount().coerceAtLeast(2)
}

private class CpuInfo(private val lines: List<String>) {
    private fun getHighPerfCpuCount(): Int = try {
        getHighPerfCpuCountByFrequencies()
    } catch (e: Exception) {
        Log.d(LOG_TAG, "Couldn't read CPU frequencies", e)
        getHighPerfCpuCountByVariant()
    }

    private fun getHighPerfCpuCountByFrequencies(): Int =
        getCpuValues(property = "processor") { getMaxCpuFrequency(it.toInt()) }
            .also { Log.d(LOG_TAG, "Binned cpu frequencies (frequency, count): ${it.binnedValues()}") }
            .countDroppingMin()

    private fun getHighPerfCpuCountByVariant(): Int =
        getCpuValues(property = "CPU variant") { it.substringAfter("0x").toInt(radix = 16) }
            .also { Log.d(LOG_TAG, "Binned cpu variants (variant, count): ${it.binnedValues()}") }
            .countKeepingMin()

    private fun List<Int>.binnedValues() = groupingBy { it }.eachCount()

    private fun getCpuValues(property: String, mapper: (String) -> Int) = lines
        .asSequence()
        .filter { it.startsWith(property) }
        .map { mapper(it.substringAfter(':').trim()) }
        .sorted()
        .toList()

    private fun List<Int>.countDroppingMin(): Int {
        val min = min()
        return count { it > min }
    }

    private fun List<Int>.countKeepingMin(): Int {
        val min = min()
        return count { it == min }
    }

    companion object {
        private const val LOG_TAG = "WhisperCpuConfig"

        fun getHighPerfCpuCount(): Int = try {
            readCpuInfo().getHighPerfCpuCount()
        } catch (e: Exception) {
            Log.d(LOG_TAG, "Couldn't read CPU info", e)
            // Our best guess -- just return the # of CPUs minus 4.
            (Runtime.getRuntime().availableProcessors() - 4).coerceAtLeast(0)
        }

        private fun readCpuInfo() = CpuInfo(
            BufferedReader(FileReader("/proc/cpuinfo"))
                .useLines { it.toList() }
        )

        private fun getMaxCpuFrequency(cpuIndex: Int): Int {
            val path = "/sys/devices/system/cpu/cpu${cpuIndex}/cpufreq/cpuinfo_max_freq"
            val maxFreq = BufferedReader(FileReader(path)).use { it.readLine() }
            return maxFreq.toInt()
        }
    }
}
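A rough C++ rendering of the heuristic this new Kotlin file implements (a sketch under the same assumptions, not code from the repo): bin the cores by cpuinfo_max_freq and treat everything above the slowest bin as a high-performance core.

    #include <algorithm>
    #include <fstream>
    #include <string>
    #include <vector>

    // count "high-perf" cores by reading each core's max frequency and
    // dropping the slowest bin (assumed to be the efficiency cores)
    static int high_perf_cpu_count(int n_cpus) {
        std::vector<long> freqs;
        for (int i = 0; i < n_cpus; i++) {
            std::ifstream f("/sys/devices/system/cpu/cpu" + std::to_string(i) +
                            "/cpufreq/cpuinfo_max_freq");
            long khz = 0;
            if (f >> khz) {
                freqs.push_back(khz);
            }
        }
        if (freqs.empty()) {
            return std::max(n_cpus - 4, 0); // fallback, mirroring the Kotlin code
        }
        const long min_khz = *std::min_element(freqs.begin(), freqs.end());
        const int  count   = (int) std::count_if(freqs.begin(), freqs.end(),
                                                 [min_khz](long f) { return f > min_khz; });
        return std::max(count, 2); // always use at least 2 threads
    }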
@@ -163,16 +163,12 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(

 JNIEXPORT void JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
-        JNIEnv *env, jobject thiz, jlong context_ptr, jfloatArray audio_data) {
+        JNIEnv *env, jobject thiz, jlong context_ptr, jint num_threads, jfloatArray audio_data) {
     UNUSED(thiz);
     struct whisper_context *context = (struct whisper_context *) context_ptr;
     jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL);
     const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data);

-    // Leave 2 processors free (i.e. the high-efficiency cores).
-    int max_threads = max(1, min(8, get_nprocs() - 2));
-    LOGI("Selecting %d threads", max_threads);
-
     // The below adapted from the Objective-C iOS sample
     struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
     params.print_realtime = true;
@@ -181,7 +177,7 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
     params.print_special = false;
     params.translate = false;
     params.language = "en";
-    params.n_threads = max_threads;
+    params.n_threads = num_threads;
     params.offset_ms = 0;
     params.no_context = true;
     params.single_segment = false;
@@ -1,10 +0,0 @@ (file deleted)
<?xml version="1.0" encoding="utf-8"?>
<resources>
    <color name="purple_200">#FFBB86FC</color>
    <color name="purple_500">#FF6200EE</color>
    <color name="purple_700">#FF3700B3</color>
    <color name="teal_200">#FF03DAC5</color>
    <color name="teal_700">#FF018786</color>
    <color name="black">#FF000000</color>
    <color name="white">#FFFFFFFF</color>
</resources>
@@ -1,6 +1,6 @@
 // Top-level build file where you can add configuration options common to all sub-projects/modules.
 plugins {
-    id 'com.android.application' version '7.3.1' apply false
-    id 'com.android.library' version '7.3.1' apply false
-    id 'org.jetbrains.kotlin.android' version '1.7.10' apply false
+    id 'com.android.application' version '8.1.1' apply false
+    id 'com.android.library' version '8.1.1' apply false
+    id 'org.jetbrains.kotlin.android' version '1.9.0' apply false
 }
@@ -1,6 +1,6 @@
 #Wed Dec 14 10:37:24 EST 2022
 distributionBase=GRADLE_USER_HOME
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
 distributionPath=wrapper/dists
 zipStorePath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
ggml-cuda.cu: 32 lines changed
@@ -4086,7 +4086,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
     dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
+                                    const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;

@@ -4098,8 +4099,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     const int i = row*ncols + col;

     const float col_theta_scale = powf(theta_scale, col);
+    const float p = p0 + p_delta*(row/p_delta_rows);

-    const float theta = p*col_theta_scale;
+    const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
     const float sin_theta = sinf(theta);
     const float cos_theta = cosf(theta);

@@ -4109,7 +4111,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
     dst[i + 0]           = x0*cos_theta - x1*sin_theta;
     dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;

-    const float block_theta = block_p*col_theta_scale;
+    const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
     const float sin_block_theta = sinf(block_theta);
     const float cos_block_theta = cosf(block_theta);

@@ -4984,12 +4986,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
     rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
 }

-static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
-    GGML_ASSERT(nrows % 4 == 0);
-    const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
+                              const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
     const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5723,22 +5726,18 @@ inline void ggml_cuda_op_rope(
     memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

     const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;

     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;

     // compute
     if (is_glm) {
-        const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
-        const float id_p = min(p, n_ctx - 2.f);
-        const float block_p = max(p - (n_ctx - 2.f), 0.f);
-        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+        rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, n_ctx, cudaStream_main);
     } else if (is_neox) {
         GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
-        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     } else {
-        const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
     }

@@ -6400,10 +6399,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
     GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented

-    const int mode = ((int32_t *) dst->op_params)[2];
-    const bool is_glm = mode & 4;
-
-    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
+    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, true);
 }

 void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
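For orientation, the rotation these rope kernels apply, written as scalar host code (a sketch, not part of the diff): each even/odd pair of values is rotated by an angle that shrinks geometrically with the dimension index.

    #include <cmath>

    // reference (scalar) form of the RoPE rotation computed on the GPU above:
    // for each even dimension i0, rotate the pair (x[i0], x[i0+1]) by
    // theta = p * theta_scale^(i0/2), where p is the token position
    static void rope_ref(float * x, int n_dims, float p, float freq_base) {
        const float theta_scale = powf(freq_base, -2.0f / n_dims);
        float theta = p;
        for (int i0 = 0; i0 < n_dims; i0 += 2) {
            const float c  = cosf(theta);
            const float s  = sinf(theta);
            const float x0 = x[i0];
            const float x1 = x[i0 + 1];
            x[i0]     = x0 * c - x1 * s;
            x[i0 + 1] = x0 * s + x1 * c;
            theta *= theta_scale;
        }
    }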
@@ -327,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {

 void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
-    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         metal_printf("%s: error: posix_memalign failed\n", __func__);
         return NULL;
@@ -401,7 +401,7 @@ bool ggml_metal_add_buffer(
         }
     }

-    const size_t size_page = getpagesize();
+    const size_t size_page = sysconf(_SC_PAGESIZE);

     size_t size_aligned = size;
     if ((size_aligned % size_page) != 0) {
@@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute(
     [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
     [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];

-    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
 } break;
 case GGML_OP_DUP:
 case GGML_OP_CPY:
@@ -220,14 +220,10 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float mean  = sum[0];
+    const float mean  = sum[0] / ne00;

     // recenter and VARIANCE
     threadgroup_barrier(mem_flags::mem_threadgroup);
     device float * y = dst + tgpig*ne00;
     sum[tpitg] = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -235,12 +231,6 @@ kernel void kernel_norm(
         sum[tpitg] += y[i00] * y[i00];
     }

-    //// VARIANCE
-    //// parallel sum
-    //sum[tpitg] = 0.0f;
-    //for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-    //    sum[tpitg] += y[i00] * y[i00];
-    //}
     // reduce
     threadgroup_barrier(mem_flags::mem_threadgroup);
     for (uint i = ntg/2; i > 0; i /= 2) {
@@ -249,12 +239,7 @@ kernel void kernel_norm(
         }
         threadgroup_barrier(mem_flags::mem_threadgroup);
     }
-    //// broadcast
-    //if (tpitg == 0) {
-    //    sum[0] /= ne00;
-    //}
-    //threadgroup_barrier(mem_flags::mem_threadgroup);
-    const float variance = sum[0];
+    const float variance = sum[0] / ne00;

     const float scale = 1.0f/sqrt(variance + eps);
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@@ -262,7 +247,6 @@ kernel void kernel_norm(
     }
 }

 kernel void kernel_rms_norm(
     device const void * src0,
     device float * dst,
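The kernel_norm cleanup removes the single-thread broadcast step by folding the division by ne00 into the mean and variance reads. In scalar form the kernel computes (illustrative sketch):

    #include <cmath>

    // scalar equivalent of kernel_norm after this change: mean and variance
    // are divided by ne00 at the point of use rather than broadcast separately
    static void layer_norm_ref(const float * x, float * y, int ne00, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < ne00; i++) sum += x[i];
        const float mean = sum / ne00;

        float sum2 = 0.0f;
        for (int i = 0; i < ne00; i++) {
            y[i] = x[i] - mean;   // recenter
            sum2 += y[i] * y[i];
        }
        const float variance = sum2 / ne00;

        const float scale = 1.0f / sqrtf(variance + eps);
        for (int i = 0; i < ne00; i++) y[i] *= scale;
    }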
@@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
             }
         }
     }

 }

 kernel void kernel_alibi_f32(
@@ -699,25 +682,27 @@ kernel void kernel_rope(
     constant       int & mode,
     constant     float & freq_base,
     constant     float & freq_scale,
-    uint3 tpig[[thread_position_in_grid]]) {
-    const int64_t i3 = tpig[2];
-    const int64_t i2 = tpig[1];
-    const int64_t i1 = tpig[0];
+    uint  tiitg[[thread_index_in_threadgroup]],
+    uint3 tptg[[threads_per_threadgroup]],
+    uint3 tgpig[[threadgroup_position_in_grid]]) {
+    const int64_t i3 = tgpig[2];
+    const int64_t i2 = tgpig[1];
+    const int64_t i1 = tgpig[0];

     const bool is_neox = mode & 2;
-    const float theta_scale = pow(freq_base, -2.0f/n_dims);

     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);

-    float theta = freq_scale * (float)p;
+    const float theta_0 = freq_scale * (float)p;
+    const float inv_ndims = -1.f/n_dims;

     if (!is_neox) {
-        for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+
+            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
             const float cos_theta = cos(theta);
             const float sin_theta = sin(theta);

-            theta *= theta_scale;
-
             device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);

@@ -729,12 +714,12 @@ kernel void kernel_rope(
         }
     } else {
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
+
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
                 const float cos_theta = cos(theta);
                 const float sin_theta = sin(theta);

-                theta *= theta_scale;
-
                 const int64_t i0 = ib*n_dims + ic/2;

                 device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml.c: 47 lines changed
@@ -1,4 +1,3 @@
-#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows

 #include "ggml.h"
@@ -107,6 +106,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>

 #endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif

 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -196,9 +198,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -219,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr)    free(ptr)
+#endif
 #endif

 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@@ -4572,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }

+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

     *ctx = (struct ggml_context) {
@@ -4774,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(

     size_t obj_alloc_size = 0;

-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5475,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5518,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
     }

     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }

     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -17266,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
-            do {
-                //sched_yield();
+            while (true) {
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+                //       depending on the workload and the operating system.
+                //       since it is not clear what is the best approach, it should potentially become user-configurable
+                //       ref: https://github.com/ggerganov/ggml/issues/291
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                sched_yield();
+#endif

                 node_n = atomic_load(&state->shared->node_n);
-            } while (node_n == last);
+                if (node_n != last) break;
+            };
         }

         // check if we should stop
@@ -19962,7 +19987,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

     struct ggml_tensor * data = NULL;

-    if (params.no_alloc == false) {
+    if (!params.no_alloc) {
         data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

         ok = ok && data != NULL;
@@ -20003,7 +20028,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     }

     // point the data member to the appropriate location in the binary blob using the tensor infos
-    if (params.no_alloc == false) {
+    if (!params.no_alloc) {
         //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
         cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
     }
@@ -22,7 +22,28 @@ function get_script_path() {
 models_path="$(get_script_path)"

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small.en-tdrz" "small" "medium.en" "medium" "large-v1" "large" )
+models=(
+    "tiny.en"
+    "tiny"
+    "tiny-q5_1"
+    "tiny.en-q5_1"
+    "base.en"
+    "base"
+    "base-q5_1"
+    "base.en-q5_1"
+    "small.en"
+    "small.en-tdrz"
+    "small"
+    "small-q5_1"
+    "small.en-q5_1"
+    "medium"
+    "medium.en"
+    "medium-q5_0"
+    "medium.en-q5_0"
+    "large-v1"
+    "large"
+    "large-q5_0"
+)

 # list available models
 function list_models {
whisper.cpp: 221 lines changed
@@ -18,6 +18,7 @@
 #include <cstring>
 #include <fstream>
 #include <map>
+#include <set>
 #include <string>
 #include <thread>
 #include <vector>
@@ -117,6 +118,21 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_USE_SCRATCH
 #define WHISPER_MAX_SCRATCH_BUFFERS 16

+//
+// ggml helpers
+//
+
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 // available whisper models
 enum e_model {
     MODEL_UNKNOWN,
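ggml_graph_compute_helper is the plan-based replacement for ggml_graph_compute_with_ctx: ggml_graph_plan reports how much scratch memory the graph needs, the caller points plan.work_data at its own reusable buffer, and no allocation happens inside ggml. A usage sketch (the output tensor is assumed to exist):

    #include <vector>
    #include "ggml.h"

    // one reusable work buffer (wstate.buf_work in the real code) serves every
    // graph evaluation, so repeated calls do not reallocate
    static void eval(std::vector<uint8_t> & work, struct ggml_tensor * out, int n_threads) {
        struct ggml_cgraph gf = {};
        ggml_build_forward_expand(&gf, out);
        ggml_graph_compute_helper(work, &gf, n_threads); // defined in the hunk above
    }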
@@ -537,6 +553,7 @@ struct whisper_kv_cache {

     struct ggml_context * ctx;

+    // buf points to the memory allocated for both ggml_tensor 'k' and 'v' (see kv_cache_init)
     std::vector<uint8_t> buf;

     int n; // number of tokens currently in the cache
@@ -602,7 +619,7 @@ struct whisper_sequence {

 // TAGS: WHISPER_DECODER_INIT
 struct whisper_decoder {
-    // each decoders keeps its own KV-cache
+    // each decoder keeps its own KV-cache
     whisper_kv_cache kv_self;

     // the currently generated sequence of tokens
@@ -622,6 +639,24 @@ struct whisper_decoder {
     std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
 };

+// replace std::pair by using customized pair struct (reason: std::pair is very slow)
+template<typename A, typename B>
+struct whisper_pair {
+    A first;
+    B second;
+
+    // Define a constructor that takes two arguments.
+    whisper_pair(const A& a, const B& b) : first(a), second(b) {}
+    // Define a constructor that takes no argument.
+    whisper_pair() : first(A()), second(B()) {}
+};
+
+// beam-search helpers
+struct kv_buf {
+    std::vector<uint8_t> k;
+    std::vector<uint8_t> v;
+};
+
 struct whisper_state {
     int64_t t_sample_us = 0;
     int64_t t_encode_us = 0;
@@ -641,8 +676,12 @@ struct whisper_state {

     whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};

+    // buffer for swapping KV caches between decoders during beam-search
+    std::vector<kv_buf> kv_swap_bufs;
+
     // memory buffers used by encode / decode contexts
     std::vector<uint8_t> buf_compute;
+    std::vector<uint8_t> buf_work;
     std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];

     int buf_last = 0;
@@ -655,7 +694,7 @@ struct whisper_state {
     std::vector<whisper_token> prompt_past;

     // work container used to avoid memory allocations
-    std::vector<std::pair<double, whisper_vocab::id>> logits_id;
+    std::vector<whisper_pair<double, whisper_vocab::id>> logits_id;

     mutable std::mt19937 rng; // used for sampling at t > 0.0

@@ -1807,8 +1846,8 @@ static bool whisper_encode_internal(
     {
         struct ggml_cgraph gf = {};

-        ggml_build_forward_expand (&gf, cur);
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_build_forward_expand(&gf, cur);
+        ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);

         //ggml_graph_print(&gf);
     }
@@ -1893,7 +1932,7 @@ static bool whisper_encode_internal(
         ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
     }

-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
     //ggml_graph_print(&gf);
 }

@@ -2306,8 +2345,8 @@ static bool whisper_decode_internal(

     // run the computation
     {
-        ggml_build_forward_expand (&gf, logits);
-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_build_forward_expand(&gf, logits);
+        ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
     }

     // extract logits for all N tokens
@@ -3975,17 +4014,21 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(

     auto & logits_id = state.logits_id;

-    logits_id.clear();
+    logits_id.resize(n_logits);
     for (int i = 0; i < n_logits; ++i) {
-        logits_id.push_back({ logits[i], i });
+        logits_id[i].first = logits[i];
+        logits_id[i].second = i;
     }

-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + k, logits_id.end(),
-            [](const std::pair<double, whisper_token> & a, const std::pair<double, whisper_token> & b) {
-                return a.first > b.first;
-            });
+    {
+        using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
+        std::partial_sort(
+                logits_id.begin(),
+                logits_id.begin() + k, logits_id.end(),
+                [](const pair_type & a, const pair_type & b) {
+                    return a.first > b.first;
+                });
+    }

     std::vector<whisper_token_data> result;
     result.reserve(k);
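Two micro-optimizations in one hunk: resize plus indexed writes avoid push_back bookkeeping, and std::partial_sort orders only the first k elements (roughly O(n log k)) since the sampler never looks past the top-k. The sorting idiom in isolation (whisper_pair stood in by a plain struct):

    #include <algorithm>
    #include <vector>

    struct scored { double first; int second; }; // stand-in for whisper_pair<double, int>

    // order just the top-k entries by score; the tail past begin()+k is left
    // in unspecified order, which is all the sampler needs
    static void top_k(std::vector<scored> & v, int k) {
        std::partial_sort(v.begin(), v.begin() + k, v.end(),
                          [](const scored & a, const scored & b) { return a.first > b.first; });
    }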
@@ -4080,6 +4123,115 @@ static void whisper_sequence_score(
     }
 }

+static bool whisper_kv_swap_fast(
+        std::vector<int>    & view,
+        whisper_decoder       src[],
+        std::vector<kv_buf> & kv_swap_bufs,
+        const int           & n_decoders) {
+    WHISPER_PRINT_DEBUG("%s: n_decoders %d\n", __func__, n_decoders);
+
+    // (decoder->buffer->decoder or decoder->buffer + decoder->decoder)
+    std::set<int> two_copy; // decoder indices require two copies to safely modify KV caches
+
+    // (buffer->decoder or decoder->decoder)
+    std::set<int> one_copy; // decoder indices require one copy to safely modify KV caches
+
+    // (decoder<->decoder)
+    std::set<int> p_swap_set; // decoder indices able to swap KV-cache pointers
+    std::vector<whisper_pair<int, int>> p_swap_vec;
+    p_swap_vec.reserve(n_decoders);
+
+    // see https://github.com/ggerganov/whisper.cpp/wiki
+    for (int i = 0; i < n_decoders; i++) {
+        // zero-copy (no modification)
+        if (i == view[i] || view[i] < 0) {
+            continue;
+        }
+
+        bool is_one_copy = true;
+        // since we modify data sequentially, we only consider decoder indices after current index
+        for (int j = i + 1; j < n_decoders; j++) {
+            if (i == view[j]) {
+                // detect symmetric diagram
+                if (j == view[i]) {
+                    p_swap_set.insert(i);
+                    p_swap_set.insert(j);
+                    p_swap_vec.emplace_back(i, j);
+                } else {
+                    two_copy.insert(i);
+                    is_one_copy = false;
+                }
+                break;
+            }
+        }
+        if (is_one_copy) {
+            one_copy.insert(i);
+        }
+    }
+
+    kv_swap_bufs.resize(n_decoders);
+
+    for (int i = 0; i < n_decoders; i++) {
+        kv_swap_bufs[i].k.resize(ggml_nbytes(src[i].kv_self.k));
+        kv_swap_bufs[i].v.resize(ggml_nbytes(src[i].kv_self.v));
+    }
+
+    for (auto & i : two_copy) {
+        // make a copy of KV caches
+        WHISPER_PRINT_DEBUG("%s: store KV cache into swap: idx %d\n", __func__, i);
+        memcpy(kv_swap_bufs[i].k.data(), src[i].kv_self.k->data, kv_swap_bufs[i].k.size());
+        memcpy(kv_swap_bufs[i].v.data(), src[i].kv_self.v->data, kv_swap_bufs[i].v.size());
+    }
+
+    // since two-copy decoder KV caches are protected by kv_swap_bufs, modify them first
+    for (auto & i : two_copy) {
+        // skip the decoder indices that require pointer swapping
+        if (p_swap_set.find(i) != p_swap_set.end()) {
+            continue;
+        }
+
+        if (two_copy.find(view[i]) != two_copy.end()) {
+            // modify KV caches of decoder using data from kv_swap_bufs
+            WHISPER_PRINT_DEBUG("%s: two-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
+            memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
+        } else {
+            // modify KV caches of decoder using data from correspond decoder KV caches directly
+            WHISPER_PRINT_DEBUG("%s: two-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
+            memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
+        }
+    }
+
+    // then modify one-copy decoder KV caches
+    for (auto & i : one_copy) {
+        // skip the decoder indices that require pointer swapping
+        if (p_swap_set.find(i) != p_swap_set.end()) {
+            continue;
+        }
+
+        if (two_copy.find(view[i]) != two_copy.end()) {
+            // modify KV caches of decoder using data from kv_swap_bufs
+            WHISPER_PRINT_DEBUG("%s: one-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
+            memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
+        } else {
+            // modify KV caches of decoder using data from correspond decoder KV caches directly
+            WHISPER_PRINT_DEBUG("%s: one-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i);
+            memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
+            memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
+        }
+    }
+
+    // swap the pointers
+    for (auto & i : p_swap_vec) {
+        WHISPER_PRINT_DEBUG("%s: swap pointers: %d <-> %d\n", __func__, i.first, i.second);
+        std::swap(src[i.first].kv_self, src[i.second].kv_self);
+    }
+
+    return true;
+}
+
 int whisper_full_with_state(
         struct whisper_context * ctx,
         struct whisper_state * state,
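whisper_kv_swap_fast reads view[] as "decoder i takes its KV cache from decoder view[i]" and classifies each index: identity or negative entries cost nothing, symmetric 2-cycles degrade to pointer swaps, and only the rest pay for one or two memcpys. A toy rendering of the idea, with ints in place of KV buffers (hypothetical, heavily simplified):

    #include <set>
    #include <utility>
    #include <vector>

    // apply the mapping buf[i] = old buf[view[i]]; 2-cycles become swaps,
    // everything else reads from a saved snapshot (the role of kv_swap_bufs)
    static void apply_view(std::vector<int> & buf, const std::vector<int> & view) {
        const int n = (int) buf.size();
        std::vector<int> saved(buf);
        std::set<int> swapped;

        for (int i = 0; i < n; i++) {
            if (view[i] < 0 || view[i] == i) continue;         // zero-copy
            if (view[view[i]] == i && swapped.count(i) == 0) { // symmetric 2-cycle
                std::swap(buf[i], buf[view[i]]);               // pointer swap, no copy
                swapped.insert(i);
                swapped.insert(view[i]);
            }
        }
        for (int i = 0; i < n; i++) {
            if (view[i] < 0 || view[i] == i || swapped.count(i)) continue;
            buf[i] = saved[view[i]];                           // copy from the snapshot
        }
    }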
@@ -4243,14 +4395,6 @@ int whisper_full_with_state(
     std::vector<whisper_token> prompt;
     prompt.reserve(whisper_n_text_ctx(ctx));

-    // beam-search helpers
-    struct kv_buf {
-        std::vector<uint8_t> k;
-        std::vector<uint8_t> v;
-    };
-
-    std::vector<kv_buf> kv_bufs;
-
     struct beam_candidate {
         int decoder_idx;
         int seek_delta;
@@ -4399,23 +4543,7 @@ int whisper_full_with_state(
         for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
             const int64_t t_start_sample_us = ggml_time_us();

-            // store the KV caches of all decoders when doing beam-search
             if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
-                kv_bufs.resize(n_decoders_cur);
-                for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = state->decoders[j];
-
-                    if (decoder.completed || decoder.failed) {
-                        continue;
-                    }
-
-                    kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
-                    kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
-
-                    memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
-                    memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
-                }
-
                 beam_candidates.clear();
             }

@@ -4463,6 +4591,7 @@ int whisper_full_with_state(
             });

             uint32_t cur_c = 0;
+            std::vector<int> decoder_idx(n_decoders_cur, -1);

             for (int j = 0; j < n_decoders_cur; ++j) {
                 auto & decoder = state->decoders[j];
@@ -4481,12 +4610,13 @@ int whisper_full_with_state(
                 decoder.seek_delta = cur.seek_delta;
                 decoder.has_ts     = cur.has_ts;

-                memcpy(decoder.kv_self.k->data, kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size());
-                memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size());
-
+                decoder_idx[j] = cur.decoder_idx;
                 WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
                         __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
             }

+            // update KV caches
+            whisper_kv_swap_fast(decoder_idx, state->decoders, state->kv_swap_bufs, n_decoders_cur);
         }

         // update the decoder state
@@ -5111,7 +5241,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     // b: N*N*sizeof(float)
     // c: N*N*sizeof(float)
     // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
+    std::vector<uint8_t> buf (3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
+    std::vector<uint8_t> work(1llu*N_max*N_max*sizeof(float) + 1*ggml_tensor_overhead());

     // put a bunch of random data in the buffer
     for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@@ -5166,12 +5297,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
     double tsum = 0.0;

     // heat-up
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_compute_helper(work, &gf, n_threads);

     for (int i = 0; i < n_max; ++i) {
         const int64_t t0 = ggml_time_us();

-        ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+        ggml_graph_compute_helper(work, &gf, n_threads);

         const int64_t t1 = ggml_time_us();