Compare commits

...

19 Commits

SHA1 Message Date
09a6325de5 ggml : use sched_yield when using BLAS + add comment 2023-09-12 13:33:09 +03:00
39c4fc59dd whisper : fix bench regression 2023-09-12 11:21:02 +03:00
9b14418863 whisper : faster beam_search sampling via reduced KV cache copies (#1243)
* Faster `beam_search` sampling

Refine the KV cache update logic so that caches are copied or swapped only when necessary.

* Faster `whisper_sample_token_topk`

* Update whisper.cpp

* Update whisper.cpp

* Update whisper.cpp

* Reduce `memory allocation`

* Add `pointer swapping`

* Fixed some bugs

* Update whisper.cpp

* Apply suggestions from code review

* Updated the logic for determining `two-copy`

* Updated the logic for determining `two-copy` v2

* whisper : add debug logs + coding style

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-09-10 16:04:27 +03:00
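
A minimal sketch of the pointer-swapping idea (hypothetical types and names, not this PR's code; the real implementation is `whisper_kv_swap_fast` in the whisper.cpp diff below). When beam search makes decoder i adopt decoder j's KV cache and vice versa, the two caches can trade buffers in O(1) instead of round-tripping through a staging copy:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in for a decoder's KV cache.
struct kv_cache {
    std::vector<uint8_t> k;
    std::vector<uint8_t> v;
};

// view[i] = index of the decoder whose cache decoder i must adopt; -1 means "keep own".
void adopt_kv_caches(std::vector<kv_cache> & caches, const std::vector<int> & view) {
    for (int i = 0; i < (int) caches.size(); ++i) {
        const int j = view[i];
        if (j <= i) continue;         // self, "keep", or a pair already handled at index j
        if (view[j] == i) {           // symmetric exchange: i wants j's cache, j wants i's
            std::swap(caches[i].k, caches[j].k);  // vector swap trades heap buffers, no copy
            std::swap(caches[i].v, caches[j].v);
        }
        // asymmetric reassignments still need one or two real copies,
        // which is what the two_copy/one_copy bookkeeping in the PR is for
    }
}
```
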
6ddc727fac java : fixed signing of java artifact using gradle (#1267)
* --stacktrace signMavenJavaPublication

* added temporary step "Debug gradle signing"

* cd bindings/java

* use GPG_PRIVATE_KEY and GPG_PASSPHRASE

* use secrets.GPG_PRIVATE_KEY and GPG_PASSPHRASE
2023-09-09 18:55:51 +03:00
acb5278cc8 ci : try to fix gradle action (#1265) 2023-09-08 20:50:15 +03:00
0839209cab gitignore : update 2023-09-08 19:45:28 +03:00
b39809668a sync : ggml (HBM + Metal + style) (#1264) 2023-09-08 17:58:31 +03:00
3e9edc6845 ci : upgrade gradle-build-action to v2.4.2 (#1263)
* ci : upgrade gradle-build-action to v2.4.2

* cmake : add comment (#1129)
2023-09-08 17:58:14 +03:00
bfc73f1fa2 sync : ggml (CUDA faster rope) 2023-09-08 15:01:26 +03:00
f00c9bba33 cmake : normalize case (#1129) 2023-09-08 14:50:03 +03:00
b55b505690 build : do not use _GNU_SOURCE gratuitously (#1129)
* Do not use _GNU_SOURCE gratuitously.

Building whisper.cpp and the examples requires only what is defined in
The Open Group Base Specifications Issue 6
(https://pubs.opengroup.org/onlinepubs/009695399/), also known as the
Single Unix Specification v3 (SUSv3) or POSIX.1-2001 + XSI extensions,
plus a few BSD interfaces that POSIX.1 does not specify.

That held true until NUMA support was recently added to ggml, so GNU libc
extensions are now enabled for Linux builds to cover it.

There is no need to penalize musl libc, which simply follows the standards.

Keeping feature test macros out of the source code gives greater flexibility
to anyone reusing it in a third-party app: they can build it with a minimal
FTM (_XOPEN_SOURCE=600) or another FTM, depending on their needs.

It builds without issues on Alpine (musl libc), Ubuntu (glibc), and MSYS2.

* examples : include SDL headers before other headers

Avoids a macOS build error when _DARWIN_C_SOURCE is not defined, caused by
SDL2 relying on the Darwin extensions memset_pattern4/8/16 (from string.h).

* make : enable BSD extensions for DragonFlyBSD to expose RLIMIT_MEMLOCK

* make : use BSD-specific FTMs to enable alloca on BSDs

* make : fix OpenBSD build by exposing newer POSIX definitions

* cmake : follow recent FTM improvements from Makefile
2023-09-07 12:36:14 +03:00
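
As a standalone illustration (not from the repository): a translation unit that sticks to SUSv3 interfaces needs only _XOPEN_SOURCE=600, defined before the first system header:

```cpp
// Build: c++ -std=c++11 ftm_demo.cpp  (the macro below replaces -D_XOPEN_SOURCE=600)
#define _XOPEN_SOURCE 600  // POSIX.1-2001 + XSI: clock_gettime, posix_memalign, M_PI

#include <math.h>    // M_PI (XSI extension)
#include <stdio.h>
#include <stdlib.h>  // posix_memalign, free
#include <time.h>    // clock_gettime, CLOCK_MONOTONIC

int main() {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) return 1;  // POSIX.1b (1993)

    void * buf = NULL;
    if (posix_memalign(&buf, 64, 1024) != 0) return 1;       // POSIX.1-2001

    printf("t = %lld.%09ld s, pi = %f\n", (long long) ts.tv_sec, ts.tv_nsec, M_PI);
    free(buf);
    return 0;
}
```
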
2818de21ff examples : fix build + compile warnings (close #1256) 2023-09-07 12:33:12 +03:00
aed5d40607 models : add quantized models to download-ggml-model.sh (#1235)
* Add quantized models to download-ggml-model.sh

* Normalize model names in the download-ggml-model script
2023-09-07 12:16:58 +03:00
afa5477d1c whisper.android : bump gradle plugin and dependencies + a lint pass (#1255) 2023-09-07 12:15:59 +03:00
01fcd42431 sign jar for Maven Central repo 2023-09-07 11:45:44 +10:00
f990610776 whisper.android : address ARM's big.LITTLE arch by checking cpu info (#1254)
Addresses https://github.com/ggerganov/whisper.cpp/issues/1248
2023-09-06 18:32:30 +03:00
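
A rough native-side sketch of the same heuristic (hypothetical helper, assuming Linux's sysfs cpufreq layout; the shipped implementation is the Kotlin WhisperCpuConfig further down): count the cores whose maximum frequency sits above the slowest bin instead of using every processor:

```cpp
#include <algorithm>
#include <fstream>
#include <string>
#include <thread>
#include <vector>

// Count "high-performance" cores on a big.LITTLE SoC: cores whose reported
// cpuinfo_max_freq is above the minimum bin (the LITTLE cluster).
static int high_perf_core_count() {
    std::vector<long> freqs;
    const unsigned n = std::thread::hardware_concurrency();
    for (unsigned i = 0; i < n; ++i) {
        std::ifstream f("/sys/devices/system/cpu/cpu" + std::to_string(i) +
                        "/cpufreq/cpuinfo_max_freq");
        long khz = 0;
        if (f >> khz) freqs.push_back(khz);
    }
    if (freqs.empty()) return n > 0 ? (int) n : 1;  // sysfs unavailable: use all cores
    const long min_khz = *std::min_element(freqs.begin(), freqs.end());
    const int fast = (int) std::count_if(freqs.begin(), freqs.end(),
                                         [min_khz](long f) { return f > min_khz; });
    return std::max(fast, 2);  // always use at least 2 threads
}
```
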
64cb45fd79 make : fix detection of AVX2 on macOS (#1250) 2023-09-06 18:22:21 +03:00
ace6c12ec6 ggml : posixify pagesize (#1251)
* ggml : use sysconf(_SC_PAGESIZE) instead of getpagesize() derived from BSD

sed -i 's,getpagesize(),sysconf(_SC_PAGESIZE),g' ggml.c

* metal : use sysconf(_SC_PAGESIZE) instead of getpagesize() derived from BSD

sed -i 's,getpagesize(),sysconf(_SC_PAGESIZE),g' ggml-metal.m
2023-09-06 18:19:36 +03:00
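
For reference, a standalone snippet (not from the diff) showing the POSIX call that replaces the BSD-derived getpagesize():

```cpp
#include <stdio.h>
#include <stdlib.h>  // posix_memalign, free
#include <unistd.h>  // sysconf, _SC_PAGESIZE

int main() {
    const long page = sysconf(_SC_PAGESIZE);  // POSIX; getpagesize() is a legacy BSD API
    void * data = NULL;
    if (posix_memalign(&data, (size_t) page, 16 * (size_t) page) != 0) return 1;
    printf("page size: %ld bytes\n", page);
    free(data);
    return 0;
}
```
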
cac75be05b configured publishing.repositories 2023-09-06 13:13:36 +10:00
33 changed files with 526 additions and 204 deletions

View File

@ -428,15 +428,15 @@ jobs:
- name: Publish package
if: ${{ github.ref == 'refs/heads/master' }}
uses: gradle/gradle-build-action@v2
uses: gradle/gradle-build-action@v2.4.2
with:
arguments: publish
build-root-directory: bindings/java
env:
MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
# MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
# MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
quantize:
runs-on: ubuntu-latest

.gitignore
View File

@ -11,6 +11,7 @@ build/
build-em/
build-debug/
build-release/
build-rwdi/
build-static/
build-cublas/
build-no-accel/

View File

@ -321,6 +321,53 @@ else()
endif()
endif()
#
# POSIX conformance
#
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
add_compile_definitions(_XOPEN_SOURCE=600)
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
remove_definitions(-D_XOPEN_SOURCE=600)
add_compile_definitions(_XOPEN_SOURCE=700)
endif()
# Data types, macros and functions related to controlling CPU affinity
# are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE)
endif()
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
add_compile_definitions(_DARWIN_C_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
add_compile_definitions(_DARWIN_C_SOURCE)
endif()
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
add_compile_definitions(__BSD_VISIBLE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
add_compile_definitions(_NETBSD_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
add_compile_definitions(_BSD_SOURCE)
endif()
if (WHISPER_PERF)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
endif()

View File

@ -42,18 +42,55 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# ref: https://github.com/ggerganov/whisper.cpp/issues/37
ifneq ($(wildcard /usr/include/musl/*),)
CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
CFLAGS += -D_XOPEN_SOURCE=600
CXXFLAGS += -D_XOPEN_SOURCE=600
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
ifeq ($(UNAME_S),OpenBSD)
CFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
endif
# Data types, macros and functions related to controlling CPU affinity
# are available on Linux through GNU extensions in libc
ifeq ($(UNAME_S),Linux)
CFLAGS += -D_GNU_SOURCE
CXXFLAGS += -D_GNU_SOURCE
endif
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
ifeq ($(UNAME_S),Darwin)
CFLAGS += -D_DARWIN_C_SOURCE
CXXFLAGS += -D_DARWIN_C_SOURCE
endif
ifeq ($(UNAME_S),DragonFly)
CFLAGS += -D__BSD_VISIBLE
CXXFLAGS += -D__BSD_VISIBLE
endif
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
ifeq ($(UNAME_S),FreeBSD)
CFLAGS += -D__BSD_VISIBLE
CXXFLAGS += -D__BSD_VISIBLE
endif
ifeq ($(UNAME_S),NetBSD)
CFLAGS += -D_NETBSD_SOURCE
CXXFLAGS += -D_NETBSD_SOURCE
endif
ifeq ($(UNAME_S),OpenBSD)
CFLAGS += -D_BSD_SOURCE
CXXFLAGS += -D_BSD_SOURCE
endif
# OS specific
# TODO: support Windows
@ -67,7 +104,7 @@ endif
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
ifeq ($(UNAME_S),Darwin)
CPUINFO_CMD := sysctl machdep.cpu.features
CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
else ifeq ($(UNAME_S),Linux)
CPUINFO_CMD := cat /proc/cpuinfo
else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))

View File

@ -2,6 +2,7 @@ plugins {
id 'java'
id 'java-library'
id 'maven-publish'
id 'signing'
}
archivesBaseName = 'whispercpp'
@ -109,4 +110,23 @@ publishing {
}
}
}
repositories {
maven {
def releasesRepoUrl = 'https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/'
def snapshotsRepoUrl = 'https://s01.oss.sonatype.org/content/repositories/snapshots/'
url = version.endsWith('-SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
credentials {
username = System.getenv("MAVEN_USERNAME")
password = System.getenv("MAVEN_PASSWORD")
}
}
}
}
signing {
def signingKey = System.getenv("PGP_SECRET")
def signingPassword = System.getenv("PGP_PASSPHRASE")
useInMemoryPgpKeys(signingKey, signingPassword)
sign publishing.publications.mavenJava
}

View File

@ -6,8 +6,8 @@
// ref: https://github.com/ggerganov/whisper.cpp/issues/171
//
#include "common.h"
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include <sstream>

View File

@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
return true;
}
void sam_print_usage(int argc, char ** argv, const sam_params & params) {
void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");

View File

@ -324,12 +324,12 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
commandset_list.push_back(cs);
return json{{"index",index}};
}
json seek(struct whisper_context * ctx, audio_async &audio, json params) {
json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
// whisper_state has the pertinent offsets, but there also seem to be a large
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
// I'll give this another pass once everything else is implemented,
// but for now, it's unsupported
throw json{
throw json {
{"code", -32601},
{"message", "Seeking is not yet supported."}
};
@ -412,7 +412,7 @@ void process_loop(struct whisper_context * ctx, audio_async &audio, const whispe
jobqueue.pop_front();
// send response
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
std::cout.flush();
}

View File

@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
return speaker;
}
void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
return true;
}
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

View File

@ -3,8 +3,8 @@
// A very quick-n-dirty implementation serving mainly as a proof of concept.
//
#include "common.h"
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include <cassert>

View File

@ -1,11 +1,3 @@
// Defines fileno on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#include <cstddef>
#include <cstdint>
#include <cstdio>
#endif
#include "llama-util.h"
#include "llama.h"
@ -1164,7 +1156,7 @@ static bool llama_eval_internal(
const llama_token * tokens,
const int n_tokens,
const int n_past,
const int n_threads) {
int n_threads) {
// enforce that the first token is BOS
if (n_past == 0 && tokens[0] != llama_token_bos()) {
@ -1190,6 +1182,8 @@ static bool llama_eval_internal(
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_embd/hparams.n_head;
const float eps = 5e-6f; // TODO: take from hparams
auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute;
@ -1204,7 +1198,7 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
@ -1221,7 +1215,7 @@ static bool llama_eval_internal(
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
cur = ggml_rms_norm(ctx0, inpL, eps);
// cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
@ -1329,7 +1323,7 @@ static bool llama_eval_internal(
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF);
cur = ggml_rms_norm(ctx0, inpFF, eps);
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
@ -1367,7 +1361,7 @@ static bool llama_eval_internal(
// norm
{
inpL = ggml_rms_norm(ctx0, inpL);
inpL = ggml_rms_norm(ctx0, inpL, eps);
// inpL = inpL*norm(broadcasted)
inpL = ggml_mul(ctx0, inpL, model.norm);
@ -1384,8 +1378,8 @@ static bool llama_eval_internal(
//inpL = ggml_soft_max_inplace(ctx0, inpL);
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
ggml_build_forward_expand (&gf, inpL);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
@ -2488,8 +2482,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
}
struct ggml_cgraph gf = ggml_build_forward(r);
gf.n_threads = n_threads;
ggml_graph_compute(lora_ctx, &gf);
ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
@ -2635,7 +2628,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kout3d->data = out;
@ -2655,7 +2647,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
ggml_graph_compute(cpy_ctx, &gf);
ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
ggml_free(cpy_ctx);
}
@ -2743,7 +2735,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
ggml_cgraph gf{};
gf.n_threads = 1;
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kin3d->data = (void *) inp;
@ -2763,7 +2754,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
ggml_graph_compute(cpy_ctx, &gf);
ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
ggml_free(cpy_ctx);
}

View File

@ -1,8 +1,8 @@
// Talk with AI
//
#include "common.h"
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "llama.h"
@ -649,7 +649,10 @@ int main(int argc, char ** argv) {
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
if (ret != 0) {
fprintf(stderr, "%s: failed to speak\n", __func__);
}
audio.clear();

View File

@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);

View File

@ -1,8 +1,8 @@
// Talk with AI
//
#include "common.h"
#include "common-sdl.h"
#include "common.h"
#include "whisper.h"
#include "gpt-2.h"
@ -349,7 +349,10 @@ int main(int argc, char ** argv) {
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
if (ret != 0) {
fprintf(stderr, "%s: system() failed!\n", __func__);
}
audio.clear();

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<bytecodeTargetLevel target="11" />
<bytecodeTargetLevel target="17" />
</component>
</project>

View File

@ -4,15 +4,15 @@
<component name="GradleSettings">
<option name="linkedExternalProjectsSettings">
<GradleProjectSettings>
<option name="testRunner" value="GRADLE" />
<option name="distributionType" value="DEFAULT_WRAPPED" />
<option name="externalProjectPath" value="$PROJECT_DIR$" />
<option name="gradleJvm" value="#GRADLE_LOCAL_JAVA_HOME" />
<option name="modules">
<set>
<option value="$PROJECT_DIR$" />
<option value="$PROJECT_DIR$/app" />
</set>
</option>
<option name="resolveExternalAnnotations" value="false" />
</GradleProjectSettings>
</option>
</component>

View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="Android Studio default JDK" project-jdk-type="JavaSDK">
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="jbr-17" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/build/classes" />
</component>
<component name="ProjectType">

View File

@ -5,12 +5,12 @@ plugins {
android {
namespace 'com.whispercppdemo'
compileSdk 33
compileSdk 34
defaultConfig {
applicationId "com.whispercppdemo"
minSdk 26
targetSdk 32
targetSdk 34
versionCode 1
versionName "1.0"
@ -31,19 +31,19 @@ android {
}
}
compileOptions {
sourceCompatibility JavaVersion.VERSION_1_8
targetCompatibility JavaVersion.VERSION_1_8
sourceCompatibility JavaVersion.VERSION_17
targetCompatibility JavaVersion.VERSION_17
}
kotlinOptions {
jvmTarget = '1.8'
jvmTarget = '17'
}
buildFeatures {
compose true
}
composeOptions {
kotlinCompilerExtensionVersion '1.3.1'
kotlinCompilerExtensionVersion '1.5.0'
}
ndkVersion "25.1.8937393"
ndkVersion "25.2.9519653"
externalNativeBuild {
cmake {
path = file("src/main/jni/whisper/CMakeLists.txt")
@ -57,19 +57,19 @@ android {
}
dependencies {
implementation 'androidx.activity:activity-compose:1.6.1'
implementation 'androidx.compose.material:material-icons-core:1.3.1'
implementation 'androidx.compose.material3:material3:1.0.1'
implementation "androidx.compose.ui:ui:1.3.2"
implementation "androidx.compose.ui:ui-tooling-preview:1.3.2"
implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.5.1'
implementation 'androidx.activity:activity-compose:1.7.2'
implementation 'androidx.compose.material:material-icons-core:1.5.0'
implementation 'androidx.compose.material3:material3:1.1.1'
implementation "androidx.compose.ui:ui:1.5.0"
implementation "androidx.compose.ui:ui-tooling-preview:1.5.0"
implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.6.1'
implementation "com.google.accompanist:accompanist-permissions:0.28.0"
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.6.4'
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.2'
testImplementation 'junit:junit:4.13.2'
androidTestImplementation 'androidx.test.ext:junit:1.1.4'
androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.0'
androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.3.2"
debugImplementation "androidx.compose.ui:ui-tooling:1.3.2"
debugImplementation "androidx.compose.ui:ui-test-manifest:1.3.2"
androidTestImplementation 'androidx.test.ext:junit:1.1.5'
androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1'
androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.5.0"
debugImplementation "androidx.compose.ui:ui-tooling:1.5.0"
debugImplementation "androidx.compose.ui:ui-test-manifest:1.5.0"
}

View File

@ -66,7 +66,7 @@ private fun MainScreen(
@Composable
private fun MessageLog(log: String) {
SelectionContainer() {
SelectionContainer {
Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
}
}

View File

@ -47,7 +47,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
}
private suspend fun printSystemInfo() {
printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()))
}
private suspend fun loadData() {

View File

@ -13,7 +13,7 @@ import androidx.compose.runtime.SideEffect
import androidx.compose.ui.graphics.toArgb
import androidx.compose.ui.platform.LocalContext
import androidx.compose.ui.platform.LocalView
import androidx.core.view.ViewCompat
import androidx.core.view.WindowCompat
private val DarkColorScheme = darkColorScheme(
primary = Purple80,
@ -55,8 +55,9 @@ fun WhisperCppDemoTheme(
val view = LocalView.current
if (!view.isInEditMode) {
SideEffect {
(view.context as Activity).window.statusBarColor = colorScheme.primary.toArgb()
ViewCompat.getWindowInsetsController(view)?.isAppearanceLightStatusBars = darkTheme
val window = (view.context as Activity).window
window.statusBarColor = colorScheme.primary.toArgb()
WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
}
}

View File

@ -18,7 +18,9 @@ class WhisperContext private constructor(private var ptr: Long) {
suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) {
require(ptr != 0L)
WhisperLib.fullTranscribe(ptr, data)
val numThreads = WhisperCpuConfig.preferredThreadCount
Log.d(LOG_TAG, "Selecting $numThreads threads")
WhisperLib.fullTranscribe(ptr, numThreads, data)
val textCount = WhisperLib.getTextSegmentCount(ptr)
return@withContext buildString {
for (i in 0 until textCount) {
@ -126,7 +128,7 @@ private class WhisperLib {
external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
external fun initContext(modelPath: String): Long
external fun freeContext(contextPtr: Long)
external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
external fun fullTranscribe(contextPtr: Long, numThreads: Int, audioData: FloatArray)
external fun getTextSegmentCount(contextPtr: Long): Int
external fun getTextSegment(contextPtr: Long, index: Int): String
external fun getSystemInfo(): String

View File

@ -0,0 +1,73 @@
package com.whispercppdemo.whisper
import android.util.Log
import java.io.BufferedReader
import java.io.FileReader
object WhisperCpuConfig {
val preferredThreadCount: Int
// Always use at least 2 threads:
get() = CpuInfo.getHighPerfCpuCount().coerceAtLeast(2)
}
private class CpuInfo(private val lines: List<String>) {
private fun getHighPerfCpuCount(): Int = try {
getHighPerfCpuCountByFrequencies()
} catch (e: Exception) {
Log.d(LOG_TAG, "Couldn't read CPU frequencies", e)
getHighPerfCpuCountByVariant()
}
private fun getHighPerfCpuCountByFrequencies(): Int =
getCpuValues(property = "processor") { getMaxCpuFrequency(it.toInt()) }
.also { Log.d(LOG_TAG, "Binned cpu frequencies (frequency, count): ${it.binnedValues()}") }
.countDroppingMin()
private fun getHighPerfCpuCountByVariant(): Int =
getCpuValues(property = "CPU variant") { it.substringAfter("0x").toInt(radix = 16) }
.also { Log.d(LOG_TAG, "Binned cpu variants (variant, count): ${it.binnedValues()}") }
.countKeepingMin()
private fun List<Int>.binnedValues() = groupingBy { it }.eachCount()
private fun getCpuValues(property: String, mapper: (String) -> Int) = lines
.asSequence()
.filter { it.startsWith(property) }
.map { mapper(it.substringAfter(':').trim()) }
.sorted()
.toList()
private fun List<Int>.countDroppingMin(): Int {
val min = min()
return count { it > min }
}
private fun List<Int>.countKeepingMin(): Int {
val min = min()
return count { it == min }
}
companion object {
private const val LOG_TAG = "WhisperCpuConfig"
fun getHighPerfCpuCount(): Int = try {
readCpuInfo().getHighPerfCpuCount()
} catch (e: Exception) {
Log.d(LOG_TAG, "Couldn't read CPU info", e)
// Our best guess -- just return the # of CPUs minus 4.
(Runtime.getRuntime().availableProcessors() - 4).coerceAtLeast(0)
}
private fun readCpuInfo() = CpuInfo(
BufferedReader(FileReader("/proc/cpuinfo"))
.useLines { it.toList() }
)
private fun getMaxCpuFrequency(cpuIndex: Int): Int {
val path = "/sys/devices/system/cpu/cpu${cpuIndex}/cpufreq/cpuinfo_max_freq"
val maxFreq = BufferedReader(FileReader(path)).use { it.readLine() }
return maxFreq.toInt()
}
}
}

View File

@ -163,16 +163,12 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(
JNIEXPORT void JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
JNIEnv *env, jobject thiz, jlong context_ptr, jfloatArray audio_data) {
JNIEnv *env, jobject thiz, jlong context_ptr, jint num_threads, jfloatArray audio_data) {
UNUSED(thiz);
struct whisper_context *context = (struct whisper_context *) context_ptr;
jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL);
const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data);
// Leave 2 processors free (i.e. the high-efficiency cores).
int max_threads = max(1, min(8, get_nprocs() - 2));
LOGI("Selecting %d threads", max_threads);
// The below adapted from the Objective-C iOS sample
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
params.print_realtime = true;
@ -181,7 +177,7 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
params.print_special = false;
params.translate = false;
params.language = "en";
params.n_threads = max_threads;
params.n_threads = num_threads;
params.offset_ms = 0;
params.no_context = true;
params.single_segment = false;

View File

@ -1,10 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="purple_200">#FFBB86FC</color>
<color name="purple_500">#FF6200EE</color>
<color name="purple_700">#FF3700B3</color>
<color name="teal_200">#FF03DAC5</color>
<color name="teal_700">#FF018786</color>
<color name="black">#FF000000</color>
<color name="white">#FFFFFFFF</color>
</resources>

View File

@ -1,6 +1,6 @@
// Top-level build file where you can add configuration options common to all sub-projects/modules.
plugins {
id 'com.android.application' version '7.3.1' apply false
id 'com.android.library' version '7.3.1' apply false
id 'org.jetbrains.kotlin.android' version '1.7.10' apply false
id 'com.android.application' version '8.1.1' apply false
id 'com.android.library' version '8.1.1' apply false
id 'org.jetbrains.kotlin.android' version '1.9.0' apply false
}

View File

@ -1,6 +1,6 @@
#Wed Dec 14 10:37:24 EST 2022
distributionBase=GRADLE_USER_HOME
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
distributionPath=wrapper/dists
zipStorePath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME

View File

@ -4086,7 +4086,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
}
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int half_n_dims = ncols/4;
@ -4098,8 +4099,9 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
const int i = row*ncols + col;
const float col_theta_scale = powf(theta_scale, col);
const float p = p0 + p_delta*(row/p_delta_rows);
const float theta = p*col_theta_scale;
const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale;
const float sin_theta = sinf(theta);
const float cos_theta = cosf(theta);
@ -4109,7 +4111,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
const float block_theta = block_p*col_theta_scale;
const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale;
const float sin_block_theta = sinf(block_theta);
const float cos_block_theta = cosf(block_theta);
@ -4984,12 +4986,13 @@ static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, co
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
}
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(nrows % 4 == 0);
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
GGML_ASSERT(ncols % 4 == 0);
const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
const dim3 block_nums(num_blocks_x, nrows, 1);
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx);
}
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@ -5723,22 +5726,18 @@ inline void ggml_cuda_op_rope(
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
const bool is_neox = mode & 2;
const bool is_glm = mode & 4;
// compute
if (is_glm) {
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
const float id_p = min(p, n_ctx - 2.f);
const float block_p = max(p - (n_ctx - 2.f), 0.f);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, n_ctx, cudaStream_main);
} else if (is_neox) {
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
} else {
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
}
@ -6400,10 +6399,7 @@ void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ten
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
const int mode = ((int32_t *) dst->op_params)[2];
const bool is_glm = mode & 4;
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, true);
}
void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

View File

@ -327,7 +327,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
const int result = posix_memalign((void **) &data, getpagesize(), n);
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__);
return NULL;
@ -401,7 +401,7 @@ bool ggml_metal_add_buffer(
}
}
const size_t size_page = getpagesize();
const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
@ -1141,7 +1141,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
} break;
case GGML_OP_DUP:
case GGML_OP_CPY:

View File

@ -220,14 +220,10 @@ kernel void kernel_norm(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
//// broadcast
//if (tpitg == 0) {
// sum[0] /= ne00;
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float mean = sum[0];
const float mean = sum[0] / ne00;
// recenter and VARIANCE
threadgroup_barrier(mem_flags::mem_threadgroup);
device float * y = dst + tgpig*ne00;
sum[tpitg] = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -235,12 +231,6 @@ kernel void kernel_norm(
sum[tpitg] += y[i00] * y[i00];
}
//// VARIANCE
//// parallel sum
//sum[tpitg] = 0.0f;
//for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
// sum[tpitg] += y[i00] * y[i00];
//}
// reduce
threadgroup_barrier(mem_flags::mem_threadgroup);
for (uint i = ntg/2; i > 0; i /= 2) {
@ -249,12 +239,7 @@ kernel void kernel_norm(
}
threadgroup_barrier(mem_flags::mem_threadgroup);
}
//// broadcast
//if (tpitg == 0) {
// sum[0] /= ne00;
//}
//threadgroup_barrier(mem_flags::mem_threadgroup);
const float variance = sum[0];
const float variance = sum[0] / ne00;
const float scale = 1.0f/sqrt(variance + eps);
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
@ -262,7 +247,6 @@ kernel void kernel_norm(
}
}
kernel void kernel_rms_norm(
device const void * src0,
device float * dst,
@ -630,7 +614,6 @@ kernel void kernel_mul_mat_f16_f32(
}
}
}
}
kernel void kernel_alibi_f32(
@ -699,25 +682,27 @@ kernel void kernel_rope(
constant int & mode,
constant float & freq_base,
constant float & freq_scale,
uint3 tpig[[thread_position_in_grid]]) {
const int64_t i3 = tpig[2];
const int64_t i2 = tpig[1];
const int64_t i1 = tpig[0];
uint tiitg[[thread_index_in_threadgroup]],
uint3 tptg[[threads_per_threadgroup]],
uint3 tgpig[[threadgroup_position_in_grid]]) {
const int64_t i3 = tgpig[2];
const int64_t i2 = tgpig[1];
const int64_t i1 = tgpig[0];
const bool is_neox = mode & 2;
const float theta_scale = pow(freq_base, -2.0f/n_dims);
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
float theta = freq_scale * (float)p;
const float theta_0 = freq_scale * (float)p;
const float inv_ndims = -1.f/n_dims;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
theta *= theta_scale;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -729,12 +714,12 @@ kernel void kernel_rope(
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
theta *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);

ggml.c
View File

@ -1,4 +1,3 @@
#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
#include "ggml.h"
@ -107,6 +106,9 @@ typedef void * thread_ret_t;
#include <sys/stat.h>
#include <unistd.h>
#endif
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@ -196,9 +198,15 @@ typedef void * thread_ret_t;
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
#else
inline static void * ggml_aligned_malloc(size_t size) {
if (size == 0) {
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
return NULL;
}
void * aligned_memory = NULL;
#ifdef GGML_USE_METAL
int result = posix_memalign(&aligned_memory, getpagesize(), size);
#ifdef GGML_USE_CPU_HBM
int result = hbw_posix_memalign(&aligned_memory, 16, size);
#elif GGML_USE_METAL
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
#else
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
#endif
@ -219,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
return aligned_memory;
}
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
#ifdef GGML_USE_CPU_HBM
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
#else
#define GGML_ALIGNED_FREE(ptr) free(ptr)
#endif
#endif
#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@ -4572,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
return NULL;
}
// allow to call ggml_init with 0 size
if (params.mem_size == 0) {
params.mem_size = GGML_MEM_ALIGN;
}
const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
@ -4774,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
size_t obj_alloc_size = 0;
if (view_src == NULL && ctx->no_alloc == false) {
if (view_src == NULL && !ctx->no_alloc) {
if (ctx->scratch.data != NULL) {
// allocate tensor data in the scratch buffer
if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@ -5475,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
}
if (inplace) {
GGML_ASSERT(is_node == false);
GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -5518,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
}
if (inplace) {
GGML_ASSERT(is_node == false);
GGML_ASSERT(!is_node);
}
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -17266,10 +17283,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
} else {
// wait for other threads to finish
const int last = node_n;
do {
//sched_yield();
while (true) {
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
// depending on the workload and the operating system.
// since it is not clear what is the best approach, it should potentially become user-configurable
// ref: https://github.com/ggerganov/ggml/issues/291
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
sched_yield();
#endif
node_n = atomic_load(&state->shared->node_n);
} while (node_n == last);
if (node_n != last) break;
};
}
// check if we should stop
@ -19962,7 +19987,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
struct ggml_tensor * data = NULL;
if (params.no_alloc == false) {
if (!params.no_alloc) {
data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
ok = ok && data != NULL;
@ -20003,7 +20028,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
}
// point the data member to the appropriate location in the binary blob using the tensor infos
if (params.no_alloc == false) {
if (!params.no_alloc) {
//cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
}

View File

@ -22,7 +22,28 @@ function get_script_path() {
models_path="$(get_script_path)"
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small.en-tdrz" "small" "medium.en" "medium" "large-v1" "large" )
models=(
"tiny.en"
"tiny"
"tiny-q5_1"
"tiny.en-q5_1"
"base.en"
"base"
"base-q5_1"
"base.en-q5_1"
"small.en"
"small.en-tdrz"
"small"
"small-q5_1"
"small.en-q5_1"
"medium"
"medium.en"
"medium-q5_0"
"medium.en-q5_0"
"large-v1"
"large"
"large-q5_0"
)
# list available models
function list_models {

View File

@ -18,6 +18,7 @@
#include <cstring>
#include <fstream>
#include <map>
#include <set>
#include <string>
#include <thread>
#include <vector>
@ -117,6 +118,21 @@ static void byteswap_tensor(ggml_tensor * tensor) {
#define WHISPER_USE_SCRATCH
#define WHISPER_MAX_SCRATCH_BUFFERS 16
//
// ggml helpers
//
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
ggml_graph_compute(graph, &plan);
}
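// Illustrative usage sketch (comment only, not part of this diff): the helper
// above wraps the two-step ggml API from this sync: ggml_graph_plan() sizes a
// work buffer for the requested thread count, and ggml_graph_compute() then
// runs the graph with that plan. A caller looks like:
//
//   std::vector<uint8_t> work;
//   struct ggml_cgraph gf = {};
//   ggml_build_forward_expand(&gf, cur);               // cur: tensor to evaluate
//   ggml_graph_compute_helper(work, &gf, n_threads);   // resizes `work` as needed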
// available whisper models
enum e_model {
MODEL_UNKNOWN,
@ -537,6 +553,7 @@ struct whisper_kv_cache {
struct ggml_context * ctx;
// buf points to the memory allocated for both ggml_tensor 'k' and 'v' (see kv_cache_init)
std::vector<uint8_t> buf;
int n; // number of tokens currently in the cache
@ -602,7 +619,7 @@ struct whisper_sequence {
// TAGS: WHISPER_DECODER_INIT
struct whisper_decoder {
// each decoders keeps its own KV-cache
// each decoder keeps its own KV-cache
whisper_kv_cache kv_self;
// the currently generated sequence of tokens
@ -622,6 +639,24 @@ struct whisper_decoder {
std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
};
// replace std::pair with a customized pair struct (reason: std::pair is very slow)
template<typename A, typename B>
struct whisper_pair {
A first;
B second;
// Define a constructor that takes two arguments.
whisper_pair(const A& a, const B& b) : first(a), second(b) {}
// Define a constructor that takes no argument.
whisper_pair() : first(A()), second(B()) {}
};
// beam-search helpers
struct kv_buf {
std::vector<uint8_t> k;
std::vector<uint8_t> v;
};
struct whisper_state {
int64_t t_sample_us = 0;
int64_t t_encode_us = 0;
@ -641,8 +676,12 @@ struct whisper_state {
whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
// buffer for swapping KV caches between decoders during beam-search
std::vector<kv_buf> kv_swap_bufs;
// memory buffers used by encode / decode contexts
std::vector<uint8_t> buf_compute;
std::vector<uint8_t> buf_work;
std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
int buf_last = 0;
@ -655,7 +694,7 @@ struct whisper_state {
std::vector<whisper_token> prompt_past;
// work container used to avoid memory allocations
std::vector<std::pair<double, whisper_vocab::id>> logits_id;
std::vector<whisper_pair<double, whisper_vocab::id>> logits_id;
mutable std::mt19937 rng; // used for sampling at t > 0.0
@ -1807,8 +1846,8 @@ static bool whisper_encode_internal(
{
struct ggml_cgraph gf = {};
ggml_build_forward_expand (&gf, cur);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
ggml_build_forward_expand(&gf, cur);
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
//ggml_graph_print(&gf);
}
@ -1893,7 +1932,7 @@ static bool whisper_encode_internal(
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
}
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
//ggml_graph_print(&gf);
}
@ -2306,8 +2345,8 @@ static bool whisper_decode_internal(
// run the computation
{
ggml_build_forward_expand (&gf, logits);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
ggml_build_forward_expand(&gf, logits);
ggml_graph_compute_helper(wstate.buf_work, &gf, n_threads);
}
// extract logits for all N tokens
@ -3975,17 +4014,21 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
auto & logits_id = state.logits_id;
logits_id.clear();
logits_id.resize(n_logits);
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back({ logits[i], i });
logits_id[i].first = logits[i];
logits_id[i].second = i;
}
std::partial_sort(
logits_id.begin(),
logits_id.begin() + k, logits_id.end(),
[](const std::pair<double, whisper_token> & a, const std::pair<double, whisper_token> & b) {
return a.first > b.first;
});
{
using pair_type = std::remove_reference<decltype(logits_id)>::type::value_type;
std::partial_sort(
logits_id.begin(),
logits_id.begin() + k, logits_id.end(),
[](const pair_type & a, const pair_type & b) {
return a.first > b.first;
});
}
std::vector<whisper_token_data> result;
result.reserve(k);
@ -4080,6 +4123,115 @@ static void whisper_sequence_score(
}
}
static bool whisper_kv_swap_fast(
std::vector<int> & view,
whisper_decoder src[],
std::vector<kv_buf> & kv_swap_bufs,
const int & n_decoders) {
WHISPER_PRINT_DEBUG("%s: n_decoders %d\n", __func__, n_decoders);
// (decoder->buffer->decoder or decoder->buffer + decoder->decoder)
std::set<int> two_copy; // decoder indices require two copies to safely modify KV caches
// (buffer->decoder or decoder->decoder)
std::set<int> one_copy; // decoder indices require one copy to safely modify KV caches
// (decoder<->decoder)
std::set<int> p_swap_set; // decoder indices able to swap KV-cache pointers
std::vector<whisper_pair<int, int>> p_swap_vec;
p_swap_vec.reserve(n_decoders);
// see https://github.com/ggerganov/whisper.cpp/wiki
for (int i = 0; i < n_decoders; i++) {
// zero-copy (no modification)
if (i == view[i] || view[i] < 0) {
continue;
}
bool is_one_copy = true;
// since we modify data sequentially, we only consider decoder indices after current index
for (int j = i + 1; j < n_decoders; j++) {
if (i == view[j]) {
// detect a symmetric exchange (i and j adopt each other's caches)
if (j == view[i]) {
p_swap_set.insert(i);
p_swap_set.insert(j);
p_swap_vec.emplace_back(i, j);
} else {
two_copy.insert(i);
is_one_copy = false;
}
break;
}
}
if (is_one_copy) {
one_copy.insert(i);
}
}
kv_swap_bufs.resize(n_decoders);
for (int i = 0; i < n_decoders; i++) {
kv_swap_bufs[i].k.resize(ggml_nbytes(src[i].kv_self.k));
kv_swap_bufs[i].v.resize(ggml_nbytes(src[i].kv_self.v));
}
for (auto & i : two_copy) {
// make a copy of KV caches
WHISPER_PRINT_DEBUG("%s: store KV cache into swap: idx %d\n", __func__, i);
memcpy(kv_swap_bufs[i].k.data(), src[i].kv_self.k->data, kv_swap_bufs[i].k.size());
memcpy(kv_swap_bufs[i].v.data(), src[i].kv_self.v->data, kv_swap_bufs[i].v.size());
}
// since two-copy decoder KV caches are protected by kv_swap_bufs, modify them first
for (auto & i : two_copy) {
// skip the decoder indices that require pointer swapping
if (p_swap_set.find(i) != p_swap_set.end()) {
continue;
}
if (two_copy.find(view[i]) != two_copy.end()) {
// modify KV caches of decoder using data from kv_swap_bufs
WHISPER_PRINT_DEBUG("%s: two-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
} else {
// modify KV caches of the decoder directly from the corresponding decoder's KV caches
WHISPER_PRINT_DEBUG("%s: two-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i);
memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
}
}
// then modify one-copy decoder KV caches
for (auto & i : one_copy) {
// skip the decoder indices that require pointer swapping
if (p_swap_set.find(i) != p_swap_set.end()) {
continue;
}
if (two_copy.find(view[i]) != two_copy.end()) {
// modify KV caches of decoder using data from kv_swap_bufs
WHISPER_PRINT_DEBUG("%s: one-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i);
memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size());
memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size());
} else {
// modify KV caches of the decoder directly from the corresponding decoder's KV caches
WHISPER_PRINT_DEBUG("%s: one-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i);
memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k));
memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v));
}
}
// swap the pointers
for (auto & i : p_swap_vec) {
WHISPER_PRINT_DEBUG("%s: swap pointers: %d <-> %d\n", __func__, i.first, i.second);
std::swap(src[i.first].kv_self, src[i.second].kv_self);
}
return true;
}
int whisper_full_with_state(
struct whisper_context * ctx,
struct whisper_state * state,
@ -4243,14 +4395,6 @@ int whisper_full_with_state(
std::vector<whisper_token> prompt;
prompt.reserve(whisper_n_text_ctx(ctx));
// beam-search helpers
struct kv_buf {
std::vector<uint8_t> k;
std::vector<uint8_t> v;
};
std::vector<kv_buf> kv_bufs;
struct beam_candidate {
int decoder_idx;
int seek_delta;
@ -4399,23 +4543,7 @@ int whisper_full_with_state(
for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
const int64_t t_start_sample_us = ggml_time_us();
// store the KV caches of all decoders when doing beam-search
if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
kv_bufs.resize(n_decoders_cur);
for (int j = 0; j < n_decoders_cur; ++j) {
auto & decoder = state->decoders[j];
if (decoder.completed || decoder.failed) {
continue;
}
kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
}
beam_candidates.clear();
}
@ -4463,6 +4591,7 @@ int whisper_full_with_state(
});
uint32_t cur_c = 0;
std::vector<int> decoder_idx(n_decoders_cur, -1);
for (int j = 0; j < n_decoders_cur; ++j) {
auto & decoder = state->decoders[j];
@ -4481,12 +4610,13 @@ int whisper_full_with_state(
decoder.seek_delta = cur.seek_delta;
decoder.has_ts = cur.has_ts;
memcpy(decoder.kv_self.k->data, kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size());
memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size());
decoder_idx[j] = cur.decoder_idx;
WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
__func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
}
// update KV caches
whisper_kv_swap_fast(decoder_idx, state->decoders, state->kv_swap_bufs, n_decoders_cur);
}
// update the decoder state
@ -5111,7 +5241,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// b: N*N*sizeof(float)
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
std::vector<uint8_t> buf (3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead());
std::vector<uint8_t> work(1llu*N_max*N_max*sizeof(float) + 1*ggml_tensor_overhead());
// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@ -5166,12 +5297,12 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
double tsum = 0.0;
// heat-up
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
ggml_graph_compute_helper(work, &gf, n_threads);
for (int i = 0; i < n_max; ++i) {
const int64_t t0 = ggml_time_us();
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
ggml_graph_compute_helper(work, &gf, n_threads);
const int64_t t1 = ggml_time_us();