Compare commits

..

1 Commits

Author SHA1 Message Date
aaa3b5e5f6 ggml : try to fix the abort mechanism 2023-11-05 20:02:24 +02:00
60 changed files with 299 additions and 634 deletions

View File

@ -88,7 +88,7 @@ jobs:
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
apt update
apt install -y build-essential cmake libsdl2-dev
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
make
ctest -L gh --output-on-failure'
@ -115,7 +115,7 @@ jobs:
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
apt update
apt install -y build-essential cmake libsdl2-dev
cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
make
ctest -L gh --output-on-failure'
@ -182,7 +182,7 @@ jobs:
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_SDL2=${{ matrix.sdl2 }}
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
- name: Build
run: |
@ -217,10 +217,10 @@ jobs:
sdl2: [ON]
include:
- arch: Win32
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x86.zip
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
s2arc: x86
- arch: x64
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x64.zip
obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
@ -239,7 +239,7 @@ jobs:
7z x blas.zip -oblas -y
copy blas/include/cblas.h .
copy blas/include/openblas_config.h .
echo "OPENBLAS_PATH=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
@ -252,9 +252,9 @@ jobs:
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
-DWHISPER_SDL2=${{ matrix.sdl2 }}
-DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
-DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
- name: Build
run: |
@ -263,7 +263,7 @@ jobs:
- name: Copy libopenblas.dll
if: matrix.blas == 'ON'
run: copy "$env:OPENBLAS_PATH/bin/libopenblas.dll" build/bin/${{ matrix.build }}
run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'

View File

@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.5)
project(whisper.cpp VERSION 1.4.3)
project(whisper.cpp VERSION 1.4.2)
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

View File

@ -417,10 +417,9 @@ samples:
.PHONY: medium.en
.PHONY: medium
.PHONY: large-v1
.PHONY: large-v2
.PHONY: large
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
bash ./models/download-ggml-model.sh $@
@echo ""
@echo "==============================================="

View File

@ -6,7 +6,7 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Beta: [v1.4.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.3) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Beta: [v1.4.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.2) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -234,7 +234,6 @@ make small
make medium.en
make medium
make large-v1
make large-v2
make large
```
@ -246,7 +245,7 @@ make large
| base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Quantization

View File

@ -24,7 +24,7 @@ const (
var (
// The models which will be downloaded, if no model is specified as an argument
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
)
var (

View File

@ -83,6 +83,7 @@ const (
SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
NumFFT = C.WHISPER_N_FFT
NumMEL = C.WHISPER_N_MEL
HopLength = C.WHISPER_HOP_LENGTH
ChunkSize = C.WHISPER_CHUNK_SIZE
)
@ -102,7 +103,7 @@ var (
func Whisper_init(path string) *Context {
cPath := C.CString(path)
defer C.free(unsafe.Pointer(cPath))
if ctx := C.whisper_init_from_file_with_params(cPath, C.whisper_context_default_params()); ctx != nil {
if ctx := C.whisper_init_from_file(cPath); ctx != nil {
return (*Context)(ctx)
} else {
return nil

View File

@ -4,7 +4,6 @@ import com.sun.jna.Structure;
import com.sun.jna.ptr.PointerByReference;
import io.github.ggerganov.whispercpp.ggml.GgmlType;
import io.github.ggerganov.whispercpp.WhisperModel;
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
import java.util.List;
@ -24,9 +23,8 @@ public class WhisperContext extends Structure {
public PointerByReference vocab;
public PointerByReference state;
/** populated by whisper_init_from_file_with_params() */
/** populated by whisper_init_from_file() */
String path_model;
WhisperContextParams params;
// public static class ByReference extends WhisperContext implements Structure.ByReference {
// }

View File

@ -2,7 +2,6 @@ package io.github.ggerganov.whispercpp;
import com.sun.jna.Native;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;
@ -16,9 +15,8 @@ import java.io.IOException;
public class WhisperCpp implements AutoCloseable {
private WhisperCppJnaLibrary lib = WhisperCppJnaLibrary.instance;
private Pointer ctx = null;
private Pointer paramsPointer = null;
private Pointer greedyParamsPointer = null;
private Pointer beamParamsPointer = null;
private Pointer greedyPointer = null;
private Pointer beamPointer = null;
public File modelDir() {
String modelDirPath = System.getenv("XDG_CACHE_HOME");
@ -33,18 +31,6 @@ public class WhisperCpp implements AutoCloseable {
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
*/
public void initContext(String modelPath) throws FileNotFoundException {
initContextImpl(modelPath, getContextDefaultParams());
}
/**
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
* @param params - params to use when initialising the context
*/
public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
initContextImpl(modelPath, params);
}
private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
if (ctx != null) {
lib.whisper_free(ctx);
}
@ -57,26 +43,13 @@ public class WhisperCpp implements AutoCloseable {
modelPath = new File(modelDir(), modelPath).getAbsolutePath();
}
ctx = lib.whisper_init_from_file_with_params(modelPath, params);
ctx = lib.whisper_init_from_file(modelPath);
if (ctx == null) {
throw new FileNotFoundException(modelPath);
}
}
/**
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
* Because this function allocates memory for the params, the caller must call either:
* - call `whisper_free_context_params()`
* - `Native.free(Pointer.nativeValue(pointer));`
*/
public WhisperContextParams getContextDefaultParams() {
paramsPointer = lib.whisper_context_default_params_by_ref();
WhisperContextParams params = new WhisperContextParams(paramsPointer);
params.read();
return params;
}
/**
* Provides default params which can be used with `whisper_full()` etc.
* Because this function allocates memory for the params, the caller must call either:
@ -90,15 +63,15 @@ public class WhisperCpp implements AutoCloseable {
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
if (strategy == WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY) {
if (greedyParamsPointer == null) {
greedyParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
if (greedyPointer == null) {
greedyPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
}
pointer = greedyParamsPointer;
pointer = greedyPointer;
} else {
if (beamParamsPointer == null) {
beamParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
if (beamPointer == null) {
beamPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
}
pointer = beamParamsPointer;
pointer = beamPointer;
}
WhisperFullParams params = new WhisperFullParams(pointer);
@ -120,17 +93,13 @@ public class WhisperCpp implements AutoCloseable {
}
private void freeParams() {
if (paramsPointer != null) {
Native.free(Pointer.nativeValue(paramsPointer));
paramsPointer = null;
if (greedyPointer != null) {
Native.free(Pointer.nativeValue(greedyPointer));
greedyPointer = null;
}
if (greedyParamsPointer != null) {
Native.free(Pointer.nativeValue(greedyParamsPointer));
greedyParamsPointer = null;
}
if (beamParamsPointer != null) {
Native.free(Pointer.nativeValue(beamParamsPointer));
beamParamsPointer = null;
if (beamPointer != null) {
Native.free(Pointer.nativeValue(beamPointer));
beamPointer = null;
}
}

View File

@ -5,7 +5,6 @@ import com.sun.jna.Native;
import com.sun.jna.Pointer;
import io.github.ggerganov.whispercpp.model.WhisperModelLoader;
import io.github.ggerganov.whispercpp.model.WhisperTokenData;
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
import io.github.ggerganov.whispercpp.params.WhisperFullParams;
public interface WhisperCppJnaLibrary extends Library {
@ -14,31 +13,12 @@ public interface WhisperCppJnaLibrary extends Library {
String whisper_print_system_info();
/**
* DEPRECATED. Allocate (almost) all memory needed for the model by loading from a file.
* Allocate (almost) all memory needed for the model by loading from a file.
*
* @param path_model Path to the model file
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_file(String path_model);
/**
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
* Because this function allocates memory for the params, the caller must call either:
* - call `whisper_free_context_params()`
* - `Native.free(Pointer.nativeValue(pointer));`
*/
Pointer whisper_context_default_params_by_ref();
void whisper_free_context_params(Pointer params);
/**
* Allocate (almost) all memory needed for the model by loading from a file.
*
* @param path_model Path to the model file
* @param params Pointer to whisper_context_params
* @return Whisper context on success, null on failure
*/
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
/**
* Allocate (almost) all memory needed for the model by loading from a buffer.

View File

@ -1,31 +0,0 @@
package io.github.ggerganov.whispercpp.params;
import com.sun.jna.*;
import java.util.Arrays;
import java.util.List;
/**
* Parameters for the whisper_init_from_file_with_params() function.
* If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
* whisper_context_default_params()
*/
public class WhisperContextParams extends Structure {
public WhisperContextParams(Pointer p) {
super(p);
}
/** Use GPU for inference Number (default = true) */
public CBool use_gpu;
/** Use GPU for inference Number (default = true) */
public void useGpu(boolean enable) {
use_gpu = enable ? CBool.TRUE : CBool.FALSE;
}
@Override
protected List<String> getFieldOrder() {
return Arrays.asList("use_gpu");
}
}

View File

@ -20,7 +20,7 @@ struct whisper_context * g_context;
EMSCRIPTEN_BINDINGS(whisper) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
if (g_context == nullptr) {
g_context = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
g_context = whisper_init_from_file(path_model.c_str());
if (g_context != nullptr) {
return true;
} else {

View File

@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.4.3",
"version": "1.4.2",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

View File

@ -87,7 +87,7 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
}
rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path));
if (rw->context == nullptr) {
rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
}

View File

@ -11,7 +11,6 @@ const whisperParamsMock = {
language: "en",
model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
use_gpu: true,
};
describe("Run whisper.node", () => {

View File

@ -36,7 +36,6 @@ struct whisper_params {
bool print_colors = false;
bool print_progress = false;
bool no_timestamps = false;
bool use_gpu = true;
std::string language = "en";
std::string prompt;
@ -154,9 +153,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
@ -318,12 +315,10 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
std::string language = whisper_params.Get("language").As<Napi::String>();
std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
params.language = language;
params.model = model;
params.fname_inp.emplace_back(input);
params.use_gpu = use_gpu;
Napi::Function callback = info[1].As<Napi::Function>();
Worker* worker = new Worker(callback, params);

View File

@ -11,7 +11,6 @@ const whisperParams = {
language: "en",
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
fname_inp: "../../samples/jfk.wav",
use_gpu: true,
};
const arguments = process.argv.slice(2);

View File

@ -23,9 +23,7 @@ void bench_main(size_t index) {
fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
const int n_mels = whisper_model_n_mels(ctx);
if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return;
}
@ -59,7 +57,7 @@ EMSCRIPTEN_BINDINGS(bench) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
g_contexts[i] = whisper_init_from_file(path_model.c_str());
if (g_contexts[i] != nullptr) {
if (g_worker.joinable()) {
g_worker.join();

View File

@ -11,8 +11,6 @@ struct whisper_params {
int32_t what = 0; // what to benchmark: 0 - whisper ecoder, 1 - memcpy, 2 - ggml_mul_mat
std::string model = "models/ggml-base.en.bin";
bool use_gpu = true;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -25,10 +23,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@ -48,7 +45,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
@ -58,10 +54,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
int whisper_bench_full(const whisper_params & params) {
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
{
fprintf(stderr, "\n");
@ -73,9 +66,7 @@ int whisper_bench_full(const whisper_params & params) {
return 2;
}
const int n_mels = whisper_model_n_mels(ctx);
if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return 3;
}

View File

@ -243,7 +243,7 @@ EMSCRIPTEN_BINDINGS(command) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
g_contexts[i] = whisper_init_from_file(path_model.c_str());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {

View File

@ -38,7 +38,6 @@ struct whisper_params {
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool use_gpu = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
@ -69,7 +68,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
@ -103,7 +101,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
@ -613,10 +610,7 @@ int main(int argc, char ** argv) {
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
// print some info about the processing
{

View File

@ -48,7 +48,7 @@ if [ -n "$3" ]; then
fi
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {

View File

@ -30,7 +30,6 @@ struct whisper_params {
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool use_gpu = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
@ -73,7 +72,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else {
@ -104,7 +102,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n");
@ -435,9 +432,7 @@ int main(int argc, char ** argv) {
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
// init audio
audio_async audio(30*1000);

View File

@ -90,7 +90,6 @@ struct whisper_params {
bool print_progress = false;
bool no_timestamps = false;
bool log_score = false;
bool use_gpu = true;
std::string language = "en";
std::string prompt;
@ -166,7 +165,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@ -223,7 +221,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, "\n");
}
@ -880,10 +877,7 @@ int main(int argc, char ** argv) {
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");

View File

@ -132,7 +132,7 @@ EMSCRIPTEN_BINDINGS(stream) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
g_contexts[i] = whisper_init_from_file(path_model.c_str());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {

View File

@ -48,12 +48,11 @@ struct whisper_params {
bool no_context = true;
bool no_timestamps = false;
bool tinydiarize = false;
bool save_audio = false; // save audio to wav file
bool use_gpu = true;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out;
bool save_audio = false; // save audio to wav file
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -66,26 +65,25 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -120,9 +118,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
fprintf(stderr, "\n");
}
@ -166,10 +163,7 @@ int main(int argc, char ** argv) {
exit(0);
}
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
std::vector<float> pcmf32_old;
@ -430,4 +424,4 @@ int main(int argc, char ** argv) {
whisper_free(ctx);
return 0;
}
}

View File

@ -14,8 +14,6 @@ if (WHISPER_SDL2)
../common-sdl.cpp
../../ggml.c
../../ggml-alloc.c
../../ggml-backend.c
../../ggml-quants.c
../../whisper.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)

View File

@ -63,7 +63,6 @@ struct whisper_params {
bool print_energy = false;
bool no_timestamps = true;
bool verbose_prompt = false;
bool use_gpu = true;
std::string person = "Georgi";
std::string language = "en";
@ -85,26 +84,25 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-vp" || arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "--session") { params.path_session = argv[++i];}
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "--prompt-file") {
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "--session") { params.path_session = argv[++i];}
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
@ -112,7 +110,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
}
}
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
@ -128,28 +125,27 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -vp, --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
@ -256,19 +252,13 @@ int main(int argc, char ** argv) {
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
// llama init
llama_backend_init(true);
auto lmparams = llama_model_default_params();
if (!params.use_gpu) {
lmparams.n_gpu_layers = 0;
}
struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);

View File

@ -271,7 +271,7 @@ EMSCRIPTEN_BINDINGS(talk) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
g_contexts[i] = whisper_init_from_file(path_model.c_str());
if (g_contexts[i] != nullptr) {
g_running = true;
if (g_worker.joinable()) {

View File

@ -31,7 +31,6 @@ struct whisper_params {
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool use_gpu = true;
std::string person = "Santa";
std::string language = "en";
@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
@ -184,10 +181,8 @@ int main(int argc, char ** argv) {
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
// gpt init

View File

@ -21,7 +21,7 @@ help()
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
echo "options:"
echo "-s Step in seconds (default is $step)."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
echo "-t Number of threads to use."
echo "-h Print this help page."
echo

View File

@ -18,7 +18,9 @@ android {
vectorDrawables {
useSupportLibrary true
}
ndk {
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
}
}
buildTypes {
@ -41,10 +43,20 @@ android {
composeOptions {
kotlinCompilerExtensionVersion '1.5.0'
}
ndkVersion "25.2.9519653"
externalNativeBuild {
cmake {
path = file("src/main/jni/whisper/CMakeLists.txt")
}
}
packagingOptions {
resources {
excludes += '/META-INF/{AL2.0,LGPL2.1}'
}
}
}
dependencies {
implementation project(':lib')
implementation 'androidx.activity:activity-compose:1.7.2'
implementation 'androidx.compose.material:material-icons-core:1.5.0'
implementation 'androidx.compose.material3:material3:1.1.1'

View File

@ -15,7 +15,7 @@ import androidx.lifecycle.viewmodel.initializer
import androidx.lifecycle.viewmodel.viewModelFactory
import com.whispercppdemo.media.decodeWaveFile
import com.whispercppdemo.recorder.Recorder
import com.whispercpp.whisper.WhisperContext
import com.whispercppdemo.whisper.WhisperContext
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking
@ -35,7 +35,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
private val modelsPath = File(application.filesDir, "models")
private val samplesPath = File(application.filesDir, "samples")
private var recorder: Recorder = Recorder()
private var whisperContext: com.whispercpp.whisper.WhisperContext? = null
private var whisperContext: WhisperContext? = null
private var mediaPlayer: MediaPlayer? = null
private var recordedFile: File? = null
@ -47,7 +47,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
}
private suspend fun printSystemInfo() {
printMessage(String.format("System Info: %s\n", com.whispercpp.whisper.WhisperContext.getSystemInfo()))
printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()))
}
private suspend fun loadData() {
@ -78,7 +78,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
printMessage("Loading model...\n")
val models = application.assets.list("models/")
if (models != null) {
whisperContext = com.whispercpp.whisper.WhisperContext.createContextFromAsset(application.assets, "models/" + models[0])
whisperContext = WhisperContext.createContextFromAsset(application.assets, "models/" + models[0])
printMessage("Loaded model ${models[0]}.\n")
}

View File

@ -1,4 +1,4 @@
package com.whispercpp.whisper
package com.whispercppdemo.whisper
import android.content.res.AssetManager
import android.os.Build

View File

@ -1,4 +1,4 @@
package com.whispercpp.whisper
package com.whispercppdemo.whisper
import android.util.Log
import java.io.BufferedReader

View File

@ -127,11 +127,11 @@ static struct whisper_context *whisper_init_from_asset(
.close = &asset_close
};
return whisper_init_with_params(&loader, whisper_context_default_params());
return whisper_init(&loader);
}
JNIEXPORT jlong JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_initContextFromAsset(
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromAsset(
JNIEnv *env, jobject thiz, jobject assetManager, jstring asset_path_str) {
UNUSED(thiz);
struct whisper_context *context = NULL;
@ -142,18 +142,18 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_initContextFromAsset(
}
JNIEXPORT jlong JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_initContext(
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext(
JNIEnv *env, jobject thiz, jstring model_path_str) {
UNUSED(thiz);
struct whisper_context *context = NULL;
const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL);
context = whisper_init_from_file_with_params(model_path_chars, whisper_context_default_params());
context = whisper_init_from_file(model_path_chars);
(*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars);
return (jlong) context;
}
JNIEXPORT void JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_freeContext(
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(
JNIEnv *env, jobject thiz, jlong context_ptr) {
UNUSED(env);
UNUSED(thiz);
@ -162,7 +162,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_freeContext(
}
JNIEXPORT void JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_fullTranscribe(
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
JNIEnv *env, jobject thiz, jlong context_ptr, jint num_threads, jfloatArray audio_data) {
UNUSED(thiz);
struct whisper_context *context = (struct whisper_context *) context_ptr;
@ -194,7 +194,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_fullTranscribe(
}
JNIEXPORT jint JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegmentCount(
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegmentCount(
JNIEnv *env, jobject thiz, jlong context_ptr) {
UNUSED(env);
UNUSED(thiz);
@ -203,7 +203,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegmentCount(
}
JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegment(
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
JNIEnv *env, jobject thiz, jlong context_ptr, jint index) {
UNUSED(thiz);
struct whisper_context *context = (struct whisper_context *) context_ptr;
@ -213,7 +213,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegment(
}
JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_getSystemInfo(
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
JNIEnv *env, jobject thiz
) {
UNUSED(thiz);
@ -223,7 +223,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_getSystemInfo(
}
JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
jint n_threads) {
UNUSED(thiz);
const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
@ -231,7 +231,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, j
}
JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
jint n_threads) {
UNUSED(thiz);
const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);

View File

@ -1 +0,0 @@
/build

View File

@ -1,51 +0,0 @@
plugins {
id 'com.android.library'
id 'org.jetbrains.kotlin.android'
}
android {
namespace 'com.whispercpp'
compileSdk 34
defaultConfig {
minSdk 26
targetSdk 34
versionCode 1
versionName "1.0"
ndk {
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
}
}
buildTypes {
release {
minifyEnabled false
}
}
compileOptions {
sourceCompatibility JavaVersion.VERSION_1_8
targetCompatibility JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = '1.8'
}
ndkVersion "25.2.9519653"
externalNativeBuild {
cmake {
path = file("src/main/jni/whisper/CMakeLists.txt")
}
}
packagingOptions {
resources {
excludes += '/META-INF/{AL2.0,LGPL2.1}'
}
}
}
dependencies {
implementation 'androidx.core:core-ktx:1.9.0'
implementation 'androidx.appcompat:appcompat:1.6.1'
implementation 'com.google.android.material:material:1.8.0'
}

View File

@ -1,4 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View File

@ -14,4 +14,3 @@ dependencyResolutionManagement {
}
rootProject.name = "WhisperCppDemo"
include ':app'
include ':lib'

View File

@ -17,8 +17,8 @@
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL"; }; };
18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL"; }; };
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML"; }; };
18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE"; }; };
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };

View File

@ -61,13 +61,7 @@ void AudioInputCallback(void * inUserData,
NSLog(@"Loading model from %@", modelPath);
// create ggml context
struct whisper_context_params cparams = whisper_context_default_params();
#if TARGET_OS_SIMULATOR
cparams.use_gpu = false;
NSLog(@"Running on simulator, using CPU");
#endif
stateInp.ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);
stateInp.ctx = whisper_init_from_file([modelPath UTF8String]);
// check if the model was loaded successfully
if (stateInp.ctx == NULL) {

View File

@ -55,12 +55,7 @@ actor WhisperContext {
}
static func createContext(path: String) throws -> WhisperContext {
var params = whisper_context_default_params()
#if targetEnvironment(simulator)
params.use_gpu = false
print("Running on the simulator, using CPU")
#endif
let context = whisper_init_from_file_with_params(path, params)
let context = whisper_init_from_file(path)
if let context {
return WhisperContext(context: context)
} else {

View File

@ -16,15 +16,13 @@
0AAC5D9D29539CCF003032C3 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9C29539CCF003032C3 /* ContentView.swift */; };
0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; };
0AAC5DA329539CD0003032C3 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5DA229539CD0003032C3 /* Preview Assets.xcassets */; };
0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC729539EB0003032C3 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_METAL -Wno-shorten-64-to-32"; }; };
0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC929539EB0003032C3 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -Wno-shorten-64-to-32"; }; };
0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC729539EB0003032C3 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-Wno-shorten-64-to-32"; }; };
0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC929539EB0003032C3 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -Wno-shorten-64-to-32"; }; };
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; };
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; };
18ABE1522AF555FA0044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE14C2AF555FA0044A204 /* ggml-backend.c */; };
18ABE1532AF555FA0044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1512AF555FA0044A204 /* ggml-quants.c */; };
18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 18AED47F2AB21F2B009D854F /* ggml-alloc.c */; };
7FCB08262ACFA3A400AF3530 /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FCB08252ACFA3A400AF3530 /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
7FCB08282ACFA48500AF3530 /* ggml-metal.metal in Sources */ = {isa = PBXBuildFile; fileRef = 7FCB08272ACFA48500AF3530 /* ggml-metal.metal */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
@ -54,9 +52,6 @@
18ABE1512AF555FA0044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "ggml-quants.c"; sourceTree = "<group>"; };
18AED47F2AB21F2B009D854F /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "ggml-alloc.c"; sourceTree = "<group>"; };
18AED4802AB21F2B009D854F /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-alloc.h"; sourceTree = "<group>"; };
7FCB081E2ACFA04400AF3530 /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-metal.h"; sourceTree = "<group>"; };
7FCB08252ACFA3A400AF3530 /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "ggml-metal.m"; sourceTree = "<group>"; };
7FCB08272ACFA48500AF3530 /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = "ggml-metal.metal"; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -140,9 +135,6 @@
0AAC5DC529539E89003032C3 /* whisper.cpp */ = {
isa = PBXGroup;
children = (
7FCB08272ACFA48500AF3530 /* ggml-metal.metal */,
7FCB081E2ACFA04400AF3530 /* ggml-metal.h */,
7FCB08252ACFA3A400AF3530 /* ggml-metal.m */,
18ABE14E2AF555FA0044A204 /* ggml-backend-impl.h */,
18ABE14C2AF555FA0044A204 /* ggml-backend.c */,
18ABE14D2AF555FA0044A204 /* ggml-backend.h */,
@ -266,12 +258,10 @@
0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */,
18ABE1532AF555FA0044A204 /* ggml-quants.c in Sources */,
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */,
7FCB08282ACFA48500AF3530 /* ggml-metal.metal in Sources */,
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */,
0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */,
0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */,
0AA7514E2953D958001EE061 /* Recorder.swift in Sources */,
7FCB08262ACFA3A400AF3530 /* ggml-metal.m in Sources */,
18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */,
18ABE1522AF555FA0044A204 /* ggml-backend.c in Sources */,
);

View File

@ -24,7 +24,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
g_contexts[i] = whisper_init_from_file(path_model.c_str());
if (g_contexts[i] != nullptr) {
return i + 1;
} else {

View File

@ -1,6 +1,6 @@
#!/bin/bash
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
for model in "${models[@]}"; do
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/

View File

@ -1030,7 +1030,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
@ -1342,7 +1342,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0];
[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
const int64_t nrows = ggml_nrows(src0);
@ -1361,7 +1361,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
const int64_t nrows = ggml_nrows(src0);

21
ggml.c
View File

@ -143,6 +143,12 @@ void ggml_print_backtrace(void) {
}
#endif
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
@ -271,12 +277,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
// floating point type used to accumulate sums
typedef double ggml_float;
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
//
// global data
//
@ -16391,9 +16391,6 @@ struct ggml_compute_state_shared {
// synchronization primitives
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
void * abort_callback_data;
};
struct ggml_compute_state {
@ -16744,6 +16741,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
node_n = atomic_load(&state->shared->node_n);
if (node_n != last) break;
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
break;
}
};
}
@ -17037,8 +17038,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
/*.n_threads =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.abort_callback =*/ NULL,
/*.abort_callback_data =*/ NULL,
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);

View File

@ -50,8 +50,7 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large-v2 | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| large | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
| large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Model files for testing purposes

View File

@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
# Ported from models/convert-whisper-to-coreml.py
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args()
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
raise ValueError("Invalid model name")
pt_target_path = f"models/hf-{args.model_name}.pt"

View File

@ -228,7 +228,7 @@ with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
# for backwards compatibility, also check for older hf_transformers format tokenizer files
# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
multilingual = hparams["n_vocab"] >= 51865
multilingual = hparams["n_vocab"] == 51865
tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
tokenizer_type = "tiktoken"
if not tokenizer.is_file():

View File

@ -194,7 +194,7 @@ class TextDecoderANE(TextDecoder):
x = x.permute(0,2,3,1).squeeze(0)
# ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
if self.token_embedding.weight.shape[0] >= 51865:
if self.token_embedding.weight.shape[0] == 51865:
# split in 11 chunks - 4715 each
splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args()
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
raise ValueError("Invalid model name")
whisper = load_model(args.model).cpu()

View File

@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
args = parser.parse_args()
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
raise ValueError("Invalid model name")
whisper = load_model(args.model).cpu()

View File

@ -19,7 +19,7 @@ function get_script_path() {
models_path="$(get_script_path)"
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {

View File

@ -8,7 +8,7 @@ popd
set argc=0
for %%x in (%*) do set /A argc+=1
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
if %argc% neq 1 (
echo.
@ -57,8 +57,8 @@ goto :eof
:list_models
echo.
echo Available models:
(for %%a in (%models%) do (
echo %%a
(for %%a in (%models%) do (
echo %%a
))
echo.
exit /b

View File

@ -41,7 +41,6 @@ models=(
"medium-q5_0"
"medium.en-q5_0"
"large-v1"
"large-v2"
"large"
"large-q5_0"
)

View File

@ -19,7 +19,7 @@
cd `dirname $0`
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {

View File

@ -193,15 +193,6 @@ enum e_model {
MODEL_LARGE,
};
static const std::map<e_model, std::string> g_model_name = {
{ MODEL_UNKNOWN, "unknown" },
{ MODEL_TINY, "tiny" },
{ MODEL_BASE, "base" },
{ MODEL_SMALL, "small" },
{ MODEL_MEDIUM, "medium" },
{ MODEL_LARGE, "large" },
};
static const std::map<std::string, std::pair<int, std::string>> g_lang = {
{ "en", { 0, "english", } },
{ "zh", { 1, "chinese", } },
@ -302,7 +293,6 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
{ "ba", { 96, "bashkir", } },
{ "jw", { 97, "javanese", } },
{ "su", { 98, "sundanese", } },
{ "yue", { 99, "cantonese", } },
};
static const size_t MB = 1ull*1024*1024;
@ -412,11 +402,7 @@ struct whisper_vocab {
id token_beg = 50363; // begin timestamps
bool is_multilingual() const {
return n_vocab >= 51865;
}
int num_languages() const {
return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
return n_vocab == 51865;
}
};
@ -750,7 +736,7 @@ struct whisper_state {
int lang_id = 0; // english by default
std::string path_model; // populated by whisper_init_from_file_with_params()
std::string path_model; // populated by whisper_init_from_file()
#ifdef WHISPER_USE_COREML
whisper_coreml_context * ctx_coreml = nullptr;
#endif
@ -784,8 +770,7 @@ struct whisper_context {
whisper_vocab vocab;
whisper_state * state = nullptr;
std::string path_model; // populated by whisper_init_from_file_with_params()
whisper_context_params params;
std::string path_model; // populated by whisper_init_from_file()
};
static void whisper_default_log(const char * text) {
@ -936,8 +921,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
assert(hparams.n_text_state == hparams.n_audio_state);
std::string mver = "";
if (hparams.n_audio_layer == 4) {
model.type = e_model::MODEL_TINY;
}
@ -956,10 +939,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
if (hparams.n_audio_layer == 32) {
model.type = e_model::MODEL_LARGE;
if (hparams.n_vocab == 51866) {
mver = " v3";
}
}
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
@ -988,7 +967,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
log("%s: n_mels = %d\n", __func__, hparams.n_mels);
log("%s: ftype = %d\n", __func__, model.hparams.ftype);
log("%s: qntvr = %d\n", __func__, qntvr);
log("%s: type = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
log("%s: type = %d\n", __func__, model.type);
// print memory requirements
{
@ -1059,17 +1038,13 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
if (vocab.is_multilingual()) {
vocab.token_eot++;
vocab.token_sot++;
// account for variable number of language tokens
const int dt = vocab.num_languages() - 98;
vocab.token_translate += dt;
vocab.token_transcribe += dt;
vocab.token_solm += dt;
vocab.token_prev += dt;
vocab.token_nosp += dt;
vocab.token_not += dt;
vocab.token_beg += dt;
vocab.token_translate++;
vocab.token_transcribe++;
vocab.token_solm++;
vocab.token_prev++;
vocab.token_nosp++;
vocab.token_not++;
vocab.token_beg++;
}
if (n_vocab < model.hparams.n_vocab) {
@ -1098,8 +1073,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
vocab.id_to_token[i] = word;
}
}
log("%s: n_langs = %d\n", __func__, vocab.num_languages());
}
size_t ctx_size = 0;
@ -2957,64 +2930,59 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
}
#ifdef GGML_USE_METAL
if (ctx->params.use_gpu) {
state->ctx_metal = ggml_metal_init(1);
if (!state->ctx_metal) {
log("%s: ggml_metal_init() failed\n", __func__);
delete state;
return nullptr;
}
state->ctx_metal = ggml_metal_init(1);
if (!state->ctx_metal) {
log("%s: ggml_metal_init() failed\n", __func__);
delete state;
return nullptr;
}
if (state->ctx_metal) {
log("%s: Metal context initialized\n", __func__);
log("%s: Metal context initialized\n", __func__);
// this allocates all Metal resources and memory buffers
// this allocates all Metal resources and memory buffers
void * data_ptr = NULL;
size_t data_size = 0;
void * data_ptr = NULL;
size_t data_size = 0;
// TODO: add mmap support
//if (params.use_mmap) {
// data_ptr = ctx->model.mapping->addr;
// data_size = ctx->model.mapping->size;
//} else {
// data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
// data_size = ggml_get_mem_size (ctx->model.ctx);
//}
// TODO: add mmap support
//if (params.use_mmap) {
// data_ptr = ctx->model.mapping->addr;
// data_size = ctx->model.mapping->size;
//} else {
// data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
// data_size = ggml_get_mem_size (ctx->model.ctx);
//}
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size = ggml_get_mem_size (ctx->model.ctx);
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size = ggml_get_mem_size (ctx->model.ctx);
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
log("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
log("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
#define WHISPER_METAL_CHECK_BUF(result) \
if (!(result)) { \
log("%s: failed to add metal buffer\n", __func__); \
delete state; \
return nullptr; \
}
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data", data_ptr, data_size, max_size));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_conv", state->alloc_conv.meta.data(), state->alloc_conv.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_encode", state->alloc_encode.meta.data(), state->alloc_encode.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_cross", state->alloc_cross.meta.data(), state->alloc_cross.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_decode", state->alloc_decode.meta.data(), state->alloc_decode.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_conv", state->alloc_conv.data.data(), state->alloc_conv.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_encode", state->alloc_encode.data.data(), state->alloc_encode.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_cross", state->alloc_cross.data.data(), state->alloc_cross.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_decode", state->alloc_decode.data.data(), state->alloc_decode.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_cross", state->kv_cross.buf.data(), state->kv_cross.buf.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_self_0", state->decoders[0].kv_self.buf.data(), state->decoders[0].kv_self.buf.size(), 0));
#undef WHISPER_METAL_CHECK_BUF
if (!(result)) { \
log("%s: failed to add metal buffer\n", __func__); \
delete state; \
return nullptr; \
}
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data", data_ptr, data_size, max_size));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_conv", state->alloc_conv.meta.data(), state->alloc_conv.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_encode", state->alloc_encode.meta.data(), state->alloc_encode.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_cross", state->alloc_cross.meta.data(), state->alloc_cross.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_decode", state->alloc_decode.meta.data(), state->alloc_decode.meta.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_conv", state->alloc_conv.data.data(), state->alloc_conv.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_encode", state->alloc_encode.data.data(), state->alloc_encode.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_cross", state->alloc_cross.data.data(), state->alloc_cross.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_decode", state->alloc_decode.data.data(), state->alloc_decode.data.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_cross", state->kv_cross.buf.data(), state->kv_cross.buf.size(), 0));
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_self_0", state->decoders[0].kv_self.buf.data(), state->decoders[0].kv_self.buf.size(), 0));
#undef WHISPER_METAL_CHECK_BUF
#endif
state->rng = std::mt19937(0);
@ -3071,14 +3039,7 @@ int whisper_ctx_init_openvino_encoder(
#endif
}
struct whisper_context_params whisper_context_default_params() {
struct whisper_context_params result = {
/*.use_gpu =*/ true,
};
return result;
}
struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params) {
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
log("%s: loading model from '%s'\n", __func__, path_model);
auto fin = std::ifstream(path_model, std::ios::binary);
@ -3107,7 +3068,7 @@ struct whisper_context * whisper_init_from_file_with_params_no_state(const char
fin->close();
};
auto ctx = whisper_init_with_params_no_state(&loader, params);
auto ctx = whisper_init_no_state(&loader);
if (ctx) {
ctx->path_model = path_model;
@ -3116,7 +3077,7 @@ struct whisper_context * whisper_init_from_file_with_params_no_state(const char
return ctx;
}
struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params) {
struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
struct buf_context {
uint8_t* buffer;
size_t size;
@ -3150,14 +3111,13 @@ struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * bu
loader.close = [](void * /*ctx*/) { };
return whisper_init_with_params_no_state(&loader, params);
return whisper_init_no_state(&loader);
}
struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params) {
struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
ggml_time_init();
whisper_context * ctx = new whisper_context;
ctx->params = params;
if (!whisper_model_load(loader, *ctx)) {
loader->close(loader->context);
@ -3171,73 +3131,49 @@ struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_
return ctx;
}
struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params) {
whisper_context * ctx = whisper_init_from_file_with_params_no_state(path_model, params);
if (!ctx) {
return nullptr;
}
ctx->state = whisper_init_state(ctx);
if (!ctx->state) {
whisper_free(ctx);
return nullptr;
}
return ctx;
}
struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params) {
whisper_context * ctx = whisper_init_from_buffer_with_params_no_state(buffer, buffer_size, params);
if (!ctx) {
return nullptr;
}
ctx->state = whisper_init_state(ctx);
if (!ctx->state) {
whisper_free(ctx);
return nullptr;
}
return ctx;
}
struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params) {
whisper_context * ctx = whisper_init_with_params_no_state(loader, params);
if (!ctx) {
return nullptr;
}
ctx->state = whisper_init_state(ctx);
if (!ctx->state) {
whisper_free(ctx);
return nullptr;
}
return ctx;
}
struct whisper_context * whisper_init_from_file(const char * path_model) {
return whisper_init_from_file_with_params(path_model, whisper_context_default_params());
whisper_context * ctx = whisper_init_from_file_no_state(path_model);
if (!ctx) {
return nullptr;
}
ctx->state = whisper_init_state(ctx);
if (!ctx->state) {
whisper_free(ctx);
return nullptr;
}
return ctx;
}
struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
return whisper_init_from_buffer_with_params(buffer, buffer_size, whisper_context_default_params());
whisper_context * ctx = whisper_init_from_buffer_no_state(buffer, buffer_size);
if (!ctx) {
return nullptr;
}
ctx->state = whisper_init_state(ctx);
if (!ctx->state) {
whisper_free(ctx);
return nullptr;
}
return ctx;
}
struct whisper_context * whisper_init(struct whisper_model_loader * loader) {
return whisper_init_with_params(loader, whisper_context_default_params());
}
whisper_context * ctx = whisper_init_no_state(loader);
if (!ctx) {
return nullptr;
}
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
return whisper_init_from_file_with_params_no_state(path_model, whisper_context_default_params());
}
ctx->state = whisper_init_state(ctx);
if (!ctx->state) {
whisper_free(ctx);
return nullptr;
}
struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
return whisper_init_from_buffer_with_params_no_state(buffer, buffer_size, whisper_context_default_params());
}
struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
return whisper_init_with_params_no_state(loader, whisper_context_default_params());
return ctx;
}
void whisper_free_state(struct whisper_state * state)
@ -3294,12 +3230,6 @@ void whisper_free(struct whisper_context * ctx) {
}
}
void whisper_free_context_params(struct whisper_context_params * params) {
if (params) {
delete params;
}
}
void whisper_free_params(struct whisper_full_params * params) {
if (params) {
delete params;
@ -3307,7 +3237,7 @@ void whisper_free_params(struct whisper_full_params * params) {
}
int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
log("%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
@ -3321,7 +3251,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
log("%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
@ -3344,13 +3274,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
// TODO
int whisper_set_mel_with_state(
struct whisper_context * ctx,
struct whisper_context * /*ctx*/,
struct whisper_state * state,
const float * data,
int n_len,
int n_mel) {
if (n_mel != ctx->model.filters.n_mel) {
log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
if (n_mel != WHISPER_N_MEL) {
log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
return -1;
}
@ -3714,7 +3644,6 @@ void whisper_print_timings(struct whisper_context * ctx) {
}
void whisper_reset_timings(struct whisper_context * ctx) {
ctx->t_start_us = ggml_time_us();
if (ctx->state != nullptr) {
ctx->state->t_sample_us = 0;
ctx->state->t_encode_us = 0;
@ -3769,14 +3698,6 @@ const char * whisper_print_system_info(void) {
////////////////////////////////////////////////////////////////////////////
struct whisper_context_params * whisper_context_default_params_by_ref() {
struct whisper_context_params params = whisper_context_default_params();
struct whisper_context_params* result = new whisper_context_params();
*result = params;
return result;
}
struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy) {
struct whisper_full_params params = whisper_full_default_params(strategy);
@ -3853,9 +3774,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.encoder_begin_callback =*/ nullptr,
/*.encoder_begin_callback_user_data =*/ nullptr,
/*.abort_callback =*/ nullptr,
/*.abort_callback_user_data =*/ nullptr,
/*.logits_filter_callback =*/ nullptr,
/*.logits_filter_callback_user_data =*/ nullptr,
};
@ -4589,19 +4507,17 @@ int whisper_full_with_state(
// TODO: not very clean - look for a better way and potentially merging with the init of decoder 0
#ifdef GGML_USE_METAL
if (state->ctx_metal) {
#define WHISPER_METAL_CHECK_BUF(result) \
if (!(result)) { \
log("%s: failed to add metal buffer\n", __func__); \
return 0; \
}
const std::string kv_name = "kv_self_" + std::to_string(j);
auto & kv_self = decoder.kv_self;
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, kv_name.c_str(), kv_self.buf.data(), kv_self.buf.size(), 0));
#undef WHISPER_METAL_CHECK_BUF
if (!(result)) { \
log("%s: failed to add metal buffer\n", __func__); \
return 0; \
}
const std::string kv_name = "kv_self_" + std::to_string(j);
auto & kv_self = decoder.kv_self;
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, kv_name.c_str(), kv_self.buf.data(), kv_self.buf.size(), 0));
#undef WHISPER_METAL_CHECK_BUF
#endif
}
}

View File

@ -5,14 +5,6 @@
#include <stdint.h>
#include <stdbool.h>
#ifdef __GNUC__
# define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define WHISPER_DEPRECATED(func, hint) func
#endif
#ifdef WHISPER_SHARED
# ifdef _WIN32
# ifdef WHISPER_BUILD
@ -29,6 +21,7 @@
#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT 400
#define WHISPER_N_MEL 80
#define WHISPER_HOP_LENGTH 160
#define WHISPER_CHUNK_SIZE 30
@ -78,10 +71,6 @@ extern "C" {
typedef int whisper_token;
struct whisper_context_params {
bool use_gpu;
};
typedef struct whisper_token_data {
whisper_token id; // token id
whisper_token tid; // forced timestamp token id
@ -110,40 +99,15 @@ extern "C" {
// Various functions for loading a ggml whisper model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
WHISPER_API struct whisper_context * whisper_init_from_file_with_params(const char * path_model, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_with_params(struct whisper_model_loader * loader, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
// These are the same as the above, but the internal state of the context is not allocated automatically
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params);
WHISPER_API struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_loader * loader, struct whisper_context_params params);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
"use whisper_init_from_file_with_params instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
"use whisper_init_from_buffer_with_params instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
"use whisper_init_with_params instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
"use whisper_init_from_file_with_params_no_state instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
"use whisper_init_from_buffer_with_params_no_state instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
"use whisper_init_with_params_no_state instead"
);
WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
@ -168,7 +132,6 @@ extern "C" {
WHISPER_API void whisper_free (struct whisper_context * ctx);
WHISPER_API void whisper_free_state(struct whisper_state * state);
WHISPER_API void whisper_free_params(struct whisper_full_params * params);
WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
// Convert RAW PCM audio to log mel spectrogram.
// The resulting spectrogram is stored inside the default state of the provided whisper context.
@ -479,9 +442,7 @@ extern "C" {
void * logits_filter_callback_user_data;
};
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
WHISPER_API struct whisper_context_params whisper_context_default_params(void);
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);