Compare commits

...

18 Commits

Author SHA1 Message Date
c456ca476b llama podcast 2023-04-01 13:13:27 +03:00
0a2d1210bc whisper : add progress callback (#600) 2023-03-30 20:29:29 +03:00
859ffc994e misc : typo (#688) 2023-03-30 07:51:33 +03:00
5e6e2187a3 talk-llama : fixing usage message for talk-llama (#687)
"-ml" instead of "-mg" for specifying the llama file
2023-03-30 00:10:20 +03:00
a7f1f33715 main : add <cstring> header 2023-03-29 23:59:45 +03:00
86ecfc6333 whisper.addon : fixed test to new async implementation (#686)
* fixed blocking code on node addon

* modify the example to run async

* format

* added logic to see the whisper output

* added logic to see the whisper output

* removed extra function for more clean example

* fixed whisper test to new async implementation
2023-03-29 23:59:17 +03:00
18e6fb0287 models : handle spaces and special characters in shell script paths (#677)
This commit modifies the `get_script_path` function to correctly handle
spaces and special characters in directory paths. The fix involves adding
double quotes around variables and commands where needed to ensure proper
parsing of paths with spaces and special characters.
2023-03-29 23:38:33 +03:00
0f759f125d main : fix typo in JSON output (#648)
* typo in JSON output

* fix double quotes in JSON output
2023-03-29 23:26:39 +03:00
eefed45e37 whisper : add initial_prompt param (#645) 2023-03-29 23:23:23 +03:00
aac1710afb make : 32-bit ARM flags (#486)
* issue #470 - working 32-bit ARM

* Update Makefile

* Update Makefile

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-03-29 23:11:35 +03:00
21c1e6afc5 whisper.swiftui : update README.md (#682)
- Slight tweaks to README for improved comprehension.
2023-03-29 23:04:38 +03:00
a47e812a54 talk-llama : add alpaca support (#668) 2023-03-29 23:01:14 +03:00
42c6855103 whisper : bump "large" scratch buffer even mode (close #671) 2023-03-28 10:50:49 +03:00
0be9cd3497 whisper : increase scratch buffers after recent change (#671)
Should fix the error:

ggml_new_tensor_impl: not enough space in the scratch memory
2023-03-28 10:36:16 +03:00
e5c197d8aa talk-llama : add discussion link 2023-03-28 10:11:34 +03:00
7cd1d3bc34 talk-llama : try to fix windows build .. 2023-03-27 22:40:59 +03:00
82637b8e9f readme : add talk-llama example to the table 2023-03-27 21:02:35 +03:00
4a0deb8b1e talk-llama : add new example + sync ggml from llama.cpp (#664)
* talk-llama : talk with LLaMA AI

* talk.llama : disable EOS token

* talk-llama : add README instructions

* ggml : fix build in debug
2023-03-27 21:00:32 +03:00
28 changed files with 5602 additions and 686 deletions

3
.gitignore vendored
View File

@ -18,6 +18,7 @@ build-sanitize-thread/
/stream
/command
/talk
/talk-llama
/bench
arm_neon.h
@ -32,3 +33,5 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
extra/bench-gg.txt
*.mlmodel*

View File

@ -36,7 +36,7 @@ LDFLAGS =
# ref: https://github.com/ggerganov/whisper.cpp/issues/37
ifneq ($(wildcard /usr/include/musl/*),)
CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
endif
@ -151,12 +151,15 @@ ifneq ($(filter aarch64%,$(UNAME_M)),)
CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
# 32-bit Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
# 32-bit ARM, for example on Armbian or possibly raspbian
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
# 64-bit ARM, use these (TODO: auto-detect 64-bit)
# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
@ -178,7 +181,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main
default: main bench
#
# Build library
@ -197,7 +200,7 @@ libwhisper.so: ggml.o whisper.o
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
clean:
rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
#
# Examples
@ -212,6 +215,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
./main -h
bench: examples/bench/bench.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
@ -221,8 +227,8 @@ command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whi
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
bench: examples/bench/bench.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk-llama $(CC_SDL) $(LDFLAGS)
#
# Audio samples

View File

@ -313,7 +313,7 @@ whisper_print_timings: total time = 32733.52 ms
## Real-time audio input example
This is a naive example of performing real-time inference on audio from your microphone.
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continously.
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```java
@ -500,6 +500,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |

View File

@ -63,4 +63,5 @@ else()
add_subdirectory(command)
add_subdirectory(bench)
add_subdirectory(talk)
add_subdirectory(talk-llama)
endif()

View File

@ -1,15 +1,22 @@
const path = require('path');
const { whisper } = require(path.join(__dirname, '../../../build/Release/whisper-addon'));
const path = require("path");
const { whisper } = require(path.join(
__dirname,
"../../../build/Release/whisper-addon"
));
const { promisify } = require("util");
const whisperAsync = promisify(whisper);
const whisperParamsMock = {
language: 'en',
model: path.join(__dirname, '../../../models/ggml-base.en.bin'),
fname_inp: path.join(__dirname, '../../../samples/jfk.wav'),
language: "en",
model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
};
describe("Run whisper.node", () => {
test("it should receive a non-empty value", async () => {
let result = await whisperAsync(whisperParamsMock);
test("it should receive a non-empty value", () => {
expect(whisper(whisperParamsMock).length).toBeGreaterThan(0);
});
expect(result.length).toBeGreaterThan(0);
});
});

View File

@ -160,22 +160,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
return 3;
}
// initial prompt
std::vector<whisper_token> prompt_tokens;
if (!params.prompt.empty()) {
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
fprintf(stderr, "\n");
fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
fprintf(stderr, "initial tokens: [ ");
for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
fprintf(stderr, "%d ", prompt_tokens[i]);
}
fprintf(stderr, "]\n");
}
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -243,8 +227,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.initial_prompt = params.prompt.c_str();
whisper_print_user_data user_data = { &params, &pcmf32s };

View File

@ -8,6 +8,7 @@
#include <string>
#include <thread>
#include <vector>
#include <cstring>
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
@ -371,6 +372,39 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
return true;
}
char *escape_double_quotes(const char *str) {
if (str == NULL) {
return NULL;
}
size_t escaped_length = strlen(str) + 1;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped_length++;
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
size_t pos = 0;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped[pos++] = '\\';
escaped[pos++] = '"';
} else {
escaped[pos++] = str[i];
}
}
// no need to set zero due to calloc() being used prior
return escaped;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
std::ofstream fout(fname);
int indent = 0;
@ -414,7 +448,9 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
auto value_s = [&](const char *name, const char *val, bool end = false) {
start_value(name);
fout << "\"" << val << (end ? "\"\n" : "\",\n");
char * val_escaped = escape_double_quotes(val);
fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
free(val_escaped);
};
auto end_value = [&](bool end = false) {
@ -455,7 +491,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
value_i("ctx", whisper_model_n_text_ctx(ctx));
value_i("state", whisper_model_n_text_state(ctx));
value_i("head", whisper_model_n_text_head(ctx));
value_i("leyer", whisper_model_n_text_layer(ctx), true);
value_i("layer", whisper_model_n_text_layer(ctx), true);
end_obj();
value_i("mels", whisper_model_n_mels(ctx));
value_i("f16", whisper_model_f16(ctx), true);
@ -477,7 +513,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
start_obj();
start_obj("timestanps");
start_obj("timestamps");
value_s("from", to_timestamp(t0, true).c_str());
value_s("to", to_timestamp(t1, true).c_str(), true);
end_obj();
@ -639,22 +675,6 @@ int main(int argc, char ** argv) {
return 3;
}
// initial prompt
std::vector<whisper_token> prompt_tokens;
if (!params.prompt.empty()) {
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
fprintf(stderr, "\n");
fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
fprintf(stderr, "initial tokens: [ ");
for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
fprintf(stderr, "%d ", prompt_tokens[i]);
}
fprintf(stderr, "]\n");
}
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -718,8 +738,7 @@ int main(int argc, char ** argv) {
wparams.speed_up = params.speed_up;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.initial_prompt = params.prompt.c_str();
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;

2
examples/talk-llama/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
eleven-labs.py
audio.mp3

View File

@ -0,0 +1,16 @@
if (WHISPER_SUPPORT_SDL2)
# talk-llama
set(TARGET talk-llama)
#add_executable(${TARGET} talk-llama.cpp llama.cpp)
#target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
#target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
include(DefaultTargetOptions)
endif ()

View File

@ -0,0 +1,36 @@
# talk-llama
Talk with an LLaMA AI in your terminal
[Demo Talk](https://user-images.githubusercontent.com/1991296/228024237-848f998c-c334-46a6-bef8-3271590da83b.mp4)
## Building
The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
# Build the "talk-llama" executable
make talk-llama
# Run it
./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
```
- The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
- The `-ml` argument specifies the LLaMA model that you would like to use. Read the instructions in https://github.com/ggerganov/llama.cpp for information about how to obtain a `ggml` compatible LLaMA model
## TTS
For best experience, this example needs a TTS tool to convert the generated text responses to voice.
You can use any TTS engine that you would like - simply edit the [speak.sh](speak.sh) script to your needs.
By default, it is configured to use MacOS's `say`, but you can use whatever you wish.
## Discussion
If you have any feedback, please let "us" know in the following discussion: https://github.com/ggerganov/whisper.cpp/discussions/672?converting=1

File diff suppressed because it is too large Load Diff

152
examples/talk-llama/llama.h Normal file
View File

@ -0,0 +1,152 @@
#ifndef LLAMA_H
#define LLAMA_H
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
llama_token id; // token id
float p; // probability of the token
float plog; // log probability of the token
} llama_token_data;
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int seed; // RNG seed, 0 for random
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
};
LLAMA_API struct llama_context_params llama_context_default_params();
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
const char * path_model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// TODO: not great API - very likely to change
// Returns 0 on success
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos();
// TODO: improve the last_n_tokens interface ?
LLAMA_API llama_token llama_sample_top_p_top_k(
struct llama_context * ctx,
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float temp,
float repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,23 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write a text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.
### Response:
{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4}

28
examples/talk-llama/speak.sh Executable file
View File

@ -0,0 +1,28 @@
#!/bin/bash
# Usage:
# speak.sh <voice_id> <text-to-speak>
# espeak
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
# for Mac
if [ "$1" = "0" ]; then
say "$2"
elif [ "$1" = "1" ]; then
say -v "Samantha (Enhanced)" "$2"
elif [ "$1" = "2" ]; then
say -v "Daniel (Enhanced)" "$2"
elif [ "$1" = "3" ]; then
say -v "Veena (Enhanced)" "$2"
fi
# Eleven Labs
#
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
#python3 $script $1 "$2" >/dev/null 2>&1
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1

View File

@ -0,0 +1,703 @@
// Talk with AI
//
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "llama.h"
#include <map>
#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int)add_bos);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t voice_id = 0;
int32_t voice_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 64;
int32_t audio_ctx = 0;
int32_t n_parts_llama = -1;
float vad_thold = 0.4f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool verbose_prompt = false;
std::string name_ni = "Georgi"; // natural intelligence
std::string name_ai = "LLaMA"; // artificial intelligence
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
std::string speak = "./examples/talk/speak.sh";
std::string prompt = "";
std::string fname_out;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vid" || arg == "--voice-id") { params.voice_id = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-nni" || arg == "--name-ni") { params.name_ni = argv[++i]; }
else if (arg == "-nai" || arg == "--name-ai") { params.name_ai = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
}
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vid N, --voice-id N [%-7d] voice ID\n", params.voice_id);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -nni NAME,--name-ni NAME [%-7s] natural intelligence name\n", params.name_ni.c_str());
fprintf(stderr, " -nai NAME,--name-ai NAME [%-7s] artificial intelligence name\n", params.name_ai.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama);
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
const std::string prompt_text,
float & prob,
int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
std::vector<whisper_token> prompt_tokens;
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size()));
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = 2;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
static int iter = params.voice_id;
std::this_thread::sleep_for(std::chrono::milliseconds(100*iter));
iter = (iter + 1) % 4;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
const std::vector<std::string> k_participants = {
"LLaMA",
"GGaMA",
"SSaMA",
"RRaMA",
};
// homophones
const std::map<std::string, std::vector<std::string>> k_homophones = {
{ "LLaMA", { "llama", "Llama", "LLAMA", }, },
{ "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA", }, },
{ "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA", }, },
{ "RRaMA", { "rama", "Rama", "RAMA", "ramma", "Ramma", "RAMMA", }, },
};
const std::string k_prompt_whisper = R"(A conversation between {1}, {10}, {11}, {12} and {13}.)";
const std::map<std::string, std::string> k_prompt = {
{
k_participants.at(0),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{10} is smart, objective, honest and kind. Never fails to give a meaningful and insightful answer and opinion.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {10}! What is your opinion on the current state of the world?
{10}{4} Great question {1}! I think we live in a very interesting time.
There are many things to be concerned about, but also many things to be optimistic about.
{1}{4} What advice would you give to a young person who is just starting out in life?
{10}{4} I would tell them to be patient and to not be afraid to fail.
It is important to learn from your mistakes and to keep trying.
{1}{4})"
},
{
k_participants.at(1),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{11} has critical thinking skills, is very knowledgeable and is a good listener. He is very humble and never arrogant.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {11}! What is your opinion on the current state of the world?
{11}{4} The world is about to experience a major change. We are on the verge of a new era.
{1}{4} What advice would you give to a young person who is just starting out in life?
{11}{4} My advice would be to be open minded and to be willing to learn from others.
{1}{4})"
},
{
k_participants.at(2),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{12} has strong leadership skills, strategic thinking, and innovative ideas. Has the ability to mentor and support young people.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {12}! What is your opinion on the current state of the world?
{12}{4} Our future is bright. We are living in a time of great opportunity.
{1}{4} What advice would you give to a young person who is just starting out in life?
{12}{4} I would tell them to be brave and to be willing to take risks.
{1}{4})"
},
{
k_participants.at(3),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{13} is rude, arrogant, and has a bad attitude. He is very opinionated and never listens to others.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {13}! What is your opinion on the current state of the world?
{13}{4} The world is a terrible place. It is full of evil and corruption.
{1}{4} What advice would you give to a young person who is just starting out in life?
{13}{4} I would tell them to be selfish and to never trust anyone.
{1}{4})"
},
};
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
// llama init
auto lparams = llama_context_default_params();
// tune these to your liking
lparams.n_ctx = 512;
lparams.seed = 1;
lparams.f16_kv = true;
lparams.n_parts = params.n_parts_llama;
struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx_wsp)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
int n_iter = 0;
bool is_running = true;
bool force_speak = false;
float prob0 = 0.0f;
const std::string chat_symb = ":";
const std::string name_ni = params.name_ni;
const std::string name_ai = params.name_ai;
// the participant that was referenced last
std::string name_ref = name_ni;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
std::string prompt_whisper = k_prompt_whisper;
prompt_whisper = ::replace(prompt_whisper, "{1}", name_ni);
prompt_whisper = ::replace(prompt_whisper, "{10}", k_participants.at(0));
prompt_whisper = ::replace(prompt_whisper, "{11}", k_participants.at(1));
prompt_whisper = ::replace(prompt_whisper, "{12}", k_participants.at(2));
prompt_whisper = ::replace(prompt_whisper, "{13}", k_participants.at(3));
// construct the initial prompt for LLaMA inference
std::string prompt_llama = params.prompt.empty() ? k_prompt.find(name_ai)->second : params.prompt;
// need to have leading ' '
prompt_llama.insert(0, 1, ' ');
prompt_llama = ::replace(prompt_llama, "{1}", name_ni);
prompt_llama = ::replace(prompt_llama, "{10}", k_participants.at(0));
prompt_llama = ::replace(prompt_llama, "{11}", k_participants.at(1));
prompt_llama = ::replace(prompt_llama, "{12}", k_participants.at(2));
prompt_llama = ::replace(prompt_llama, "{13}", k_participants.at(3));
{
// get date string
std::string date_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%d/%m/%Y", now);
date_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{1}", date_str);
}
{
// get time string
std::string time_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%H:%M", now);
time_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{2}", time_str);
}
{
// get year string
std::string year_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%Y", now);
year_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{3}", year_str);
}
prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
// evaluate the initial prompt
auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
printf("\n");
printf("%s : initializing - please wait ...\n", __func__);
if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
if (params.verbose_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_whisper.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_llama.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fflush(stdout);
}
printf("%s : done! start speaking in the microphone\n", __func__);
printf("\n");
printf("%s%s", name_ni.c_str(), chat_symb.c_str());
fflush(stdout);
// clear audio buffer
audio.clear();
// text inference variables
const int voice_id = params.voice_id;
const int n_keep = embd_inp.size();
const int n_ctx = llama_n_ctx(ctx_llama);
int n_past = n_keep;
int n_prev = 64; // TODO arg
std::vector<llama_token> embd;
// reverse prompts for detecting when it's time to stop speaking
std::vector<std::string> antiprompts = {
name_ni + chat_symb,
};
for (const auto & p : k_participants) {
antiprompts.push_back(p + chat_symb);
}
std::string text_heard_all;
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
int64_t t_ms = 0;
{
audio.get(15000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
//fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
audio.get(params.voice_ms, pcmf32_cur);
std::string text_heard;
if (!force_speak) {
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
}
// remove text between brackets using regex
{
std::regex re("\\[.*?\\]");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove text between brackets using regex
{
std::regex re("\\(.*?\\)");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
// take first line
text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
// remove leading and trailing whitespace
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
if (text_heard.empty() || tokens.empty() || force_speak) {
//fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
audio.clear();
continue;
}
force_speak = false;
if (text_heard[0] != ' ') {
text_heard.insert(0, 1, ' ');
}
// replace homophones
for (const auto & homophone : k_homophones) {
for (const auto & word : homophone.second) {
text_heard = ::replace(text_heard, word, homophone.first);
}
}
// check which participant was mentioned
const auto name_ref_old = name_ref;
for (const auto & participant : k_participants) {
if (participant == name_ref) {
continue;
}
if (text_heard.find(participant) != std::string::npos) {
name_ref = participant;
break;
}
}
if (name_ref == name_ref_old && name_ref != name_ai) {
name_ref = name_ni;
}
text_heard += "\n" + name_ref + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
text_heard_all += text_heard;
// keep only last 100 characters
if (text_heard_all.size() > 100) {
text_heard_all = text_heard_all.substr(text_heard_all.size() - 100);
}
if (name_ref != name_ai) {
} else {
// text inference
bool done = false;
std::string text_to_speak;
embd = ::llama_tokenize(ctx_llama, text_heard_all, false);
text_heard_all.clear();
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.20f;
const float repeat_penalty = 1.0764f;
const int repeat_last_n = 256;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty);
}
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama, id);
printf("%s", llama_token_to_str(ctx_llama, id));
}
// new line
if (id == 13) {
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_str(ctx_llama, embd[0]);
for (const std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
}
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
}
audio.clear();
++n_iter;
}
}
}
audio.pause();
whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp);
llama_print_timings(ctx_llama);
llama_free(ctx_llama);
return 0;
}

View File

@ -325,9 +325,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -528,9 +531,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);

View File

@ -325,9 +325,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = nullptr;
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -528,9 +530,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);

View File

@ -7,7 +7,10 @@
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
# Mac OS "say" command
say "$2"
# Eleven Labs
#

View File

@ -1,15 +1,18 @@
A sample SwiftUI app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.
See also: [whisper.objc](https://github.com/ggerganov/whisper.cpp/tree/master/examples/whisper.objc).
To use:
**Usage**:
1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
2. Add the model to `whisper.swiftui.demo/Resources/models` **via Xcode**.
3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
4. Add the sample audio file to `whisper.swiftui.demo/Resources/samples` **via Xcode**.
5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
**Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory to place resources whilst `whisper.swiftui.demo/Models` is related to actual code.
[^1]: I recommend the tiny, base or small models for running on an iOS device.
[^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)

3120
ggml.c

File diff suppressed because it is too large Load Diff

32
ggml.h
View File

@ -198,6 +198,8 @@ struct ggml_object;
struct ggml_context;
enum ggml_type {
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
@ -226,7 +228,9 @@ enum ggml_op {
GGML_OP_STEP,
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_SILU,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
GGML_OP_MUL_MAT,
@ -312,6 +316,7 @@ struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
void ggml_time_init(void); // call this once at the beginning of the program
@ -326,7 +331,10 @@ void ggml_print_objects(const struct ggml_context * ctx);
int ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes (const struct ggml_tensor * tensor);
size_t ggml_type_size (enum ggml_type type);
int ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
size_t ggml_element_size(const struct ggml_tensor * tensor);
struct ggml_context * ggml_init(struct ggml_init_params params);
@ -336,6 +344,13 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
bool ggml_mlock_supported(void);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
@ -466,12 +481,20 @@ struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx,
struct ggml_tensor * a);
struct ggml_tensor * ggml_silu(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
@ -726,6 +749,13 @@ enum ggml_opt_result ggml_opt(
struct ggml_opt_params params,
struct ggml_tensor * f);
//
// quantization
//
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
//
// system info
//

View File

@ -12,7 +12,7 @@ pfx="resolve/main/ggml"
# get the path of this script
function get_script_path() {
if [ -x "$(command -v realpath)" ]; then
echo "$(dirname $(realpath $0))"
echo "$(dirname "$(realpath "$0")")"
else
local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
echo "$ret"

6
talk-ggama.sh Executable file
View File

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "GGaMA" \
-t 8 -vid 1 --speak ./examples/talk-llama/speak.sh

6
talk-llama.sh Executable file
View File

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "LLaMA" \
-t 8 -vid 0 --speak ./examples/talk-llama/speak.sh

6
talk-rrama.sh Executable file
View File

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "RRaMA" \
-t 8 -vid 3 --speak ./examples/talk-llama/speak.sh

6
talk-ssama.sh Executable file
View File

@ -0,0 +1,6 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "SSaMA" \
-t 8 -vid 2 --speak ./examples/talk-llama/speak.sh

View File

@ -218,14 +218,14 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
{ "su", { 98, "sundanese", } },
};
static const size_t MB = 1024*1024;
static const size_t MB = 1ull*1024*1024;
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
{ MODEL_TINY, 12ull*MB },
{ MODEL_BASE, 15ull*MB },
{ MODEL_SMALL, 23ull*MB },
{ MODEL_MEDIUM, 31ull*MB },
{ MODEL_LARGE, 38ull*MB },
{ MODEL_TINY, 14ull*MB },
{ MODEL_BASE, 18ull*MB },
{ MODEL_SMALL, 28ull*MB },
{ MODEL_MEDIUM, 36ull*MB },
{ MODEL_LARGE, 44ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@ -636,6 +636,8 @@ struct whisper_context {
whisper_model model;
whisper_vocab vocab;
whisper_state * state = nullptr;
std::string path_model; // populated by whisper_init_from_file()
};
template<typename T>
@ -652,9 +654,11 @@ static bool kv_cache_init(
int n_ctx) {
cache.buf.resize(mem_bytes);
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
cache.ctx = ggml_init(params);
@ -686,9 +690,11 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
cache.ctx = ggml_init(params);
@ -1026,9 +1032,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// create the ggml context
{
struct ggml_init_params params;
params.mem_size = wctx.model.buf->size();
params.mem_buffer = wctx.model.buf->data();
struct ggml_init_params params = {
/*.mem_size =*/ wctx.model.buf->size(),
/*.mem_buffer =*/ wctx.model.buf->data(),
/*.no_alloc =*/ false,
};
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -1342,9 +1350,11 @@ static bool whisper_encode_internal(
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -1597,7 +1607,7 @@ static bool whisper_encode_internal(
ggml_repeat(ctx0, layer.mlp_ln_w, cur),
cur),
ggml_repeat(ctx0, layer.mlp_ln_b, cur));
}
}
#ifdef WHISPER_USE_FLASH_FF
wstate.use_buf(ctx0, 0);
@ -1637,7 +1647,7 @@ static bool whisper_encode_internal(
ggml_repeat(ctx0, layer.mlp_1_b, cur),
cur);
#endif
}
}
wstate.use_buf(ctx0, 3);
@ -1741,10 +1751,10 @@ static bool whisper_encode_internal(
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
// ggml_used_mem(ctx0)/1024.0/1024.0,
// wctx.get_buf_max_mem(0)/1024.0/1024.0,
// wctx.get_buf_max_mem(1)/1024.0/1024.0,
// wctx.get_buf_max_mem(2)/1024.0/1024.0,
// wctx.get_buf_max_mem(3)/1024.0/1024.0);
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
ggml_free(ctx0);
@ -1795,9 +1805,11 @@ static bool whisper_decode_internal(
//WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@ -1841,8 +1853,6 @@ static bool whisper_decode_internal(
// self-attention
{
wstate.use_buf(ctx0, 1);
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
layer.attn_q_w,
cur);
@ -1904,8 +1914,6 @@ static bool whisper_decode_internal(
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
wstate.use_buf(ctx0, 0);
//struct ggml_tensor * KQ_scaled =
// ggml_scale(ctx0,
// KQ,
@ -1914,20 +1922,16 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
wstate.use_buf(ctx0, 1);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
wstate.use_buf(ctx0, 0);
struct ggml_tensor * V_trans =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
n_state/n_head, n_head, n_past + N),
1, 2, 0, 3);
wstate.use_buf(ctx0, 1);
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
n_state/n_head, n_head, n_past + N),
1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_state/n_head, n_head));
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
@ -1964,8 +1968,6 @@ static bool whisper_decode_internal(
cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
wstate.use_buf(ctx0, 1);
// cur = ln_0_w*cur + ln_0_b
cur = ggml_add(ctx0,
ggml_mul(ctx0,
@ -1976,8 +1978,6 @@ static bool whisper_decode_internal(
// cross-attention
{
wstate.use_buf(ctx0, 0);
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
layer.cross_attn_q_w,
cur);
@ -2001,12 +2001,13 @@ static bool whisper_decode_internal(
ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
n_state/n_head, n_head, M);
struct ggml_tensor * V_trans = ggml_permute(ctx0, Vcross, 1, 2, 0, 3);
struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
// ------
wstate.use_buf(ctx0, 1);
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
@ -2016,8 +2017,6 @@ static bool whisper_decode_internal(
struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
wstate.use_buf(ctx0, 0);
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
@ -2030,16 +2029,10 @@ static bool whisper_decode_internal(
// no masking for cross-attention
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
wstate.use_buf(ctx0, 1);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
wstate.use_buf(ctx0, 0);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
wstate.use_buf(ctx0, 1);
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_state, N)
@ -2170,10 +2163,10 @@ static bool whisper_decode_internal(
if (N > 1) {
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
// ggml_used_mem(ctx0)/1024.0/1024.0,
// wctx.get_buf_max_mem(0)/1024.0/1024.0,
// wctx.get_buf_max_mem(1)/1024.0/1024.0,
// wctx.get_buf_max_mem(2)/1024.0/1024.0,
// wctx.get_buf_max_mem(3)/1024.0/1024.0);
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
}
ggml_free(ctx0);
@ -2482,7 +2475,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
return nullptr;
@ -2503,7 +2495,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
state->logits_id.reserve(ctx->model.hparams.n_vocab);
@ -2554,7 +2545,13 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
fin->close();
};
return whisper_init_no_state(&loader);
auto ctx = whisper_init_no_state(&loader);
if (ctx) {
ctx->path_model = path_model;
}
return ctx;
}
struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
@ -3134,6 +3131,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
/*.initial_prompt =*/ nullptr,
/*.prompt_tokens =*/ nullptr,
/*.prompt_n_tokens =*/ 0,
@ -3164,6 +3162,9 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback_user_data =*/ nullptr,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.encoder_begin_callback =*/ nullptr,
/*.encoder_begin_callback_user_data =*/ nullptr,
@ -3806,6 +3807,15 @@ int whisper_full_with_state(
prompt_past.clear();
}
// initial prompt
if (!params.prompt_tokens && params.initial_prompt) {
std::vector<whisper_token> prompt_tokens;
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
params.prompt_tokens = prompt_tokens.data();
params.prompt_n_tokens = prompt_tokens.size();
}
// prepend the prompt tokens to the prompt_past
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
// parse tokens from the pointer
@ -3871,6 +3881,10 @@ int whisper_full_with_state(
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
}
}
if (params.progress_callback) {
params.progress_callback(
ctx, ctx->state, progress_prev, params.progress_callback_user_data);
}
// of only 1 second left, then stop
if (seek + 100 >= seek_end) {
@ -4459,6 +4473,9 @@ int whisper_full_parallel(
params_cur.new_segment_callback = nullptr;
params_cur.new_segment_callback_user_data = nullptr;
params_cur.progress_callback = nullptr;
params_cur.progress_callback_user_data = nullptr;
workers[i] = std::thread(whisper_full_with_state, ctx, states[i], std::move(params_cur), samples + start_samples, n_samples_cur);
}
@ -4719,6 +4736,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(gparams);

View File

@ -306,6 +306,9 @@ extern "C" {
// Use the whisper_full_...() functions to obtain the text segments
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
// Progress callback
typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
@ -356,6 +359,7 @@ extern "C" {
// tokens to provide to the whisper decoder as initial prompt
// these are prepended to any existing text context from a previous call
const char * initial_prompt;
const whisper_token * prompt_tokens;
int prompt_n_tokens;
@ -391,6 +395,10 @@ extern "C" {
whisper_new_segment_callback new_segment_callback;
void * new_segment_callback_user_data;
// called on each progress update
whisper_progress_callback progress_callback;
void * progress_callback_user_data;
// called each time before the encoder starts
whisper_encoder_begin_callback encoder_begin_callback;
void * encoder_begin_callback_user_data;