Compare commits

..

43 Commits

Author SHA1 Message Date
0a2621b637 stream : add "max_tokens" cli arg
Controls the max tokens per segment for the stream example
2022-11-20 21:22:02 +02:00
1b7a7df793 stream : add "audio_ctx" parameter
Used to overwrite the audio context size of the Encoder.
For example, setting "audio_ctx = 512" will make it run about 3 times
faster, processing about 10s of audio, instead of 30s.

The transcription quality drops, but this can be used for real-time
streaming purposes where performance is important.
2022-11-20 21:16:58 +02:00
4af1689ee5 stream : add "max_tokens" parameter
Used to limit the number of tokens in a segment.
Useful to battle with word repetition when using partial encoder context
2022-11-20 21:16:58 +02:00
b10d75199e stream : add "single_segment" option
Force the entire audio chunk to be transcribed into a single segment
2022-11-20 21:16:58 +02:00
ea3344eb8f stream : partial encoder experiments 2022-11-20 21:16:33 +02:00
83c742f1a7 whisper : add option to speed up the audio tempo by x2
Using a Phase Vocoder for speeding up the audio tempo by scaling down
the frequencies in the frequency domain.

This reduces the computation in the Encoder by a factor of 2.
The transcription accuracy is degraded, but for slow to normal speech -
it seems to be still very good.

I think this can find application for real-time transcription - i.e. the
"stream" example.
2022-11-13 16:25:43 +02:00
41b48ab7f1 make : add libwhisper.so target (#144) 2022-11-13 09:09:48 +02:00
a728be9cdb Add WHISPER_NO_AVX and WHISPER_NO_AVX2 to CMakeLists (#136)
* Check for AVX and AVX2 on Darwin

* Add AVX options to CMakeLists
2022-11-11 18:10:01 +02:00
46a68fb9b5 minor : remove one more redundant line 2022-11-11 18:02:58 +02:00
ccd56a9c5b minor : fix double float32 conversion in python script 2022-11-11 17:58:51 +02:00
3500ce8727 ref #40 : start working on the documentation 2022-11-09 21:41:40 +02:00
7519eabf65 Adds support for stdin wav input 2022-11-09 20:37:23 +02:00
b21213c23e js : update whipser.js to latest 2022-11-09 19:33:10 +02:00
9e700e1821 Check for AVX and AVX2 on Darwin 2022-11-09 18:49:55 +02:00
0bfe728b84 Fix the Windows pthread_create shim
The current implementation doesn't actually set the out parameter,
and it returns 0 on failure instead of on success.
2022-11-08 15:02:32 +02:00
4e5674a5d5 sync : submodule whisper.spm 2022-11-07 21:48:13 +02:00
4c66b6a828 cmake : add submodule whisper.spm 2022-11-07 20:50:24 +02:00
c30bffc8a5 ref #22 : add "duration" option
Can be used to partially process a recording
2022-11-07 20:14:52 +02:00
8fdfb0ba92 Update README.md 2022-11-06 21:04:21 +02:00
c71363f14c examples : add simple script for generating Karaoke video 2022-11-06 09:22:50 +02:00
a09e9123ca Update README.md 2022-11-05 08:44:41 +02:00
d42cf6d0df Update README.md 2022-11-04 22:26:08 +02:00
ef47d77492 main : fix generated bash script 2022-11-04 18:30:38 +02:00
75171c2b79 ggml : multi-thread the ggml_add operator 2022-11-03 20:53:44 +02:00
a2eeb941f6 cmake : fix passing GGML_PERF compile option 2022-11-03 20:19:06 +02:00
0e689f83d8 Update README.md 2022-11-02 22:03:27 +02:00
d5afebd37c whisper : token-level timestamp refactoring (#49, #120)
This turned out pretty good overall. The algorithm has been moved from
main.cpp to whisper.cpp and can be reused for all subtitles types. This
means that now you can specify the maximum length of the generated
lines. Simply provide the "-ml" argument specifying the max length in
number of characters
2022-11-02 21:45:54 +02:00
4b1c32e8ea Update README.md 2022-11-02 18:33:29 +02:00
b5dde365e9 extra : compute SHA of all models files 2022-11-02 18:31:55 +02:00
02dfd5b8c3 whisper : fix extra memory usage after recent processor changes
Had increased the memory buffer to the size of the model and forgot to
bring it down.
2022-11-02 18:31:18 +02:00
c63ce24834 Allow building with Accelerate for x86_64 Macs (#123)
* Cross compile windows

* set env properly

* rm log

* fix review

* Add back space

* Don't force architecture

* Allow building x86_64 with accelerate
2022-11-02 18:00:19 +02:00
137321915f ggml : fix the check for NEON support (#7)
Was using the wrong preprocessor macro
2022-11-02 17:52:24 +02:00
24cd12f647 Cross compilation (#121)
* Cross compile windows

* set env properly

* rm log

* fix review

* Add back space
2022-11-02 08:46:49 +02:00
e46bc56e71 Update README.md 2022-11-01 22:47:58 +02:00
6fb98370ba main : add some comments for the word-level timestamp algorithm 2022-11-01 22:35:21 +02:00
0729da9a3b main : fix some edge cases for word-level timestamps 2022-11-01 22:09:25 +02:00
5dc74e3aff Update README.md 2022-10-31 22:06:05 +02:00
ac8ef34039 Update README.md 2022-10-31 20:19:41 +02:00
b26345cc7b Added for Windows implemenated script download-ggml-model.cmd 2022-10-31 19:38:20 +02:00
8dac3c6e10 Fixed sched_yield 2022-10-30 21:38:18 +02:00
6417e59aad Implemenated sched_yield function for Windows 2022-10-30 21:38:18 +02:00
dc12994603 Update README.md 2022-10-30 17:11:37 +02:00
b0f2aa0ea6 Update README.md 2022-10-30 17:10:46 +02:00
19 changed files with 1286 additions and 770 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "bindings/ios"]
path = bindings/ios
url = https://github.com/ggerganov/whisper.spm

View File

@ -9,6 +9,11 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(WHISPER_STANDALONE ON)
include(cmake/GitVars.cmake)
include(cmake/BuildTypes.cmake)
# configure project version
if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
endif()
else()
set(WHISPER_STANDALONE OFF)
endif()
@ -43,11 +48,13 @@ option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
else()
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
endif()
option(WHISPER_PERF "whisper: enable perf timings" OFF)
option(WHISPER_PERF "whisper: enable perf timings" OFF)
# sanitizers
@ -138,19 +145,29 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
else()
message(STATUS "x86 detected")
if (MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
else()
if (EMSCRIPTEN)
# we require support for WASM SIMD 128-bit
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
if(NOT WHISPER_NO_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
if(NOT WHISPER_NO_AVX2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c")
endif()
endif()
endif()
if (WHISPER_PERF)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
endif()
#
# whisper - this is the main library of the project
#

View File

@ -1,6 +1,14 @@
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif
ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif
# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
@ -8,8 +16,8 @@ ifeq ($(UNAME_S),Darwin)
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
ifeq ($(SYSCTL_M),1)
UNAME_P := arm
UNAME_M := arm64
# UNAME_P := arm
# UNAME_M := arm64
warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
endif
endif
@ -42,12 +50,24 @@ endif
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),x86_64)
CFLAGS += -mavx -mavx2 -mfma -mf16c
CFLAGS += -mfma -mf16c
ifeq ($(UNAME_S),Darwin)
AVX1_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
CFLAGS += -mavx
endif
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
ifneq (,$(findstring AVX2,$(AVX2_M)))
CFLAGS += -mavx2
endif
else
CFLAGS += -mavx -mavx2
endif
endif
ifeq ($(UNAME_M),amd64)
CFLAGS += -mavx -mavx2 -mfma -mf16c
endif
ifneq ($(filter arm%,$(UNAME_M)),)
ifndef WHISPER_NO_ACCELERATE
# Mac M1 - include Accelerate framework
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
@ -69,25 +89,26 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
#
# Build library + main
#
default: main
main: examples/main/main.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp whisper.o ggml.o -o main $(LDFLAGS)
./main -h
#
# Build library
#
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
whisper.o: whisper.cpp whisper.h
$(CXX) $(CXXFLAGS) -c whisper.cpp
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
libwhisper.a: ggml.o whisper.o
ar rcs libwhisper.a ggml.o whisper.o
$(AR) rcs libwhisper.a ggml.o whisper.o
libwhisper.so: ggml.o whisper.o
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
clean:
rm -f *.o main stream bench libwhisper.a
rm -f *.o main stream bench libwhisper.a libwhisper.so
#
# Examples
@ -95,6 +116,10 @@ clean:
CC_SDL=`sdl2-config --cflags --libs`
main: examples/main/main.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o whisper.o -o main $(LDFLAGS)
./main -h
stream: examples/stream/stream.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

190
README.md
View File

@ -26,14 +26,41 @@ Supported platforms:
The entire implementation of the model is contained in 2 source files:
- [ggml.h](ggml.h) / [ggml.c](ggml.c)
- [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)
- Tensor operations: [ggml.h](ggml.h) / [ggml.c](ggml.c)
- Transformer inference: [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)
Having such a lightweight implementation of the model makes it easy to integrate it into different platforms and applications.
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device:
https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
## Implementation details
- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
- Sample usage is demonstrated in [main.cpp](examples/main)
- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
- Various other examples are available in the [examples](examples) folder
The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
## Limitations
- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick up the token with highest probability.
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
to run the python code with the following parameters:
```
whisper --best_of None --beam_size None ...
```
In the future, `whisper.cpp` will support more sampling strategies.
## Quick start
First, download one of the Whisper models converted in [ggml format](models). For example:
@ -59,8 +86,8 @@ For a quick demo, simply run `make base.en`:
```java
$ make base.en
cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c
c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp
cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o ggml.o -o main -framework Accelerate
./main -h
@ -70,13 +97,18 @@ options:
-h, --help show this help message and exit
-s SEED, --seed SEED RNG seed (default: -1)
-t N, --threads N number of threads to use during computation (default: 4)
-p N, --processors N number of processors to use during computation (default: 1)
-ot N, --offset-t N time offset in milliseconds (default: 0)
-on N, --offset-n N segment index offset (default: 0)
-mc N, --max-context N maximum number of text context tokens to store (default: max)
-ml N, --max-len N maximum segment length in characters (default: 0)
-wt N, --word-thold N word timestamp probability threshold (default: 0.010000)
-v, --verbose verbose output
--translate translate from source language to english
-otxt, --output-txt output result in a text file
-ovtt, --output-vtt output result in a vtt file
-osrt, --output-srt output result in a srt file
-owts, --output-words output script for generating karaoke video
-ps, --print_special print special tokens
-pc, --print_colors print colors
-nt, --no_timestamps do not print timestamps
@ -86,7 +118,7 @@ options:
bash ./models/download-ggml-model.sh base.en
Downloading ggml model base.en ...
ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s
ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
You can now use it like this:
@ -114,23 +146,26 @@ whisper_model_load: n_text_layer = 6
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 2
whisper_model_load: mem_required = 505.00 MB
whisper_model_load: mem_required = 670.00 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: ggml ctx size = 163.43 MB
whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size = 22.83 MB
whisper_model_load: model size = 140.54 MB
main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, lang = en, task = transcribe, timestamps = 1 ...
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
[00:00.000 --> 00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
whisper_print_timings: load time = 87.21 ms
whisper_print_timings: mel time = 24.26 ms
whisper_print_timings: sample time = 3.87 ms
whisper_print_timings: encode time = 323.67 ms / 53.94 ms per layer
whisper_print_timings: decode time = 83.25 ms / 13.87 ms per layer
whisper_print_timings: total time = 522.66 ms
[00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
whisper_print_timings: load time = 105.91 ms
whisper_print_timings: mel time = 24.62 ms
whisper_print_timings: sample time = 3.63 ms
whisper_print_timings: encode time = 324.71 ms / 54.12 ms per layer
whisper_print_timings: decode time = 83.58 ms / 13.93 ms per layer
whisper_print_timings: total time = 542.81 ms
```
The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
@ -172,8 +207,8 @@ make large
| Model | Disk | Mem | SHA |
| --- | --- | --- | --- |
| tiny | 75 MB | ~280 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| base | 142 MB | ~430 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
@ -185,7 +220,7 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
<details>
<summary>Expand to see the result</summary>
```java
$ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
@ -273,32 +308,108 @@ to highlight words with high or low confidence:
<img width="965" alt="image" src="https://user-images.githubusercontent.com/1991296/197356445-311c8643-9397-4e5e-b46e-0b4b4daa2530.png">
## Implementation details
## Controlling the length of the generated text segments (experimental)
- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
- The high-level C-style API is implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
- Sample usage is demonstrated in [main.cpp](examples/main)
- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
- Various other examples are available in the [examples](examples) folder
For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
## Limitations
whisper_model_load: loading model from './models/ggml-base.en.bin'
...
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick up the token with highest probability.
This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
to run the python code with the following parameters:
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
```
whisper --best_of None --beam_size None ...
```
[00:00:00.000 --> 00:00:00.850] And so my
[00:00:00.850 --> 00:00:01.590] fellow
[00:00:01.590 --> 00:00:04.140] Americans, ask
[00:00:04.140 --> 00:00:05.660] not what your
[00:00:05.660 --> 00:00:06.840] country can do
[00:00:06.840 --> 00:00:08.430] for you, ask
[00:00:08.430 --> 00:00:09.440] what you can do
[00:00:09.440 --> 00:00:10.020] for your
[00:00:10.020 --> 00:00:11.000] country.
```
In the future, `whisper.cpp` will support more sampling strategies.
## Word-level timestamp
The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:
```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
whisper_model_load: loading model from './models/ggml-base.en.bin'
...
system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
[00:00:00.000 --> 00:00:00.320]
[00:00:00.320 --> 00:00:00.370] And
[00:00:00.370 --> 00:00:00.690] so
[00:00:00.690 --> 00:00:00.850] my
[00:00:00.850 --> 00:00:01.590] fellow
[00:00:01.590 --> 00:00:02.850] Americans
[00:00:02.850 --> 00:00:03.300] ,
[00:00:03.300 --> 00:00:04.140] ask
[00:00:04.140 --> 00:00:04.990] not
[00:00:04.990 --> 00:00:05.410] what
[00:00:05.410 --> 00:00:05.660] your
[00:00:05.660 --> 00:00:06.260] country
[00:00:06.260 --> 00:00:06.600] can
[00:00:06.600 --> 00:00:06.840] do
[00:00:06.840 --> 00:00:07.010] for
[00:00:07.010 --> 00:00:08.170] you
[00:00:08.170 --> 00:00:08.190] ,
[00:00:08.190 --> 00:00:08.430] ask
[00:00:08.430 --> 00:00:08.910] what
[00:00:08.910 --> 00:00:09.040] you
[00:00:09.040 --> 00:00:09.320] can
[00:00:09.320 --> 00:00:09.440] do
[00:00:09.440 --> 00:00:09.760] for
[00:00:09.760 --> 00:00:10.020] your
[00:00:10.020 --> 00:00:10.510] country
[00:00:10.510 --> 00:00:11.000] .
```
## Karaoke-style movie generation (experimental)
The [main](examples/main) example provides support for output of karaoke-style movies, where the
currently pronounced word is highlighted. Use the `-owts` argument and run the generated bash script.
This requires `ffmpeg` to be installed.
Here are a few *"typical"* examples:
```java
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
source ./samples/jfk.wav.wts
ffplay ./samples/jfk.wav.mp4
```
https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b1c6-323ac4db5b2c.mp4
---
```java
./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
source ./samples/mm0.wav.wts
ffplay ./samples/mm0.wav.mp4
```
https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-95f9-4227de3570aa.mp4
---
```java
./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
source ./samples/gb0.wav.wts
ffplay ./samples/gb0.wav.mp4
```
https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a0cd-f28a317987ba.mp4
---
## Benchmarks
@ -326,9 +437,12 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
## Bindings
- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm)
- [ ] Python:
- [ ] Java:
## Examples
There are various examples of using the library for different projects in the [examples](examples) folder. Check them out!
## [Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126)

1
bindings/ios Submodule

Submodule bindings/ios added at 4bda8e9d80

File diff suppressed because one or more lines are too long

49
examples/generate-karaoke.sh Executable file
View File

@ -0,0 +1,49 @@
#!/bin/bash
#
# Record audio from the default input device, transcribe it with the `main`
# example and play back the resulting karaoke-style video.
#
# Steps:
#   1. record raw audio with sox into rec.wav (stop the recording to continue)
#   2. resample it to 16 kHz mono 16-bit PCM (rec16.wav) with ffmpeg
#   3. run the transcription with -owts, which writes the script rec16.wav.wts
#   4. source that script to render rec16.wav.mp4
#   5. play the video with ffplay
#
# Exit codes: 1 - sox missing, 2 - ffmpeg missing,
#             3 - executable missing, 4 - model file missing

executable="./main"
model="base.en"
model_path="models/ggml-$model.bin"

# require sox and ffmpeg to be installed
if ! command -v sox &> /dev/null
then
    echo "sox could not be found"
    exit 1
fi

if ! command -v ffmpeg &> /dev/null
then
    echo "ffmpeg could not be found"
    exit 2
fi

if [ ! -f "$executable" ]; then
    echo "'$executable' does not exist. Please build it first."
    exit 3
fi

if [ ! -f "$model_path" ]; then
    echo "'$model_path' does not exist. Please download it first."
    exit 4
fi

# record some raw audio
sox -d rec.wav

# resample to 16kHz
ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>&1

# run Whisper
# use the executable and model that were validated above, so that changing
# the variables at the top of the script actually takes effect
echo "Processing ..."
"$executable" -m "$model_path" rec16.wav -owts > /dev/null 2>&1

# generate Karaoke video
echo "Generating video ..."
source rec16.wav.wts > /dev/null 2>&1

# play the video
echo "Playing ./rec16.wav.mp4 ..."
ffplay -loglevel 0 -autoexit ./rec16.wav.mp4

echo "Done"
exit 0

View File

@ -6,21 +6,29 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
```
./main -h
usage: ./main [options] file0.wav file1.wav ...
usage: ./bin/main [options] file0.wav file1.wav ...
options:
-h, --help show this help message and exit
-s SEED, --seed SEED RNG seed (default: -1)
-t N, --threads N number of threads to use during computation (default: 4)
-o N, --offset N offset in milliseconds (default: 0)
-p N, --processors N number of processors to use during computation (default: 1)
-ot N, --offset-t N time offset in milliseconds (default: 0)
-on N, --offset-n N segment index offset (default: 0)
-mc N, --max-context N maximum number of text context tokens to store (default: max)
-ml N, --max-len N maximum segment length in characters (default: 0)
-wt N, --word-thold N word timestamp probability threshold (default: 0.010000)
-v, --verbose verbose output
--translate translate from source language to english
-otxt, --output-txt output result in a text file
-ovtt, --output-vtt output result in a vtt file
-osrt, --output-srt output result in a srt file
-owts, --output-words output script for generating karaoke video
-ps, --print_special print special tokens
-pc, --print_colors print colors
-nt, --no_timestamps do not print timestamps
-l LANG, --language LANG spoken language (default: en)
-m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
-f FNAME, --file FNAME input WAV file path
-h, --help show this help message and exit
```

View File

@ -36,6 +36,7 @@ std::string to_timestamp(int64_t t, bool comma = false) {
return std::string(buf);
}
// helper function to replace substrings
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
for (size_t pos = 0; ; pos += replace.length()) {
pos = s.find(search, pos);
@ -45,29 +46,6 @@ void replace_all(std::string & s, const std::string & search, const std::string
}
}
// a cost-function that is high for text that takes longer to pronounce
//
// Walks the string byte by byte and sums a per-character weight; the
// accumulated sum is returned. An empty string yields 0.
float voice_length(const std::string & text) {
float res = 0.0f;
// per-character weights:
//   ASCII digits  - add 3
//   ASCII letters - add 1
//   anything else - add 0.01 (small, but not zero)
for (size_t i = 0; i < text.size(); ++i) {
if (text[i] >= '0' && text[i] <= '9') {
res += 3.0f;
} else if (text[i] >= 'a' && text[i] <= 'z') {
res += 1.0f;
} else if (text[i] >= 'A' && text[i] <= 'Z') {
res += 1.0f;
} else {
res += 0.01f;
}
// TODO: support unicode
// (only ASCII ranges are tested above, so every byte of a multi-byte
// character falls into the 0.01 branch)
}
return res;
}
// command-line parameters
struct whisper_params {
int32_t seed = -1; // RNG seed, not used currently
@ -75,10 +53,13 @@ struct whisper_params {
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t max_context = -1;
int32_t max_len = 0;
float word_thold = 0.1f;
float word_thold = 0.01f;
bool speed_up = false;
bool verbose = false;
bool translate = false;
bool output_txt = false;
@ -116,10 +97,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
params.offset_t_ms = std::stoi(argv[++i]);
} else if (arg == "-on" || arg == "--offset-n") {
params.offset_n = std::stoi(argv[++i]);
} else if (arg == "-d" || arg == "--duration") {
params.duration_ms = std::stoi(argv[++i]);
} else if (arg == "-mc" || arg == "--max-context") {
params.max_context = std::stoi(argv[++i]);
} else if (arg == "-ml" || arg == "--max-len") {
params.max_len = std::stoi(argv[++i]);
} else if (arg == "-wt" || arg == "--word-thold") {
params.word_thold = std::stof(argv[++i]);
} else if (arg == "-su" || arg == "--speed-up") {
params.speed_up = true;
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--translate") {
@ -173,14 +160,17 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
fprintf(stderr, " -d N, --duration N duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -v, --verbose verbose output\n");
fprintf(stderr, " --translate translate from source language to english\n");
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n");
fprintf(stderr, " -osrt, --output-srt output result in a srt file\n");
fprintf(stderr, " -owts, --output-words output word-level timestamps to a text file\n");
fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n");
fprintf(stderr, " -ps, --print_special print special tokens\n");
fprintf(stderr, " -pc, --print_colors print colors\n");
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
@ -190,65 +180,67 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, "\n");
}
void whisper_print_segment_callback(struct whisper_context * ctx, void * user_data) {
void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
const whisper_params & params = *(whisper_params *) user_data;
const int n_segments = whisper_full_n_segments(ctx);
// print the last segment
const int i = n_segments - 1;
if (i == 0) {
// print the last n_new segments
const int s0 = n_segments - n_new;
if (s0 == 0) {
printf("\n");
}
if (params.no_timestamps) {
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
for (int i = s0; i < n_segments; i++) {
if (params.no_timestamps) {
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s", text);
}
fflush(stdout);
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s", text);
}
fflush(stdout);
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
if (params.print_colors) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
if (params.print_colors) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
}
printf("\n");
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
}
printf("\n");
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
}
}
}
@ -318,566 +310,41 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
return true;
}
// A half-open span [x0, x1) on an integer axis, tagged with a type.
// fit_text_to_audio() ignores spans whose type is 0 and emits spans of type 1.
struct Interval {
int x0;   // start coordinate
int x1;   // end coordinate
int type; // 0 - skipped by fit_text_to_audio(), non-zero - considered
};
// A list of intervals together with a scalar score F.
// F defaults to -1; fit_text_to_audio() overwrites it with the cost of the solution it stores in the array.
struct IntervalArray : public std::vector<Interval> {
int F = -1;
};
// Fits up to N spans onto the non-zero-type intervals of `input` using dynamic programming.
//
// Only intervals with type != 0 are considered. Every candidate span starts at the left edge of
// one kept interval and ends at the right edge of the same or a later kept interval. For each
// n in [1, N] the DP selects n spans minimizing a cost that adds the span width (r - l) and
// subtracts (alpha + 1) times the total length of the kept intervals the span covers.
//
// Parameters:
//   input - intervals to fit against; must be non-empty (input.back() is read unconditionally)
//   N     - maximum number of spans; a solution is produced for every n in [1, N]
//   alpha - weight of the covered-length reward (also scales the initial cost alpha*G_max)
//
// Returns a vector with N + 1 entries: res[n] holds the type-1 runs reconstructed for the
// n-span solution and res[n].F its cost (forced to 0 when n == input.size()/2); res[0] stays empty.
//
// NOTE(review): the backtracking loop indexes xs[] with the stored xprev without checking for -1,
// so it presumably relies on a feasible solution existing for every n <= N - confirm with the caller.
std::vector<IntervalArray> fit_text_to_audio(const IntervalArray & input, int N, float alpha = 2.0f) {
// right-most coordinate of the input; determines the size of the reconstruction grid below
const int x_max = input.back().x1;
// ls / rs : left / right edges of the kept (type != 0) intervals
// xs      : both edges of every kept interval, i.e. xs[2*k] == ls[k] and xs[2*k + 1] == rs[k]
// gs      : cumulative kept length before (gs[2*k]) and after (gs[2*k + 1]) the k-th kept interval
std::vector<int> ls;
std::vector<int> rs;
std::vector<int> xs;
std::vector<int> gs;
// total length of all kept intervals
int G_max = 0;
for (const auto & ii : input) {
if (ii.type == 0) {
continue;
}
ls.push_back(ii.x0);
rs.push_back(ii.x1);
xs.push_back(ii.x0);
xs.push_back(ii.x1);
gs.push_back(G_max);
G_max += ii.x1 - ii.x0;
gs.push_back(G_max);
}
// sentinel cost used for "no solution found yet"
const int inf = 100*G_max;
// one DP state: best cost, index into xs of the previous state, and width of the span chosen last
struct Cell {
int fval = -1;
int xprev = -1;
int w = -1;
};
// DP table F[ix][n]: state for position xs[ix] after placing n spans.
// Initial conditions: zero spans cost alpha*G_max (truncated to int), everything else is inf.
std::vector<std::vector<Cell>> F(xs.size());
for (auto & Fx : F) {
Fx.resize(N + 1);
for (auto & f : Fx) f.fval = inf;
Fx[0].fval = alpha*G_max;
}
// DP core: for every span count n and position x, try every span [l, r] with r <= x
for (int n = 1; n <= N; ++n) {
for (int ix = 0; ix < (int) xs.size(); ++ix) {
const int x = xs[ix];
int best_fval = inf;
int best_xprev = -1;
int best_w = -1;
for (int il = 0; il < (int) ls.size(); ++il) {
const int l = ls[il];
// NOTE(review): skips left edges smaller than the span count n - intent not evident from here
if (l < n) continue;
// left edges are visited in input order; stop once they reach x
if (l >= x) break;
for (int ir = il; ir < (int) rs.size(); ++ir) {
const int r = rs[ir];
if (r < l + 1) continue;
if (r > x) break;
// previous state sits at the left edge itself (xs[2*il] == l);
// gs[2*ir + 1] - gs[2*il] is the kept length covered by intervals il..ir
const int cur_fval = F[2*il][n - 1].fval + (r - l) - (alpha + 1)*(gs[2*ir + 1] - gs[2*il]);
if (cur_fval < best_fval) {
best_fval = cur_fval;
best_xprev = 2*il;
best_w = r - l;
}
}
}
F[ix][n].fval = best_fval;
F[ix][n].xprev = best_xprev;
F[ix][n].w = best_w;
}
}
// generate output: one reconstructed solution per span count i in [1, N]
std::vector<IntervalArray> res(N + 1);
{
for (int i = 1; i <= N; ++i) {
IntervalArray resCur;
int n = i;
std::vector<int> grid(x_max + 1, 0); // initially, everything is background
// pick the end position with the lowest cost for n spans
int best_ix = 0;
int best_fval = F[0][n].fval;
for (int ix = 1; ix < (int) xs.size(); ++ix) {
if (F[ix][n].fval < best_fval) {
best_ix = ix;
best_fval = F[ix][n].fval;
}
}
// the cost is reported as 0 when the span count equals half the number of input intervals
resCur.F = (i == input.size()/2) ? 0 : best_fval;
// walk the xprev chain backwards, painting each chosen span onto the grid
while (true) {
const int ix = F[best_ix][n].xprev;
const int w = F[best_ix][n].w;
for (int x = xs[ix]; x < xs[ix] + w; ++x) {
grid[x] = 1; // i.e. green
}
best_ix = F[best_ix][n].xprev;
if (--n == 0) break;
}
// convert the painted grid back into intervals: emit every run of 1s as a type-1 interval
int x0 = 0;
int type = grid[0];
for (int x1 = 1; x1 <= x_max; ++x1) {
if (grid[x1] != grid[x1 - 1] || x1 == x_max) {
if (type == 1) {
resCur.push_back({x0, x1, 1});
}
x0 = x1;
type = grid[x1];
}
}
res[i] = std::move(resCur);
}
}
return res;
}
// word-level timestamps (experimental)
// TODO: probably still has bugs, needs refactoring, etc..
// TODO: auto threshold
// TODO: extra pass to detect unused speech and assign to tokens
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, const std::vector<float> & pcmf32) {
std::vector<float> pcm_avg(pcmf32.size(), 0);
// average the fabs of the signal
{
const int hw = 32;
for (int i = 0; i < pcmf32.size(); i++) {
float sum = 0;
for (int j = -hw; j <= hw; j++) {
if (i + j >= 0 && i + j < pcmf32.size()) {
sum += fabs(pcmf32[i + j]);
}
}
pcm_avg[i] = sum/(2*hw + 1);
}
}
struct token_info {
int64_t t0 = -1;
int64_t t1 = -1;
int64_t tt0 = -1;
int64_t tt1 = -1;
whisper_token id;
whisper_token tid;
float p = 0.0f;
float pt = 0.0f;
float ptsum = 0.0f;
std::string text;
float vlen = 0.0f; // voice length of this token
void calc_vlen(struct whisper_context * ctx) {
if (id >= whisper_token_eot(ctx)) {
vlen = 0.1f;
return;
}
vlen = voice_length(text);
}
bool is_voice() const {
return vlen > 0.5f;
}
};
int64_t t_beg = 0;
int64_t t_last = 0;
whisper_token tid_last = 0;
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
fout << "!/bin/bash" << "\n";
// TODO: become parameter
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
fout << "#!/bin/bash" << "\n";
fout << "\n";
fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << float(pcm_avg.size() + 1000)/WHISPER_SAMPLE_RATE << ":rate=25:color=black -vf \"";
bool is_first = true;
fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";
for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
const char *text = whisper_full_get_segment_text(ctx, i);
const int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
const int s1 = std::min((int) pcm_avg.size(), (int) (t1*WHISPER_SAMPLE_RATE/100));
const int n = whisper_full_n_tokens(ctx, i);
std::vector<token_info> tokens(n);
if (n <= 1) {
continue;
}
std::vector<whisper_token_data> tokens(n);
for (int j = 0; j < n; ++j) {
struct whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
if (j == 0) {
if (token.id == whisper_token_beg(ctx)) {
tokens[j ].t0 = t0;
tokens[j ].t1 = t0;
tokens[j + 1].t0 = t0;
t_beg = t0;
t_last = t0;
tid_last = whisper_token_beg(ctx);
} else {
tokens[j ].t0 = t_last;
}
}
const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(ctx));
tokens[j].id = token.id;
tokens[j].tid = token.tid;
tokens[j].p = token.p;
tokens[j].pt = token.pt;
tokens[j].ptsum = token.ptsum;
tokens[j].text = whisper_token_to_str(ctx, token.id);
tokens[j].calc_vlen(ctx);
if (token.pt > params.word_thold && token.ptsum > 0.01 && token.tid > tid_last) {
if (j > 0) {
tokens[j - 1].t1 = tt;
}
tokens[j].t0 = tt;
tid_last = token.tid;
}
tokens[j] = whisper_full_get_token_data(ctx, i, j);
}
tokens[n - 2].t1 = t1;
tokens[n - 1].t0 = t1;
tokens[n - 1].t1 = t1;
t_last = t1;
{
{
int p0 = 0;
int p1 = 0;
while (true) {
while (p1 < n && tokens[p1].t1 < 0) {
p1++;
}
if (p1 >= n) {
p1--;
}
if (p1 > p0) {
IntervalArray arr;
const int s0 = std::max(0, (int) (tokens[p0].t0*WHISPER_SAMPLE_RATE/100));
const int s1 = std::min((int) pcm_avg.size() - 1, (int) (tokens[p1].t1*WHISPER_SAMPLE_RATE/100));
const int ns = s1 - s0;
float sum = 0.0f;
for (int k = s0; k < s1; k++) {
sum += pcm_avg[k];
}
const float thold = sum/ns;
printf("segment %4d: s0 = %6d, s1 = %6d, ns = %6d, thold = %f\n", i, s0, s1, ns, thold);
{
int last_s = -1;
int last_type = -1;
for (int k = s0; k < s1; k++) {
const int type = pcm_avg[k] > thold ? 1 : 0;
if (type != last_type) {
if (last_type != -1) {
arr.push_back({ last_s, k, last_type });
}
last_s = k;
last_type = type;
}
}
}
//for (int k = 0; k < arr.size(); k++) {
// printf(" %4d: %6d, %6d, %d\n", k, arr[k].x0, arr[k].x1, arr[k].type);
//}
int n_voice = 0;
for (int j = p0; j <= p1; ++j) {
if (tokens[j].is_voice()) {
n_voice++;
}
}
if (n_voice > 0 && arr.size() > n_voice) {
printf("xxxxxxxx n = %d, n_voice = %d, arr.size() = %d\n", n, n_voice, (int) arr.size());
auto res = fit_text_to_audio(arr, n_voice, 2.0f);
printf("done fit_text_to_audio, F = %d\n", res[n_voice].F);
{
int tid = p0;
for (int k = 0; k < (int) res[n_voice].size(); ++k) {
while (!tokens[tid].is_voice() && tid <= p1) {
//if (tokens[tid].t0 < 0) {
// tokens[tid].t0 = (int64_t) (100*res[n_voice][k].x0/WHISPER_SAMPLE_RATE);
//}
//if (tokens[tid].t1 < 0) {
// tokens[tid].t1 = tokens[tid].t0;
//}
tid++;
}
if (tid > p1) {
break;
}
if (tokens[tid].t0 < 0) {
tokens[tid].t0 = (int64_t) (100*res[n_voice][k].x0/WHISPER_SAMPLE_RATE);
if (tid > 0) {
tokens[tid - 1].t1 = tokens[tid].t0;
}
}
if (tokens[tid].t1 < 0) {
tokens[tid].t1 = (int64_t) (100*res[n_voice][k].x1/WHISPER_SAMPLE_RATE);
}
tid++;
}
printf("xxxxxxxx n = %d, tid = %d\n", n, tid);
}
}
}
p1++;
p0 = p1;
if (p1 >= n) {
break;
}
}
}
{
int p0 = 0;
int p1 = 0;
while (true) {
while (p1 < n && tokens[p1].t1 < 0) {
p1++;
}
if (p1 >= n) {
p1--;
}
if (p1 > p0) {
double psum = 0.0;
for (int j = p0; j <= p1; j++) {
psum += tokens[j].vlen;
}
//printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);
const double dt = tokens[p1].t1 - tokens[p0].t0;
for (int j = p0 + 1; j <= p1; j++) {
const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;
//const double ct = tokens[j - 1].t0 + (dt*(j - p0))/(p1 - p0 + 1);
//const double ct = tokens[p0].t0 + (dt*(j - p0))/(p1 - p0 + 1);
tokens[j - 1].t1 = ct;
tokens[j ].t0 = ct;
}
}
p1++;
p0 = p1;
if (p1 >= n) {
break;
}
}
}
for (int j = 0; j < n - 1; j++) {
if (tokens[j + 1].t0 < 0) {
tokens[j + 1].t0 = tokens[j].t1;
}
tokens[j].tt0 = tokens[j].t0;
tokens[j].tt1 = tokens[j].t1;
if (j < n - 2) {
tokens[j].tt1 = std::max(tokens[j].tt1, tokens[j + 1].t0);
}
}
// VAD
{
const int hw = WHISPER_SAMPLE_RATE; // take one second of audio around the token
for (int j = 0; j < n; j++) {
const int64_t t0 = tokens[j].t0;
const int64_t t1 = tokens[j].t1;
int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
int s1 = std::min((int) pcm_avg.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100));
const int ss0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100) - hw);
const int ss1 = std::min((int) pcm_avg.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100) + hw);
const int n = ss1 - ss0;
float sum = 0.0f;
for (int k = ss0; k < ss1; k++) {
sum += pcm_avg[k];
}
const float avg = sum/n;
const float thold = 0.5*avg;
{
int k = s0;
if (pcm_avg[k] > thold && j > 0) {
while (k > 0 && pcm_avg[k] > thold) {
k--;
}
tokens[j].t0 = (int64_t) (100*k/WHISPER_SAMPLE_RATE);
if (tokens[j].t0 < tokens[j - 1].t1) {
tokens[j].t0 = tokens[j - 1].t1;
} else {
s0 = k;
}
} else {
while (pcm_avg[k] < thold && k < s1) {
k++;
}
s0 = k;
tokens[j].t0 = 100*k/WHISPER_SAMPLE_RATE;
}
}
{
int k = s1;
if (pcm_avg[k] > thold) {
while (k < (int) pcm_avg.size() - 1 && pcm_avg[k] > thold) {
k++;
}
tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
tokens[j].t1 = tokens[j + 1].t0;
} else {
s1 = k;
}
} else {
while (pcm_avg[k] < thold && k > s0) {
k--;
}
s1 = k;
tokens[j].t1 = 100*k/WHISPER_SAMPLE_RATE;
}
}
}
}
const int t_expand = 0;
for (int j = 0; j < n; j++) {
if (j > 0) {
tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand));
}
if (j < n - 1) {
tokens[j].t1 = tokens[j].t1 + t_expand;
}
}
}
{
const std::string fname_tokens = "tokens-" + std::to_string(i) + ".txt";
std::ofstream fout(fname_tokens);
int s0 = std::max(0, (int) (t0*WHISPER_SAMPLE_RATE/100));
int s1 = std::min((int) pcm_avg.size() - 1, (int) (t1*WHISPER_SAMPLE_RATE/100));
for (int j = s0; j < s1; j++) {
int k = -1;
for (int r = 0; r < n; r++) {
if (j >= (int) (tokens[r].t0*WHISPER_SAMPLE_RATE/100) && j < (int) (tokens[r].t1*WHISPER_SAMPLE_RATE/100)) {
k = r;
break;
}
}
fout << j << " " << pcm_avg[j] << " " << float(k%3 + 1)/30.0 << std::endl;
}
fout.close();
}
for (int j = 0; j < n; ++j) {
const auto & token = tokens[j];
const auto tt = token.pt > params.word_thold && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, token.text.c_str());
if (tokens[j].id >= whisper_token_eot(ctx)) {
continue;
}
//printf("[%s --> %s] %s\n", to_timestamp(token.t0).c_str(), to_timestamp(token.t1).c_str(), whisper_token_to_str(ctx, token.id));
//fout << "# " << to_timestamp(token.t0) << " --> " << to_timestamp(token.t1) << " " << whisper_token_to_str(ctx, token.id) << "\n";
}
static const int line_wrap = 60;
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
if (!is_first) {
if (i > 0) {
fout << ",";
}
// background text
fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
is_first = false;
bool is_first = true;
for (int j = 0; j < n; ++j) {
const auto & token = tokens[j];
@ -886,10 +353,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
continue;
}
//if (!tokens[j].is_voice()) {
// continue;
//}
std::string txt_bg;
std::string txt_fg; // highlight token
std::string txt_ul; // underline
@ -925,17 +388,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
}
ncnt += txt.size();
if (ncnt > line_wrap) {
if (k < j) {
txt_bg = "> ";
txt_fg = "> ";
txt_ul = "\\ \\ ";
ncnt = 0;
} else {
break;
}
}
}
::replace_all(txt_bg, "'", "");
@ -944,8 +396,11 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
::replace_all(txt_fg, "\"", "\\\"");
}
// background text
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << token.tt0/100.0 << "," << token.tt1/100.0 << ")'";
if (is_first) {
// background text
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
is_first = false;
}
// foreground text
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
@ -1003,9 +458,30 @@ int main(int argc, char ** argv) {
std::vector<float> pcmf32;
{
drwav wav;
if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
whisper_print_usage(argc, argv, {});
if (fname_inp == "-") {
std::vector<uint8_t> wav_data;
{
uint8_t buf[1024];
while (true)
{
const size_t n = fread(buf, 1, sizeof(buf), stdin);
if (n == 0)
{
break;
}
wav_data.insert(wav_data.end(), buf, buf + n);
}
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false)
{
fprintf(stderr, "error: failed to open WAV file from stdin\n");
return 4;
}
}
else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
return 4;
}
@ -1085,6 +561,13 @@ int main(int argc, char ** argv) {
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.speed_up = params.speed_up;
// this callback is called on each new segment
if (!wparams.print_realtime) {
@ -1123,7 +606,7 @@ int main(int argc, char ** argv) {
// output to WTS file
if (params.output_wts) {
const auto fname_wts = fname_inp + ".wts";
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, pcmf32);
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
}
}
}

View File

@ -40,7 +40,10 @@ struct whisper_params {
int32_t step_ms = 3000;
int32_t length_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
bool speed_up = false;
bool verbose = false;
bool translate = false;
bool no_context = true;
@ -68,6 +71,12 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
params.length_ms = std::stoi(argv[++i]);
} else if (arg == "-c" || arg == "--capture") {
params.capture_id = std::stoi(argv[++i]);
} else if (arg == "-mt" || arg == "--max_tokens") {
params.max_tokens = std::stoi(argv[++i]);
} else if (arg == "-ac" || arg == "--audio_ctx") {
params.audio_ctx = std::stoi(argv[++i]);
} else if (arg == "-su" || arg == "--speed-up") {
params.speed_up = true;
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--translate") {
@ -113,6 +122,9 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
fprintf(stderr, " -mt N, --max_tokens N maximum number of tokens per audio chunk (default: %d)\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio_ctx N audio context size (default: %d, 0 - all)\n", params.audio_ctx);
fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -v, --verbose verbose output\n");
fprintf(stderr, " --translate translate from source language to english\n");
fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n");
@ -217,6 +229,7 @@ int main(int argc, char ** argv) {
const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
const int n_samples_len = (params.length_ms/1000.0)*WHISPER_SAMPLE_RATE;
const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
const int n_samples_keep = 0.2*WHISPER_SAMPLE_RATE;
std::vector<float> pcmf32(n_samples_30s, 0.0f);
std::vector<float> pcmf32_old;
@ -299,7 +312,7 @@ int main(int argc, char ** argv) {
//const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
// take up to params.length_ms audio from previous iteration
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_len - n_samples_new));
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
@ -323,9 +336,14 @@ int main(int argc, char ** argv) {
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = params.no_context;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 6;
@ -373,7 +391,8 @@ int main(int argc, char ** argv) {
if ((n_iter % n_new_line) == 0) {
printf("\n");
pcmf32_old.clear();
// keep part of the audio for next iteration to try to mitigate word boundary issues
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
}
}
}

View File

@ -78,6 +78,14 @@ There are a lot of ways to improve this idea and I don't have much experience wi
*"optimize by sorting the data first"*
The plugin would then make an appropriate query using the selected text and code context to Copilot or GPT-3 and return the result.
Here is a proof-of-concept:
https://user-images.githubusercontent.com/1991296/199078847-0278fcde-5667-4748-ba0d-7d55381d6047.mp4
https://user-images.githubusercontent.com/1991296/200067939-f98d2ac2-7519-438a-85f9-79db0841ba4f.mp4
For explanation how this works see: https://twitter.com/ggerganov/status/1587168771789258756
## Discussion

7
extra/sha-all.sh Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Print the SHA-1 checksum of every model file matching ./models/ggml-*.bin
# Options are passed before the file operand so that parsing does not depend
# on option permutation (which is disabled when POSIXLY_CORRECT is set).
for f in ./models/ggml-*.bin; do
    shasum -a 1 "$f"
done

28
ggml.c
View File

@ -14,7 +14,7 @@
#include <stdint.h>
#include <stdio.h>
#if defined _MSC_VER
#if defined _MSC_VER || defined(__MINGW32__)
#include <Windows.h>
typedef volatile LONG atomic_int;
@ -37,13 +37,24 @@ typedef HANDLE pthread_t;
typedef DWORD thread_ret_t;
static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
out = CreateThread(NULL, 0, func, arg, 0, NULL);
return out != NULL;
HANDLE handle = CreateThread(NULL, 0, func, arg, 0, NULL);
if (handle == NULL)
{
return EAGAIN;
}
*out = handle;
return 0;
}
// Minimal Windows replacement for pthread_join(): blocks with no timeout until the
// given thread handle is signaled. The second argument is ignored.
// Returns the WaitForSingleObject() result converted to int.
static int pthread_join(pthread_t thread, void* unused) {
    const DWORD wait_result = WaitForSingleObject(thread, INFINITE);
    return (int) wait_result;
}
static int sched_yield (void) {
Sleep (0);
return 0;
}
#else
#include <pthread.h>
#include <stdatomic.h>
@ -193,7 +204,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
// timing
//
#if defined(_MSC_VER)
#if defined(_MSC_VER) || defined(__MINGW32__)
static int64_t timer_freq;
void ggml_time_init(void) {
LARGE_INTEGER frequency;
@ -3145,7 +3156,10 @@ void ggml_compute_forward_add_f32(
GGML_ASSERT(nb00 == sizeof(float));
if (nb10 == sizeof(float)) {
for (int j = ith; j < n; j += nth) {
const int j0 = (n/nth)*ith;
const int j1 = ith == nth - 1 ? n : (n/nth)*(ith + 1);
for (int j = j0; j < j1; j++) {
ggml_vec_add_f32(nc,
(float *) ((char *) dst->data + j*nb1),
(float *) ((char *) src0->data + j*nb01),
@ -6852,7 +6866,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
} break;
case GGML_OP_ADD:
{
node->n_tasks = 1;
node->n_tasks = n_threads;
} break;
case GGML_OP_SUB:
case GGML_OP_MUL:
@ -8084,7 +8098,7 @@ int ggml_cpu_has_avx512(void) {
}
int ggml_cpu_has_neon(void) {
#if defined(__ARM_NEON__)
#if defined(__ARM_NEON)
return 1;
#else
return 0;

175
ggml.h
View File

@ -1,5 +1,174 @@
#pragma once
//
// GGML Tensor Library
//
// This documentation is still a work in progress.
// If you wish some specific topics to be covered, feel free to drop a comment:
//
// https://github.com/ggerganov/whisper.cpp/issues/40
//
// ## Overview
//
// This library implements:
//
// - a set of tensor operations
// - automatic differentiation
// - basic optimization algorithms
//
// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
// but is not limited to, the following:
//
// - linear regression
// - support vector machines
// - neural networks
//
// The library allows the user to define a certain function using the available tensor operations. This function
// definition is represented internally via a computation graph. Each tensor operation in the function definition
// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
// using one of the available optimization algorithms.
//
// For example, here we define the function: f(x) = a*x^2 + b
//
// {
// struct ggml_init_params params = {
// .mem_size = 16*1024*1024,
// .mem_buffer = NULL,
// };
//
// // memory allocation happens here
// struct ggml_context * ctx = ggml_init(params);
//
// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//
// ggml_set_param(ctx, x); // x is an input variable
//
// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
//
// ...
// }
//
// Notice that the function definition above does not involve any actual computation. The computation is performed only
// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
//
// {
// ...
//
// struct ggml_cgraph gf = ggml_build_forward(f);
//
// // set the input variable and parameter values
// ggml_set_f32(x, 2.0f);
// ggml_set_f32(a, 3.0f);
// ggml_set_f32(b, 4.0f);
//
// ggml_graph_compute(ctx, &gf);
//
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
//
// ...
// }
//
// The actual computation is performed in the ggml_graph_compute() function.
//
// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
// actually needed.
//
// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
// differentiation and optimization algorithms.
//
// The described approach allows the user to define the function graph once and then compute its forward or backward graphs
// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
// the user can avoid the memory allocation overhead at runtime.
//
// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
// citizens, but in theory the library can be extended to support FP8 and integer data types.
//
// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
// clear that the library needs to support more complex operations. The way to support these operations is not clear
// yet, but a few examples are demonstrated in the following operations:
//
// - ggml_permute()
// - ggml_conv_1d_1s()
// - ggml_conv_1d_2s()
//
// For each tensor operator, the library implements a forward and backward computation function. The forward function
// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
// calculus class, or watch the following video:
//
// What is Automatic Differentiation?
// https://www.youtube.com/watch?v=wG_nF1awSSY
//
//
// ## Tensor data (struct ggml_tensor)
//
// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
//
// {
// struct ggml_tensor * c = ggml_add(ctx, a, b);
//
// assert(c->src[0] == a);
// assert(c->src[1] == b);
// }
//
// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
// contiguous in memory.
//
// The data of the tensor is accessed via the "data" pointer. For example:
//
// {
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
//
// // a[1, 2] = 1.0f;
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
//
// // a[2, 0] = 2.0f;
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
//
// ...
// }
//
// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
//
// ## The matrix multiplication operator (ggml_mul_mat)
//
// TODO
//
//
// ## Multi-threading
//
// TODO
//
//
// ## Overview of ggml.c
//
// TODO
//
//
// ## SIMD optimizations
//
// TODO
//
//
// ## Debugging ggml
//
// TODO
//
//
#ifdef __cplusplus
extern "C" {
#endif
@ -21,7 +190,8 @@ typedef __fp16 ggml_fp16_t;
typedef uint16_t ggml_fp16_t;
#endif
float ggml_fp16_to_fp32(ggml_fp16_t x);
// convert FP16 <-> FP32
float ggml_fp16_to_fp32(ggml_fp16_t x);
ggml_fp16_t ggml_fp32_to_fp16(float x);
struct ggml_object;
@ -36,6 +206,7 @@ enum ggml_type {
GGML_TYPE_COUNT,
};
// available tensor operations:
enum ggml_op {
GGML_OP_NONE = 0,
@ -136,7 +307,7 @@ struct ggml_init_params {
void * mem_buffer; // if NULL, memory will be allocated internally
};
void ggml_time_init(void);
void ggml_time_init(void); // call this once at the beginning of the program
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
int64_t ggml_cycles(void);

View File

@ -22,6 +22,20 @@ A third option to obtain the model files is to download them from Hugging Face:
https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
## Available models
| Model | Disk | Mem | SHA |
| --- | --- | --- | --- |
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| tiny.en | 75 MB | ~390 MB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| base.en | 142 MB | ~500 MB | `137c40403d78fd54d454da0f9bd998f78703390c` |
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| small.en | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
## Model files for testing purposes
The model files prefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for testing purposes.

View File

@ -297,8 +297,6 @@ for name in list_vars.keys():
name == "encoder.conv2.bias" or \
name == "encoder.positional_embedding" or \
name == "decoder.positional_embedding":
ftype = 0
data = data.astype(np.float32)
print(" Converting to float32")
data = data.astype(np.float32)
ftype = 0

View File

@ -0,0 +1,63 @@
@echo off
rem Download a pre-converted ggml Whisper model into the directory that contains this script.
rem
rem Usage: download-ggml-model.cmd model
rem   model - one of the names listed in the "models" variable below

rem Resolve the directory of this script to a normalized path without a trailing backslash
pushd "%~dp0"
set models_path=%CD%
popd

rem Count the command-line arguments - exactly one (the model name) is expected
set argc=0
for %%x in (%*) do set /A argc+=1

set models=tiny.en tiny base.en base small.en small medium.en medium large

if %argc% neq 1 (
  echo.
  echo Usage: download-ggml-model.cmd model
  CALL :list_models
  goto :eof
)

set model=%1

rem Accept the requested model only if it is one of the known names
for %%b in (%models%) do (
  if "%%b"=="%model%" (
    CALL :download_model
    goto :eof
  )
)

echo Invalid model: %model%
CALL :list_models
goto :eof

:download_model
echo Downloading ggml model %model%...

rem /d also switches the current drive, which matters when the script is started from another drive
cd /d "%models_path%"

if exist "ggml-%model%.bin" (
  echo Model %model% already exists. Skipping download.
  goto :eof
)

PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://ggml.ggerganov.com/ggml-model-whisper-%model%.bin -OutFile ggml-%model%.bin"

if %ERRORLEVEL% neq 0 (
  echo Failed to download ggml model %model%
  echo Please try again later or download the original Whisper model files and convert them yourself.
  goto :eof
)

rem The file was written to the current directory, i.e. directly into %models_path%
echo Done! Model %model% saved in %models_path%\ggml-%model%.bin
echo You can now use it like this:
rem NOTE(review): the samples path below is kept as-is; confirm where the samples directory lives relative to this script
echo main.exe -m %models_path%\ggml-%model%.bin -f %models_path%\samples\jfk.wav

goto :eof

:list_models
echo.
echo Available models:
(for %%a in (%models%) do (
  echo %%a
))
echo.
exit /b

View File

@ -133,11 +133,19 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
static const size_t MB = 1024*1024;
static const std::map<e_model, size_t> MEM_REQ_MODEL = {
{ MODEL_TINY, 86ull*MB },
{ MODEL_BASE, 165ull*MB },
{ MODEL_SMALL, 540ull*MB },
{ MODEL_MEDIUM, 1650ull*MB },
{ MODEL_LARGE, 3260ull*MB },
{ MODEL_TINY, 74ull*MB },
{ MODEL_BASE, 142ull*MB },
{ MODEL_SMALL, 466ull*MB },
{ MODEL_MEDIUM, 1464ull*MB },
{ MODEL_LARGE, 2952ull*MB },
};
// Size in bytes (multiples of MB) of the buffer reserved per model type for wctx.buf_memory;
// whisper_model_load() resizes buf_memory to the entry matching the loaded model.
static const std::map<e_model, size_t> MEM_REQ_MEMORY = {
{ MODEL_TINY, 12ull*MB },
{ MODEL_BASE, 24ull*MB },
{ MODEL_SMALL, 70ull*MB },
{ MODEL_MEDIUM, 184ull*MB },
{ MODEL_LARGE, 306ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
@ -410,6 +418,15 @@ struct whisper_context {
std::vector<whisper_segment> result_all;
std::vector<whisper_token> prompt_past;
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg;
int64_t t_last;
whisper_token tid_last;
std::vector<float> energy; // PCM signal energy
// [EXPERIMENTAL] speed-up techniques
int32_t exp_n_audio_ctx; // 0 - use default
};
// load the model from a ggml file
@ -423,7 +440,7 @@ struct whisper_context {
//
// see the convert-pt-to-ggml.py script for details
//
bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
static bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
auto & model = wctx.model;
@ -498,7 +515,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
wctx.buf_model = new std::vector<uint8_t>();
wctx.buf_model->resize(MEM_REQ_MODEL.at(model.type));
wctx.buf_memory.resize(std::max(MEM_REQ_MODEL.at(model.type), MEM_REQ_MODEL.at(model.type))); // TODO: TMP !!!
wctx.buf_memory.resize(MEM_REQ_MEMORY.at(model.type));
wctx.buf_compute.resize(std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
wctx.buf_compute_layer.resize(std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
@ -599,7 +616,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
const int n_audio_state = hparams.n_audio_state;
const int n_audio_layer = hparams.n_audio_layer;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_state = hparams.n_text_state;
const int n_text_layer = hparams.n_text_layer;
@ -722,20 +739,6 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
}
}
// create the ggml memory context
{
struct ggml_init_params params = {
.mem_size = wctx.buf_memory.size(),
.mem_buffer = wctx.buf_memory.data(),
};
model.ctx_mem = ggml_init(params);
if (!model.ctx_mem) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// prepare memory for the weights
{
auto & ctx = model.ctx;
@ -748,7 +751,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
const int n_audio_state = hparams.n_audio_state;
const int n_audio_layer = hparams.n_audio_layer;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_ctx = hparams.n_text_ctx;
const int n_text_state = hparams.n_text_state;
const int n_text_layer = hparams.n_text_layer;
@ -932,6 +935,20 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
}
}
// create the ggml memory context
{
struct ggml_init_params params = {
.mem_size = wctx.buf_memory.size(),
.mem_buffer = wctx.buf_memory.data(),
};
model.ctx_mem = ggml_init(params);
if (!model.ctx_mem) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
}
// key + value memory
{
auto & ctx = model.ctx_mem;
@ -953,7 +970,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
// key/value memory for the cross-attention layer
{
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_mem = n_text_layer*n_audio_ctx;
const int n_elements = n_text_state*n_mem;
@ -1054,7 +1071,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
// - n_threads: number of threads to use
// - mel_offset: offset in the mel spectrogram (i.e. audio offset)
//
bool whisper_encode(
static bool whisper_encode(
whisper_context & wctx,
const int n_threads,
const int mel_offset) {
@ -1062,13 +1079,11 @@ bool whisper_encode(
const auto & mel_inp = wctx.mel;
const auto & hparams = model.hparams;
const int n_ctx = hparams.n_audio_ctx;
const int n_ctx = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
const int n_state = hparams.n_audio_state;
const int n_head = hparams.n_audio_head;
const int n_layer = hparams.n_audio_layer;
const int N = n_ctx;
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
@ -1118,7 +1133,30 @@ bool whisper_encode(
cur = ggml_gelu(ctx0, cur);
}
cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
// ===================================================================
// NOTE: experimenting with partial evaluation of the encoder (ignore)
//static int iter = -1;
//const int n_iter = 1500/n_ctx;
//iter = (iter + 1) % n_iter;
//if (iter == 0) {
// memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
// memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
//}
static int iter = 0;
const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
// ===================================================================
// original:
//cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
struct ggml_tensor * inpL = cur;
@ -1184,14 +1222,14 @@ bool whisper_encode(
ggml_permute(ctxL,
ggml_cpy(ctxL,
Qcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * K =
ggml_permute(ctxL,
ggml_cpy(ctxL,
Kcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * V =
@ -1199,9 +1237,9 @@ bool whisper_encode(
ggml_permute(ctxL,
ggml_reshape_3d(ctxL,
Vcur,
n_state/n_head, n_head, N),
n_state/n_head, n_head, n_ctx),
1, 2, 0, 3),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, N, n_state/n_head, n_head)
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_ctx, n_state/n_head, n_head)
);
struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false);
@ -1210,14 +1248,14 @@ bool whisper_encode(
ggml_permute(ctxL,
ggml_cpy(ctxL,
Qcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * K =
ggml_permute(ctxL,
ggml_cpy(ctxL,
Kcur,
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
// K * Q
@ -1235,7 +1273,7 @@ bool whisper_encode(
// ggml_permute(ctxL,
// ggml_cpy(ctxL,
// Vcur,
// ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, N)),
// ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_head, n_ctx)),
// 1, 2, 0, 3);
//struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
@ -1245,9 +1283,9 @@ bool whisper_encode(
ggml_permute(ctxL,
ggml_reshape_3d(ctxL,
Vcur,
n_state/n_head, n_head, N),
n_state/n_head, n_head, n_ctx),
0, 2, 1, 3),
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, N, n_head)
ggml_new_tensor_3d(ctxL, GGML_TYPE_F16, n_state/n_head, n_ctx, n_head)
);
struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
@ -1257,7 +1295,7 @@ bool whisper_encode(
cur = ggml_cpy(ctxL,
KQV_merged,
ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, n_ctx));
}
// projection
@ -1411,6 +1449,8 @@ bool whisper_encode(
Vcross),
Vcross);
//struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
//struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_cross_k, n_state*n_ctx, (ggml_element_size(model.memory_cross_k)*n_state)*(il*n_ctx));
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_cross_v, n_state*n_ctx, (ggml_element_size(model.memory_cross_v)*n_state)*(il*n_ctx));
@ -1440,7 +1480,7 @@ bool whisper_encode(
// - n_tokens: number of tokens in the prompt
// - n_past: number of past tokens to prefix the prompt with
//
bool whisper_decode(
static bool whisper_decode(
whisper_context & wctx,
const int n_threads,
const whisper_token * tokens,
@ -1460,7 +1500,7 @@ bool whisper_decode(
const int n_layer = hparams.n_text_layer;
const int N = n_tokens;
const int M = hparams.n_audio_ctx;
const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
struct ggml_init_params params = {
.mem_size = wctx.buf_compute.size(),
@ -1803,10 +1843,12 @@ bool whisper_decode(
}
// the most basic sampling scheme - select the top token
whisper_token_data whisper_sample_best(
static whisper_token_data whisper_sample_best(
const whisper_vocab & vocab,
const float * probs) {
whisper_token_data result;
whisper_token_data result = {
0, 0, 0.0f, 0.0f, 0.0f, -1, -1, 0.0f,
};
int n_logits = vocab.id_to_token.size();
@ -1879,7 +1921,7 @@ whisper_token_data whisper_sample_best(
}
// samples only from the timestamps tokens
whisper_vocab::id whisper_sample_timestamp(
static whisper_vocab::id whisper_sample_timestamp(
const whisper_vocab & vocab,
const float * probs) {
int n_logits = vocab.id_to_token.size();
@ -1931,7 +1973,7 @@ static std::string to_timestamp(int64_t t, bool comma = false) {
// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
void dft(const std::vector<float> & in, std::vector<float> & out) {
static void dft(const std::vector<float> & in, std::vector<float> & out) {
int N = in.size();
out.resize(N*2);
@ -1955,7 +1997,7 @@ void dft(const std::vector<float> & in, std::vector<float> & out) {
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
void fft(const std::vector<float> & in, std::vector<float> & out) {
static void fft(const std::vector<float> & in, std::vector<float> & out) {
out.resize(in.size()*2);
int N = in.size();
@ -2006,7 +2048,7 @@ void fft(const std::vector<float> & in, std::vector<float> & out) {
}
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
bool log_mel_spectrogram(
static bool log_mel_spectrogram(
const float * samples,
const int n_samples,
const int sample_rate,
@ -2015,6 +2057,7 @@ bool log_mel_spectrogram(
const int n_mel,
const int n_threads,
const whisper_filters & filters,
const bool speed_up,
whisper_mel & mel) {
// Hanning window
@ -2028,7 +2071,7 @@ bool log_mel_spectrogram(
mel.n_len = (n_samples)/fft_step;
mel.data.resize(mel.n_mel*mel.n_len);
const int n_fft = 1 + fft_size/2;
const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2);
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
//printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
@ -2075,6 +2118,13 @@ bool log_mel_spectrogram(
//}
}
if (speed_up) {
// scale down in the frequency domain results in a speed up in the time domain
for (int j = 0; j < n_fft; j++) {
fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]);
}
}
// mel spectrogram
for (int j = 0; j < mel.n_mel; j++) {
double sum = 0.0;
@ -2155,7 +2205,21 @@ void whisper_free(struct whisper_context * ctx) {
int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
const int64_t t_start_us = ggml_time_us();
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, ctx->mel)) {
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, ctx->mel)) {
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
ctx->t_mel_us = ggml_time_us() - t_start_us;
return 0;
}
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
const int64_t t_start_us = ggml_time_us();
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, 2*WHISPER_N_FFT, 2*WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, ctx->mel)) {
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
@ -2323,14 +2387,25 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
/*.duration_ms =*/ 0,
/*.translate =*/ false,
/*.no_context =*/ false,
/*.single_segment =*/ false,
/*.print_special_tokens =*/ false,
/*.print_progress =*/ true,
/*.print_realtime =*/ false,
/*.print_timestamps =*/ true,
/*.token_timestamps =*/ false,
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.max_tokens =*/ 0,
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
/*.language =*/ "en",
/*.greedy =*/ {
@ -2355,14 +2430,25 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
/*.duration_ms =*/ 0,
/*.translate =*/ false,
/*.no_context =*/ false,
/*.single_segment =*/ false,
/*.print_special_tokens =*/ false,
/*.print_progress =*/ true,
/*.print_realtime =*/ false,
/*.print_timestamps =*/ true,
/*.token_timestamps =*/ false,
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.max_tokens =*/ 0,
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
/*.language =*/ "en",
/*.greedy =*/ {
@ -2384,6 +2470,68 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
return result;
}
// forward declarations
static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window);
static void whisper_exp_compute_token_level_timestamps(
struct whisper_context * ctx,
int i_segment,
float thold_pt,
float thold_ptsum);
// Wrap the last segment of ctx->result_all so that each resulting segment holds
// at most max_len bytes of token text (lengths are measured with strlen).
//
// The last segment is split in place: it is truncated at the first token that
// would exceed the limit, and a new segment holding the remaining tokens is
// appended and then examined the same way. Split points take their time from
// the t0 of the token that starts the new segment.
//
// Returns the number of segments the original last segment now occupies
// (1 when no split was necessary).
static int whisper_wrap_segment(struct whisper_context * ctx, int max_len) {
// work on a copy: ctx->result_all is modified (push_back/resize) inside the loop
auto segment = ctx->result_all.back();
int res = 1;
// acc: text length accumulated for the segment currently being built
int acc = 0;
std::string text;
for (int i = 0; i < (int) segment.tokens.size(); i++) {
const auto & token = segment.tokens[i];
// tokens with id >= EOT contribute no text and never trigger a split
if (token.id >= whisper_token_eot(ctx)) {
continue;
}
const auto txt = whisper_token_to_str(ctx, token.id);
const int cur = strlen(txt);
// "i > 0" keeps at least one token per segment, so a single over-long token cannot loop forever
if (acc + cur > max_len && i > 0) {
// split here
ctx->result_all.back().text = std::move(text);
ctx->result_all.back().t1 = token.t0;
ctx->result_all.back().tokens.resize(i);
ctx->result_all.push_back({});
ctx->result_all.back().t0 = token.t0;
ctx->result_all.back().t1 = segment.t1;
// add tokens [i, end] to the new segment
ctx->result_all.back().tokens.insert(
ctx->result_all.back().tokens.end(),
segment.tokens.begin() + i,
segment.tokens.end());
acc = 0;
text = "";
// continue with the freshly appended segment; "token" must not be used past this point
segment = ctx->result_all.back();
// the loop increment brings i back to 0, i.e. the first token of the new segment
i = -1;
res++;
} else {
acc += cur;
text += txt;
}
}
// whatever text is left belongs to the (possibly new) last segment
ctx->result_all.back().text = std::move(text);
return res;
}
int whisper_full(
struct whisper_context * ctx,
struct whisper_full_params params,
@ -2395,17 +2543,32 @@ int whisper_full(
result_all.clear();
// compute log mel spectrogram
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
return -1;
if (params.speed_up) {
if (whisper_pcm_to_mel_phase_vocoder(ctx, samples, n_samples, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
return -1;
}
} else {
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
return -1;
}
}
if (params.token_timestamps) {
ctx->t_beg = 0;
ctx->t_last = 0;
ctx->tid_last = 0;
ctx->energy = get_signal_energy(samples, n_samples, 32);
}
const int seek_start = params.offset_ms/10;
const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len(ctx) : params.duration_ms/10);
// if length of spectrogram is less than 1s (100 samples), then return
// basically don't process anything that is less than 1s
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
if (whisper_n_len(ctx) < 100 + seek_start) {
if (seek_end < 100 + seek_start) {
return 0;
}
@ -2415,6 +2578,9 @@ int whisper_full(
prompt_past.clear();
}
// overwrite audio_ctx
ctx->exp_n_audio_ctx = params.audio_ctx;
// these tokens determine the task that will be performed
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
if (whisper_is_multilingual(ctx)) {
@ -2438,7 +2604,7 @@ int whisper_full(
// main loop
int seek = seek_start;
while (true) {
int progress_cur = (100*seek)/whisper_n_len(ctx);
const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
while (progress_cur >= progress_prev + progress_step) {
progress_prev += progress_step;
if (params.print_progress) {
@ -2446,7 +2612,7 @@ int whisper_full(
}
}
if (seek + 100 >= whisper_n_len(ctx)) {
if (seek + 100 >= seek_end) {
break;
}
@ -2525,15 +2691,21 @@ int whisper_full(
//}
// end of text token
if (token.id == whisper_token_eot(ctx)) {
if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
if (result_len == 0) {
if (seek + seek_delta + 100 >= whisper_n_len(ctx)) {
if (seek + seek_delta + 100 >= seek_end) {
result_len = i + 1;
} else {
// TODO: figure out how to resolve this
fprintf(stderr, "\n%s: failed to generate timestamp token - this should not happen\n\n", __func__);
}
}
if (params.single_segment) {
result_len = i + 1;
seek_delta = 100*WHISPER_CHUNK_SIZE;
}
break;
}
@ -2549,6 +2721,7 @@ int whisper_full(
}
}
// shrink down to result_len
tokens_cur.resize(result_len);
for (const auto & r : tokens_cur) {
@ -2574,21 +2747,35 @@ int whisper_full(
if (tokens_cur[i].id > whisper_token_beg(ctx)) {
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
if (!text.empty()) {
const auto tt0 = params.speed_up ? 2*t0 : t0;
const auto tt1 = params.speed_up ? 2*t1 : t1;
if (params.print_realtime) {
if (params.print_timestamps) {
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
} else {
printf("%s", text.c_str());
fflush(stdout);
}
}
result_all.push_back({ t0, t1, text, {} });
result_all.push_back({ tt0, tt1, text, {} });
for (int j = i0; j <= i; j++) {
result_all.back().tokens.push_back(tokens_cur[j]);
}
int n_new = 1;
if (params.token_timestamps) {
whisper_exp_compute_token_level_timestamps(
ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
if (params.max_len > 0) {
n_new = whisper_wrap_segment(ctx, params.max_len);
}
}
if (params.new_segment_callback) {
params.new_segment_callback(ctx, params.new_segment_callback_user_data);
params.new_segment_callback(ctx, n_new, params.new_segment_callback_user_data);
}
}
text = "";
@ -2604,21 +2791,35 @@ int whisper_full(
if (!text.empty()) {
const auto t1 = seek + seek_delta;
const auto tt0 = params.speed_up ? 2*t0 : t0;
const auto tt1 = params.speed_up ? 2*t1 : t1;
if (params.print_realtime) {
if (params.print_timestamps) {
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
} else {
printf("%s", text.c_str());
fflush(stdout);
}
}
result_all.push_back({ t0, t1, text, {} });
result_all.push_back({ tt0, tt1, text, {} });
for (int j = i0; j < (int) tokens_cur.size(); j++) {
result_all.back().tokens.push_back(tokens_cur[j]);
}
int n_new = 1;
if (params.token_timestamps) {
whisper_exp_compute_token_level_timestamps(
ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
if (params.max_len > 0) {
n_new = whisper_wrap_segment(ctx, params.max_len);
}
}
if (params.new_segment_callback) {
params.new_segment_callback(ctx, params.new_segment_callback_user_data);
params.new_segment_callback(ctx, n_new, params.new_segment_callback_user_data);
}
}
}
@ -2684,7 +2885,7 @@ int whisper_full_parallel(
// key/value memory for the cross-attention layer
{
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_audio_ctx = hparams.n_audio_ctx;
const int n_mem = n_text_layer*n_audio_ctx;
const int n_elements = n_text_state*n_mem;
@ -2752,7 +2953,7 @@ int whisper_full_parallel(
// call the new_segment_callback for each segment
if (params.new_segment_callback) {
params.new_segment_callback(ctx, params.new_segment_callback_user_data);
params.new_segment_callback(ctx, 1, params.new_segment_callback_user_data);
}
}
@ -2828,3 +3029,304 @@ const char * whisper_print_system_info() {
return s.c_str();
}
// =================================================================================================
//
// Experimental stuff below
//
// Not sure if these should be part of the library at all, because the quality of the results is not
// guaranteed. Might get removed at some point unless a robust algorithm implementation is found
//
// =================================================================================================
//
// token-level timestamps
//
// Convert a timestamp t (scaled so that 100 units correspond to WHISPER_SAMPLE_RATE
// samples) to a sample index, clamped to [0, n_samples - 1].
static int timestamp_to_sample(int64_t t, int n_samples) {
    const int i_sample = (int) ((t*WHISPER_SAMPLE_RATE)/100);
    const int i_last   = (int) n_samples - 1;

    // upper clamp first, then lower clamp - same order as nested min/max
    const int i_capped = i_sample < i_last ? i_sample : i_last;

    return i_capped > 0 ? i_capped : 0;
}
// Convert a sample index to a timestamp (100 timestamp units per WHISPER_SAMPLE_RATE samples).
//
// The multiplication is carried out in 64 bits: in plain int, 100*i_sample
// overflows once i_sample exceeds INT_MAX/100 (about 21.4 million samples).
static int64_t sample_to_timestamp(int i_sample) {
    return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
}
// Heuristic "pronunciation cost" of a piece of text: the sum of per-character weights.
//   ' '            -> 0.01
//   ','            -> 2
//   '.', '!', '?'  -> 3
//   '0'..'9'       -> 3
//   anything else  -> 1
// obviously, can be improved
static float voice_length(const std::string & text) {
    float total = 0.0f;

    for (const char ch : text) {
        switch (ch) {
            case ' ':
                total += 0.01f;
                break;
            case ',':
                total += 2.00f;
                break;
            case '.':
            case '!':
            case '?':
                total += 3.00f;
                break;
            default:
                total += (ch >= '0' && ch <= '9') ? 3.00f : 1.00f;
                break;
        }
    }

    return total;
}
// Smoothed signal energy: for every sample i, sum |signal[k]| over the window
// [i - hw, i + hw] (clipped to the valid index range) and divide by the full
// window length 2*hw + 1, where hw = n_samples_per_half_window.
// Near the edges the divisor is still 2*hw + 1 even though fewer samples are summed.
static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window) {
    const int hw = n_samples_per_half_window;

    std::vector<float> energy(n_samples);

    for (int i = 0; i < n_samples; i++) {
        // clip the window to the available samples
        const int k_first = std::max(0, i - hw);
        const int k_last  = std::min(n_samples - 1, i + hw);

        float acc = 0;
        for (int k = k_first; k <= k_last; k++) {
            acc += fabs(signal[k]);
        }

        energy[i] = acc/(2*hw + 1);
    }

    return energy;
}
// [EXPERIMENTAL] Assign start/end times (t0/t1) and a voice length (vlen) to every
// token of segment i_segment in ctx->result_all.
//
// Inputs:
//   ctx          - provides result_all, the signal energy (ctx->energy) and the running
//                  state t_beg / t_last / tid_last, which this function updates
//   i_segment    - index of the segment in ctx->result_all to process
//   thold_pt     - a token's timestamp is trusted only if token.pt    > thold_pt
//   thold_ptsum  - ... and token.ptsum > thold_ptsum
//
// Steps:
//   1. anchor tokens whose timestamp token passes the thresholds
//   2. spread the remaining tokens over the gaps, proportionally to their voice length
//   3. fix up overlaps between neighbouring tokens
//   4. VAD: expand or contract each text token using the signal energy
//
// Returns early (after printing a message) when no signal energy is available, and
// without further processing when the segment has zero or one token.
static void whisper_exp_compute_token_level_timestamps(
        struct whisper_context * ctx,
        int i_segment,
        float thold_pt,
        float thold_ptsum) {
    auto & segment = ctx->result_all[i_segment];
    auto & tokens  = segment.tokens;

    const int n_samples = ctx->energy.size();

    if (n_samples == 0) {
        fprintf(stderr, "%s: no signal data available\n", __func__);
        return;
    }

    const int64_t t0 = segment.t0;
    const int64_t t1 = segment.t1;

    const int n = tokens.size();

    if (n == 0) {
        return;
    }

    if (n == 1) {
        tokens[0].t0 = t0;
        tokens[0].t1 = t1;

        return;
    }

    // from here on n >= 2, so tokens[1] and tokens[n - 2] are valid

    auto & t_beg    = ctx->t_beg;
    auto & t_last   = ctx->t_last;
    auto & tid_last = ctx->tid_last;

    for (int j = 0; j < n; ++j) {
        auto & token = tokens[j];

        if (j == 0) {
            if (token.id == whisper_token_beg(ctx)) {
                tokens[j    ].t0 = t0;
                tokens[j    ].t1 = t0;
                tokens[j + 1].t0 = t0;

                t_beg    = t0;
                t_last   = t0;
                tid_last = whisper_token_beg(ctx);
            } else {
                // no begin token: continue from where the previous call ended
                tokens[j    ].t0 = t_last;
            }
        }

        // time implied by the token's timestamp id, relative to t_beg
        const int64_t tt = t_beg + 2*(token.tid - whisper_token_beg(ctx));

        tokens[j].vlen = voice_length(whisper_token_to_str(ctx, token.id));

        // trust the timestamp only if it is probable enough, moves forward and stays inside the segment
        if (token.pt > thold_pt && token.ptsum > thold_ptsum && token.tid > tid_last && tt <= t1) {
            if (j > 0) {
                tokens[j - 1].t1 = tt;
            }
            tokens[j].t0 = tt;
            tid_last = token.tid;
        }
    }

    tokens[n - 2].t1 = t1;
    tokens[n - 1].t0 = t1;
    tokens[n - 1].t1 = t1;

    t_last = t1;

    // find intervals of tokens with unknown timestamps
    // fill the timestamps by proportionally splitting the interval based on the token voice lengths
    {
        int p0 = 0;
        int p1 = 0;

        while (true) {
            // a negative t1 marks a token whose end time is still unknown
            while (p1 < n && tokens[p1].t1 < 0) {
                p1++;
            }

            if (p1 >= n) {
                p1--;
            }

            if (p1 > p0) {
                double psum = 0.0;
                for (int j = p0; j <= p1; j++) {
                    psum += tokens[j].vlen;
                }

                //printf("analyzing %d - %d, psum = %f\n", p0, p1, psum);

                const double dt = tokens[p1].t1 - tokens[p0].t0;

                // split the time proportionally to the voice length
                for (int j = p0 + 1; j <= p1; j++) {
                    const double ct = tokens[j - 1].t0 + dt*tokens[j - 1].vlen/psum;

                    tokens[j - 1].t1 = ct;
                    tokens[j    ].t0 = ct;
                }
            }

            p1++;
            p0 = p1;
            if (p1 >= n) {
                break;
            }
        }
    }

    // fix up (just in case)
    for (int j = 0; j < n - 1; j++) {
        // NOTE(review): this copies a negative t1 into the next token's t0 - confirm this is intended
        if (tokens[j].t1 < 0) {
            tokens[j + 1].t0 = tokens[j].t1;
        }

        if (j > 0) {
            if (tokens[j - 1].t1 > tokens[j].t0) {
                tokens[j].t0 = tokens[j - 1].t1;
                tokens[j].t1 = std::max(tokens[j].t0, tokens[j].t1);
            }
        }
    }

    // VAD
    // expand or contract tokens based on voice activity
    {
        // half-window, in samples, added on both sides of the token when averaging the energy
        const int hw = WHISPER_SAMPLE_RATE/8;

        for (int j = 0; j < n; j++) {
            if (tokens[j].id >= whisper_token_eot(ctx)) {
                continue;
            }

            int s0 = timestamp_to_sample(tokens[j].t0, n_samples);
            int s1 = timestamp_to_sample(tokens[j].t1, n_samples);

            const int ss0 = std::max(s0 - hw, 0);
            const int ss1 = std::min(s1 + hw, n_samples);

            const int ns = ss1 - ss0;

            float sum = 0.0f;

            for (int k = ss0; k < ss1; k++) {
                sum += ctx->energy[k];
            }

            // threshold: half of the mean energy around the token
            const float thold = 0.5*sum/ns;

            {
                int k = s0;
                if (ctx->energy[k] > thold && j > 0) {
                    // token starts inside voiced audio: move the start backwards
                    while (k > 0 && ctx->energy[k] > thold) {
                        k--;
                    }
                    tokens[j].t0 = sample_to_timestamp(k);
                    if (tokens[j].t0 < tokens[j - 1].t1) {
                        tokens[j].t0 = tokens[j - 1].t1;
                    } else {
                        s0 = k;
                    }
                } else {
                    // token starts in silence: move the start forwards
                    while (ctx->energy[k] < thold && k < s1) {
                        k++;
                    }
                    s0 = k;
                    tokens[j].t0 = sample_to_timestamp(k);
                }
            }

            {
                int k = s1;
                if (ctx->energy[k] > thold) {
                    // token ends inside voiced audio: move the end forwards
                    while (k < n_samples - 1 && ctx->energy[k] > thold) {
                        k++;
                    }
                    tokens[j].t1 = sample_to_timestamp(k);
                    // compare against the token count (n), not the sample count (ns),
                    // so that tokens[j + 1] is never read past the end
                    if (j < n - 1 && tokens[j].t1 > tokens[j + 1].t0) {
                        tokens[j].t1 = tokens[j + 1].t0;
                    } else {
                        s1 = k;
                    }
                } else {
                    // token ends in silence: move the end backwards
                    while (ctx->energy[k] < thold && k > s0) {
                        k--;
                    }
                    s1 = k;
                    tokens[j].t1 = sample_to_timestamp(k);
                }
            }
        }
    }

    // fixed token expand (optional)
    //{
    //    const int t_expand = 0;

    //    for (int j = 0; j < n; j++) {
    //        if (j > 0) {
    //            tokens[j].t0 = std::max(0, (int) (tokens[j].t0 - t_expand));
    //        }
    //        if (j < n - 1) {
    //            tokens[j].t1 = tokens[j].t1 + t_expand;
    //        }
    //    }
    //}

    // debug info
    //for (int j = 0; j < n; ++j) {
    //    const auto & token = tokens[j];
    //    const auto tt = token.pt > thold_pt && token.ptsum > 0.01 ? whisper_token_to_str(ctx, token.tid) : "[?]";
    //    printf("%s: %10s %6.3f %6.3f %6.3f %6.3f %5d %5d '%s'\n", __func__,
    //            tt, token.p, token.pt, token.ptsum, token.vlen, (int) token.t0, (int) token.t1, whisper_token_to_str(ctx, token.id));

    //    if (tokens[j].id >= whisper_token_eot(ctx)) {
    //        continue;
    //    }
    //}
}

View File

@ -68,14 +68,21 @@ extern "C" {
typedef int whisper_token;
struct whisper_token_data {
typedef struct whisper_token_data {
whisper_token id; // token id
whisper_token tid; // forced timestamp token id
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
};
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
int64_t t0; // start time of the token
int64_t t1; // end time of the token
float vlen; // voice length of the token
} whisper_token_data;
// Allocates all memory needed for the model and loads the model from the given file.
// Returns NULL on failure.
@ -129,7 +136,7 @@ extern "C" {
// You can also implement your own sampling method using the whisper_get_probs() function.
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
WHISPER_API struct whisper_token_data whisper_sample_best(struct whisper_context * ctx);
WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
// Return the id of the specified language, returns -1 if not found
@ -172,22 +179,35 @@ extern "C" {
// Text segment callback
// Called on every newly generated text segment
// Use the whisper_full_...() functions to obtain the text segments
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, void * user_data);
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
struct whisper_full_params {
enum whisper_sampling_strategy strategy;
int n_threads;
int n_max_text_ctx;
int offset_ms;
int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms
bool translate;
bool no_context;
bool single_segment; // force single segment output (useful for streaming)
bool print_special_tokens;
bool print_progress;
bool print_realtime;
bool print_timestamps;
// [EXPERIMENTAL] token-level timestamps
bool token_timestamps; // enable token-level timestamps
float thold_pt; // timestamp token probability threshold (~0.01)
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
int max_len; // max segment length in characters
int max_tokens; // max tokens per segment (0 = no limit)
// [EXPERIMENTAL] speed-up techniques
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
int audio_ctx; // overwrite the audio context size (0 = use default)
const char * language;
struct {
@ -244,7 +264,7 @@ extern "C" {
// Get token data for the specified token in the specified segment.
// This contains probabilities, timestamps, etc.
WHISPER_API struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);