mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-03 07:50:28 +02:00
Compare commits
11 Commits
Author | SHA1 | Date | |
---|---|---|---|
59c997ca2d | |||
459753342d | |||
9764782bd9 | |||
3b010f9bed | |||
113fcec513 | |||
cfc06bf8df | |||
2bfe0ebc0f | |||
4dd7119deb | |||
ab1916fc59 | |||
a1c1583cc7 | |||
d012b5c7e4 |
48
.github/workflows/examples.yml
vendored
Normal file
48
.github/workflows/examples.yml
vendored
Normal file
@ -0,0 +1,48 @@
|
||||
name: Examples Tests
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- examples/addon.node/**
|
||||
- whisper.h
|
||||
pull_request:
|
||||
paths:
|
||||
- examples/addon.node/**
|
||||
- whisper.h
|
||||
|
||||
jobs:
|
||||
addon_node-ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
node-version: [ 16.x, 18.x ]
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Use Node.js ${{ matrix.node-version }}
|
||||
uses: actions/setup-node@v1
|
||||
with:
|
||||
node-version: ${{ matrix.node-version }}
|
||||
cache: 'npm'
|
||||
|
||||
- name: Install package.json dependencies
|
||||
working-directory: ./examples/addon.node
|
||||
run: npm install
|
||||
|
||||
- name: Compile addon.node
|
||||
run: npx cmake-js compile -T whisper-addon -B Release
|
||||
|
||||
- name: Download test model
|
||||
run: |
|
||||
bash ./models/download-ggml-model.sh base.en
|
||||
- name: Test
|
||||
run: |
|
||||
cd examples/addon.node
|
||||
npm run test
|
@ -226,10 +226,13 @@ target_compile_definitions(${TARGET} PUBLIC
|
||||
${WHISPER_EXTRA_FLAGS}
|
||||
)
|
||||
|
||||
set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")
|
||||
|
||||
install(TARGETS ${TARGET}
|
||||
LIBRARY DESTINATION lib
|
||||
ARCHIVE DESTINATION lib/static
|
||||
RUNTIME DESTINATION bin
|
||||
PUBLIC_HEADER DESTINATION include
|
||||
)
|
||||
|
||||
#
|
||||
@ -242,7 +245,7 @@ add_subdirectory(bindings)
|
||||
# programs, examples and tests
|
||||
#
|
||||
|
||||
if (WHISPER_BUILD_TESTS)
|
||||
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
enable_testing()
|
||||
add_subdirectory(tests)
|
||||
endif ()
|
||||
|
@ -465,6 +465,9 @@ in [models](models).
|
||||
- [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
|
||||
- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
|
||||
- [X] .NET:
|
||||
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
|
||||
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
|
||||
- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)
|
||||
|
||||
## Examples
|
||||
|
@ -24,3 +24,8 @@ target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
|
||||
#==================================================================
|
||||
|
||||
target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
|
||||
# Generate node.lib
|
||||
execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
|
||||
endif()
|
||||
|
@ -14,14 +14,14 @@ npm install
|
||||
Make sure you are in the project root directory, then compile the addon with cmake-js.
|
||||
|
||||
```shell
|
||||
npx cmake-js compile -T whisper-addon
|
||||
npx cmake-js compile -T whisper-addon -B Release
|
||||
```
|
||||
|
||||
For Electron addon and cmake-js options, you can see [cmake-js](https://github.com/cmake-js/cmake-js) and make very few configuration changes.
|
||||
|
||||
> For example, to specify a custom cmake path:
|
||||
> ```shell
|
||||
> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon
|
||||
> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon -B Release
|
||||
> ```
|
||||
|
||||
## Run
|
||||
|
15
examples/addon.node/__test__/whisper.spec.js
Normal file
15
examples/addon.node/__test__/whisper.spec.js
Normal file
@ -0,0 +1,15 @@
|
||||
const path = require('path');
|
||||
const { whisper } = require(path.join(__dirname, '../../../build/Release/whisper-addon'));
|
||||
|
||||
const whisperParamsMock = {
|
||||
language: 'en',
|
||||
model: path.join(__dirname, '../../../models/ggml-base.en.bin'),
|
||||
fname_inp: path.join(__dirname, '../../../samples/jfk.wav'),
|
||||
};
|
||||
|
||||
describe("Run whisper.node", () => {
|
||||
|
||||
test("it should receive a non-empty value", () => {
|
||||
expect(whisper(whisperParamsMock).length).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
@ -1,3 +1,4 @@
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
@ -398,9 +399,9 @@ Napi::Object whisper(const Napi::CallbackInfo& info) {
|
||||
}
|
||||
|
||||
Napi::Object res = Napi::Array::New(env, result.size());
|
||||
for (u_int32_t i = 0; i < result.size(); ++i) {
|
||||
for (uint64_t i = 0; i < result.size(); ++i) {
|
||||
Napi::Object tmp = Napi::Array::New(env, 3);
|
||||
for (u_int32_t j = 0; j < 3; ++j) {
|
||||
for (uint64_t j = 0; j < 3; ++j) {
|
||||
tmp[j] = Napi::String::New(env, result[i][j]);
|
||||
}
|
||||
res[i] = tmp;
|
||||
|
@ -5,8 +5,12 @@
|
||||
"main": "index.js",
|
||||
"author": "Qanhe Chen",
|
||||
"license": "MIT",
|
||||
"scripts": {
|
||||
"test": "jest"
|
||||
},
|
||||
"devDependencies": {
|
||||
"cmake-js": "^7.1.1",
|
||||
"jest": "^29.4.0",
|
||||
"node-addon-api": "^5.0.0"
|
||||
}
|
||||
}
|
||||
|
10
examples/chess/CMakeLists.txt
Normal file
10
examples/chess/CMakeLists.txt
Normal file
@ -0,0 +1,10 @@
|
||||
if (WHISPER_SUPPORT_SDL2)
|
||||
# chess
|
||||
set(TARGET chess)
|
||||
add_executable(${TARGET} chess.cpp)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
|
||||
target_link_libraries(${TARGET} PRIVATE common whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif ()
|
634
examples/chess/chess.cpp
Normal file
634
examples/chess/chess.cpp
Normal file
@ -0,0 +1,634 @@
|
||||
// Input chess moves via voice
|
||||
//
|
||||
|
||||
#include "common.h"
#include "whisper.h"

#include <SDL.h>
#include <SDL_audio.h>

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
|
||||
|
||||
// Format a timestamp given in units of 10 ms as "MM:SS.mmm".
//
//   500 -> 00:05.000
//   550 -> 00:05.500
//  6000 -> 01:00.000
//
// The remainder after whole seconds is in centiseconds, so it has to be
// scaled by 10 before it is printed in the 3-digit millisecond field.
std::string to_timestamp(int64_t t) {
    const int64_t total_sec = t/100;
    const int64_t msec      = (t - total_sec*100)*10;
    const int64_t min       = total_sec/60;
    const int64_t sec       = total_sec - min*60;

    char buf[32];
    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);

    return std::string(buf);
}
|
||||
|
||||
// command-line parameters
|
||||
// Command-line parameters of the example, with their default values.
struct whisper_params {
    // std::thread::hardware_concurrency() may return 0 when the value cannot be
    // determined, so clamp the default to the range [1, 4].
    int32_t n_threads  = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t step_ms    = 3000;  // audio step size (ms)
    int32_t length_ms  = 10000; // audio length (ms)
    int32_t keep_ms    = 200;   // audio to keep from the previous step (ms)
    int32_t capture_id = -1;    // capture device ID (negative: default device)
    int32_t max_tokens = 32;    // maximum number of tokens per audio chunk
    int32_t audio_ctx  = 0;     // audio context size (0 - all)

    float vad_thold  = 0.6f;    // voice activity detection threshold
    float freq_thold = 100.0f;  // high-pass frequency cutoff

    bool translate     = false; // translate from source language to english
    bool print_special = false; // print special tokens
    bool no_context    = true;  // do not keep context between audio chunks
    bool no_timestamps = false;

    std::string language = "en";                      // spoken language
    std::string model    = "models/ggml-base.en.bin"; // model path
    std::string fname_inp;                            // input WAV file path
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
|
||||
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp = argv[++i]; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
|
||||
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
|
||||
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", params.fname_inp.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
//
|
||||
// SDL Audio capture
|
||||
//
|
||||
|
||||
class audio_async {
|
||||
public:
|
||||
audio_async(int len_ms);
|
||||
~audio_async();
|
||||
|
||||
bool init(int capture_id, int sample_rate);
|
||||
|
||||
// start capturing audio via the provided SDL callback
|
||||
// keep last len_ms seconds of audio in a circular buffer
|
||||
bool resume();
|
||||
bool pause();
|
||||
bool clear();
|
||||
|
||||
// callback to be called by SDL
|
||||
void callback(uint8_t * stream, int len);
|
||||
|
||||
// get audio data from the circular buffer
|
||||
void get(int ms, std::vector<float> & audio);
|
||||
|
||||
private:
|
||||
SDL_AudioDeviceID m_dev_id_in = 0;
|
||||
|
||||
int m_len_ms = 0;
|
||||
int m_sample_rate = 0;
|
||||
|
||||
std::atomic_bool m_running;
|
||||
std::mutex m_mutex;
|
||||
|
||||
std::vector<float> m_audio;
|
||||
std::vector<float> m_audio_new;
|
||||
size_t m_audio_pos = 0;
|
||||
size_t m_audio_len = 0;
|
||||
};
|
||||
|
||||
// Remember the requested buffer length (in ms) and start in the paused state.
audio_async::audio_async(int len_ms)
    : m_len_ms(len_ms),
      m_running(false) {
}
|
||||
|
||||
// Close the capture device, if one was opened.
audio_async::~audio_async() {
    if (m_dev_id_in == 0) {
        return;
    }

    SDL_CloseAudioDevice(m_dev_id_in);
}
|
||||
|
||||
bool audio_async::init(int capture_id, int sample_rate) {
|
||||
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
||||
|
||||
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
|
||||
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
|
||||
|
||||
{
|
||||
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
|
||||
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
|
||||
for (int i = 0; i < nDevices; i++) {
|
||||
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
|
||||
}
|
||||
}
|
||||
|
||||
SDL_AudioSpec capture_spec_requested;
|
||||
SDL_AudioSpec capture_spec_obtained;
|
||||
|
||||
SDL_zero(capture_spec_requested);
|
||||
SDL_zero(capture_spec_obtained);
|
||||
|
||||
capture_spec_requested.freq = sample_rate;
|
||||
capture_spec_requested.format = AUDIO_F32;
|
||||
capture_spec_requested.channels = 1;
|
||||
capture_spec_requested.samples = 1024;
|
||||
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
|
||||
audio_async * audio = (audio_async *) userdata;
|
||||
audio->callback(stream, len);
|
||||
};
|
||||
capture_spec_requested.userdata = this;
|
||||
|
||||
if (capture_id >= 0) {
|
||||
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
|
||||
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
} else {
|
||||
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
|
||||
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
}
|
||||
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
|
||||
m_dev_id_in = 0;
|
||||
|
||||
return false;
|
||||
} else {
|
||||
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
|
||||
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
||||
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
|
||||
capture_spec_requested.format);
|
||||
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
|
||||
capture_spec_requested.channels);
|
||||
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
||||
}
|
||||
|
||||
m_sample_rate = capture_spec_obtained.freq;
|
||||
|
||||
m_audio.resize((m_sample_rate*m_len_ms)/1000);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Unpause the capture device. Returns false if no device is open or if
// capturing is already in progress.
bool audio_async::resume() {
    if (m_dev_id_in == 0) {
        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
        return false;
    }

    if (m_running.load()) {
        fprintf(stderr, "%s: already running!\n", __func__);
        return false;
    }

    SDL_PauseAudioDevice(m_dev_id_in, 0);
    m_running.store(true);

    return true;
}
|
||||
|
||||
// Pause the capture device. Returns false if no device is open or if
// capturing is already paused.
bool audio_async::pause() {
    if (m_dev_id_in == 0) {
        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
        return false;
    }

    if (!m_running.load()) {
        fprintf(stderr, "%s: already paused!\n", __func__);
        return false;
    }

    SDL_PauseAudioDevice(m_dev_id_in, 1);
    m_running.store(false);

    return true;
}
|
||||
|
||||
bool audio_async::clear() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
m_audio_pos = 0;
|
||||
m_audio_len = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// callback to be called by SDL
|
||||
void audio_async::callback(uint8_t * stream, int len) {
|
||||
if (!m_running) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n_samples = len / sizeof(float);
|
||||
|
||||
m_audio_new.resize(n_samples);
|
||||
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
|
||||
|
||||
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (m_audio_pos + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - m_audio_pos;
|
||||
|
||||
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
|
||||
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = m_audio.size();
|
||||
} else {
|
||||
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void audio_async::get(int ms, std::vector<float> & result) {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (ms <= 0) {
|
||||
ms = m_len_ms;
|
||||
}
|
||||
|
||||
size_t n_samples = (m_sample_rate * ms) / 1000;
|
||||
if (n_samples > m_audio_len) {
|
||||
n_samples = m_audio_len;
|
||||
}
|
||||
|
||||
result.resize(n_samples);
|
||||
|
||||
int s0 = m_audio_pos - n_samples;
|
||||
if (s0 < 0) {
|
||||
s0 += m_audio.size();
|
||||
}
|
||||
|
||||
if (s0 + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - s0;
|
||||
|
||||
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
|
||||
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
|
||||
} else {
|
||||
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.keep_ms = std::min(params.keep_ms, params.step_ms);
|
||||
params.length_ms = std::max(params.length_ms, params.step_ms);
|
||||
|
||||
const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
|
||||
|
||||
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
|
||||
|
||||
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
|
||||
|
||||
params.no_timestamps = !use_vad;
|
||||
params.no_context |= use_vad;
|
||||
params.max_tokens = 0;
|
||||
|
||||
// init audio
|
||||
|
||||
audio_async audio(params.length_ms);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
|
||||
// whisper init
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
||||
|
||||
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
|
||||
std::vector<float> pcmf32_old;
|
||||
std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
|
||||
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
||||
__func__,
|
||||
n_samples_step,
|
||||
float(n_samples_step)/WHISPER_SAMPLE_RATE,
|
||||
float(n_samples_len )/WHISPER_SAMPLE_RATE,
|
||||
float(n_samples_keep)/WHISPER_SAMPLE_RATE,
|
||||
params.n_threads,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1);
|
||||
|
||||
if (!use_vad) {
|
||||
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
|
||||
} else {
|
||||
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
int n_iter = 0;
|
||||
|
||||
bool is_running = true;
|
||||
|
||||
printf("[Start speaking]");
|
||||
fflush(stdout);
|
||||
|
||||
auto t_last = std::chrono::high_resolution_clock::now();
|
||||
const auto t_start = t_last;
|
||||
|
||||
// main audio loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
{
|
||||
SDL_Event event;
|
||||
while (SDL_PollEvent(&event)) {
|
||||
switch (event.type) {
|
||||
case SDL_QUIT:
|
||||
{
|
||||
is_running = false;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_running) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_running) {
|
||||
break;
|
||||
}
|
||||
|
||||
// process new audio
|
||||
|
||||
if (!use_vad) {
|
||||
while (true) {
|
||||
audio.get(params.step_ms, pcmf32_new);
|
||||
|
||||
if ((int) pcmf32_new.size() > 2*n_samples_step) {
|
||||
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
|
||||
audio.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((int) pcmf32_new.size() >= n_samples_step) {
|
||||
audio.clear();
|
||||
break;
|
||||
}
|
||||
|
||||
SDL_Delay(1);
|
||||
}
|
||||
|
||||
const int n_samples_new = pcmf32_new.size();
|
||||
|
||||
// take up to params.length_ms audio from previous iteration
|
||||
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
|
||||
|
||||
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
|
||||
|
||||
pcmf32.resize(n_samples_new + n_samples_take);
|
||||
|
||||
for (int i = 0; i < n_samples_take; i++) {
|
||||
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
|
||||
}
|
||||
|
||||
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
|
||||
|
||||
pcmf32_old = pcmf32;
|
||||
} else {
|
||||
const auto t_now = std::chrono::high_resolution_clock::now();
|
||||
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
|
||||
|
||||
if (t_diff < 2000) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
audio.get(2000, pcmf32_new);
|
||||
|
||||
if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
|
||||
audio.get(params.length_ms, pcmf32);
|
||||
} else {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
t_last = t_now;
|
||||
}
|
||||
|
||||
// run the inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = !use_vad;
|
||||
wparams.max_tokens = params.max_tokens;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
|
||||
// disable temperature fallback
|
||||
wparams.temperature_inc = -1.0f;
|
||||
|
||||
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
|
||||
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
|
||||
|
||||
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
||||
return 6;
|
||||
}
|
||||
|
||||
// print result;
|
||||
{
|
||||
if (!use_vad) {
|
||||
printf("\33[2K\r");
|
||||
|
||||
// print long empty line to clear the previous line
|
||||
printf("%s", std::string(100, ' ').c_str());
|
||||
|
||||
printf("\33[2K\r");
|
||||
} else {
|
||||
const int64_t t1 = (t_last - t_start).count()/1000000;
|
||||
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
|
||||
|
||||
printf("\n");
|
||||
printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
if (params.no_timestamps) {
|
||||
printf("%s", text);
|
||||
fflush(stdout);
|
||||
} else {
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
||||
}
|
||||
}
|
||||
|
||||
if (use_vad){
|
||||
printf("\n");
|
||||
printf("### Transcription %d END\n", n_iter);
|
||||
}
|
||||
}
|
||||
|
||||
++n_iter;
|
||||
|
||||
if (!use_vad && (n_iter % n_new_line) == 0) {
|
||||
printf("\n");
|
||||
|
||||
// keep part of the audio for next iteration to try to mitigate word boundary issues
|
||||
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
|
||||
|
||||
// Add tokens of the last full length segment as the prompt
|
||||
if (!params.no_context) {
|
||||
prompt_tokens.clear();
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const int token_count = whisper_full_n_tokens(ctx, i);
|
||||
for (int j = 0; j < token_count; ++j) {
|
||||
prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
audio.pause();
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
@ -69,6 +69,7 @@ struct whisper_params {
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool diarize = false;
|
||||
bool split_on_word = false;
|
||||
bool no_fallback = false;
|
||||
bool output_txt = false;
|
||||
bool output_vtt = false;
|
||||
@ -118,6 +119,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
||||
else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
|
||||
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
|
||||
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
|
||||
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
||||
@ -156,6 +158,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
||||
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
||||
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
@ -651,6 +654,7 @@ int main(int argc, char ** argv) {
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
wparams.split_on_word = params.split_on_word;
|
||||
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
|
@ -1,20 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Small shell script to more easily automatically download and transcribe live stream VODs.
|
||||
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
|
||||
# Use `./examples/yt-wsp.sh help` to print help info.
|
||||
#
|
||||
# Sample usage:
|
||||
#
|
||||
# git clone https://github.com/ggerganov/whisper.cpp
|
||||
# cd whisper.cpp
|
||||
# make
|
||||
# ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
|
||||
#
|
||||
# shellcheck disable=2086
|
||||
|
||||
# MIT License
|
||||
|
||||
# Copyright (c) 2022 Daniils Petrovs
|
||||
# Copyright (c) 2023 Jennifer Capasso
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
@ -34,114 +24,181 @@
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# Small shell script to more easily automatically download and transcribe live stream VODs.
|
||||
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
|
||||
# Use `./examples/yt-wsp.sh help` to print help info.
|
||||
#
|
||||
# Sample usage:
|
||||
#
|
||||
# git clone https://github.com/ggerganov/whisper.cpp
|
||||
# cd whisper.cpp
|
||||
# make
|
||||
# ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
|
||||
#
|
||||
|
||||
set -Eeuo pipefail
|
||||
|
||||
# You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage
|
||||
MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en
|
||||
WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable
|
||||
WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from
|
||||
# Resolve the absolute location of this script and the directory that holds it.
# BASH_SOURCE[0] is quoted so that a checkout path containing whitespace is
# passed to realpath as a single argument instead of being word-split.
SCRIPT_PATH="$(realpath -e "${BASH_SOURCE[0]}")";
# Strip the trailing "/<file name>" component to get the containing directory.
SCRIPT_DIR="${SCRIPT_PATH%/*}"
|
||||
|
||||
################################################################################
|
||||
# Documentation on downloading models can be found in the whisper.cpp repo:
|
||||
# https://github.com/ggerganov/whisper.cpp/#usage
|
||||
#
|
||||
# note: unless a multilingual model is specified, WHISPER_LANG will be ignored
|
||||
# and the video will be transcribed as if the audio were in the English language
|
||||
################################################################################
|
||||
MODEL_PATH="${MODEL_PATH:-${SCRIPT_DIR}/../models/ggml-base.en.bin}"
|
||||
|
||||
################################################################################
|
||||
# Where to find the whisper.cpp executable. default to the examples directory
|
||||
# which holds this script in source control
|
||||
################################################################################
|
||||
WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-${SCRIPT_DIR}/../main}";
|
||||
|
||||
# Set to the desired language to be translated into English
|
||||
WHISPER_LANG="${WHISPER_LANG:-en}";
|
||||
|
||||
# Default to 4 threads (this was most performant on my 2020 M1 MBP)
|
||||
WHISPER_THREAD_COUNT="${WHISPER_THREAD_COUNT:-4}";
|
||||
|
||||
# msg TEXT
# Print TEXT to stderr (>&2) so status output stays out of stdout.
# echo -e enables interpretation of backslash escapes in TEXT.
# "${1-}" expands to an empty string when no argument is given, so calling
# msg without arguments does not trip `set -u`.
msg() {
|
||||
echo >&2 -e "${1-}"
|
||||
}
|
||||
|
||||
################################################################################
# create a temporary directory to work in
# set the temp_dir and temp_filename variables
#
# The mktemp template is quoted so that a SCRIPT_DIR containing whitespace is
# passed as one argument rather than being word-split.
################################################################################
temp_dir="$(mktemp -d "${SCRIPT_DIR}/tmp.XXXXXX")";
# yt-dlp is told (via --print-to-file) to record the output file name here.
temp_filename="${temp_dir}/yt-dlp-filename";
|
||||
|
||||
################################################################################
# for now we only take one argument
# TODO: a for loop
#
# "${1-}" (empty default) is used instead of "${1}": the script runs under
# `set -u`, so a bare "${1}" aborts with "unbound variable" when the script is
# started without arguments, before the argument-count check further down gets
# a chance to print the help text.
################################################################################
source_url="${1-}"
|
||||
|
||||
|
||||
title_name="";
|
||||
|
||||
|
||||
cleanup() {
|
||||
msg "Cleaning up..."
|
||||
rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt"
|
||||
local -r clean_me="${1}";
|
||||
|
||||
if [ -d "${clean_me}" ]; then
|
||||
msg "Cleaning up...";
|
||||
rm -rf "${clean_me}";
|
||||
else
|
||||
msg "'${clean_me}' does not appear to be a directory!";
|
||||
exit 1;
|
||||
fi;
|
||||
}
|
||||
|
||||
print_help() {
|
||||
echo "################################################################################"
|
||||
echo "Usage: ./examples/yt-wsp.sh <video_url>"
|
||||
echo "See configurable env variables in the script"
|
||||
echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
|
||||
echo "Requirements: ffmpeg yt-dlp whisper"
|
||||
echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience."
|
||||
echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'"
|
||||
echo "# See configurable env variables in the script; there are many!"
|
||||
echo "# This script will produce an MP4 muxed file in the working directory; it will"
|
||||
echo "# be named for the title and id of the video."
|
||||
echo "# passing in https://youtu.be/VYJtb2YXae8 produces a file named";
|
||||
echo "# 'Why_we_all_need_subtitles_now-VYJtb2YXae8-res.mp4'"
|
||||
echo "# Requirements: ffmpeg yt-dlp whisper.cpp"
|
||||
echo "################################################################################"
|
||||
}
|
||||
|
||||
check_requirements() {
|
||||
if ! command -v ffmpeg &>/dev/null; then
|
||||
echo "ffmpeg is required (https://ffmpeg.org)."
|
||||
echo "ffmpeg is required: https://ffmpeg.org";
|
||||
exit 1
|
||||
fi
|
||||
fi;
|
||||
|
||||
if ! command -v yt-dlp &>/dev/null; then
|
||||
echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)."
|
||||
exit 1
|
||||
fi
|
||||
echo "yt-dlp is required: https://github.com/yt-dlp/yt-dlp";
|
||||
exit 1;
|
||||
fi;
|
||||
|
||||
if ! command -v "${WHISPER_EXECUTABLE}" &>/dev/null; then
|
||||
echo "The C++ implementation of Whisper is required: https://github.com/ggerganov/whisper.cpp"
|
||||
echo "Sample usage:";
|
||||
echo "";
|
||||
echo " git clone https://github.com/ggerganov/whisper.cpp";
|
||||
echo " cd whisper.cpp";
|
||||
echo " make";
|
||||
echo " ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890";
|
||||
echo "";
|
||||
exit 1;
|
||||
fi;
|
||||
|
||||
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
|
||||
WHISPER_EXECUTABLE="./main"
|
||||
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
|
||||
echo "Whisper is required (https://github.com/ggerganov/whisper.cpp):"
|
||||
echo "Sample usage:"
|
||||
echo ""
|
||||
echo " git clone https://github.com/ggerganov/whisper.cpp"
|
||||
echo " cd whisper.cpp"
|
||||
echo " make"
|
||||
echo " ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890"
|
||||
echo ""
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
if [[ $# -lt 1 ]]; then
|
||||
print_help
|
||||
exit 1
|
||||
if [[ "${#}" -lt 1 ]]; then
|
||||
print_help;
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
# Show usage when help is requested, with or without leading dashes.
# A "${1##-*}" == "help" test does not work for dashed spellings: '##-*'
# removes the longest prefix matching '-*', which is the entire argument
# whenever it starts with a dash, so "--help" would expand to "" and be
# treated as a video URL. The accepted spellings are matched explicitly.
case "${1}" in
    help|-h|-help|--help)
        print_help;
        exit 0;
        ;;
esac
|
||||
|
||||
temp_dir="tmp"
|
||||
source_url="$1"
|
||||
check_requirements;
|
||||
|
||||
check_requirements
|
||||
msg "Downloading VOD...";
|
||||
|
||||
msg "Downloading VOD..."
|
||||
|
||||
# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs
|
||||
################################################################################
|
||||
# Download the video, put the dynamic output filename into a variable.
|
||||
# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]
|
||||
# for videos only available to logged-in users.
|
||||
################################################################################
|
||||
yt-dlp \
|
||||
-f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \
|
||||
-o "${temp_dir}/%(title)s-%(id)s.vod.mp4" \
|
||||
--print-to-file "%(filename)s" "${temp_filename}" \
|
||||
--no-simulate \
|
||||
--no-write-auto-subs \
|
||||
--restrict-filenames \
|
||||
--embed-thumbnail \
|
||||
--embed-chapters \
|
||||
--xattrs \
|
||||
"${source_url}" -o "${temp_dir}/vod.mp4"
|
||||
"${source_url}";
|
||||
|
||||
msg "Extracting audio and resampling..."
|
||||
# Derive the title from the file name yt-dlp recorded, minus the ".vod.mp4"
# suffix. The recorded path is read as one whole line and passed as a single
# quoted argument, so whitespace in the path neither causes an "ambiguous
# redirect" nor splits the path into several basename operands.
title_name="$(basename -s .vod.mp4 "$(head -n 1 "${temp_filename}")")";
|
||||
|
||||
ffmpeg -i "${temp_dir}/vod.mp4" \
|
||||
msg "Extracting audio and resampling...";
|
||||
|
||||
ffmpeg -i "${temp_dir}/${title_name}.vod.mp4" \
|
||||
-hide_banner \
|
||||
-vn \
|
||||
-loglevel error \
|
||||
-ar 16000 \
|
||||
-ac 1 \
|
||||
-c:a \
|
||||
pcm_s16le -y "vod-resampled.wav"
|
||||
-c:a pcm_s16le \
|
||||
-y \
|
||||
"${temp_dir}/${title_name}.vod-resampled.wav";
|
||||
|
||||
msg "Transcribing to subtitle file..."
|
||||
msg "Whisper specified at: ${WHISPER_EXECUTABLE}"
|
||||
msg "Transcribing to subtitle file...";
|
||||
msg "Whisper specified at: '${WHISPER_EXECUTABLE}'";
|
||||
|
||||
$WHISPER_EXECUTABLE \
|
||||
"${WHISPER_EXECUTABLE}" \
|
||||
-m "${MODEL_PATH}" \
|
||||
-l "${WHISPER_LANG}" \
|
||||
-f "vod-resampled.wav" \
|
||||
-t 8 \
|
||||
-f "${temp_dir}/${title_name}.vod-resampled.wav" \
|
||||
-t "${WHISPER_THREAD_COUNT}" \
|
||||
-osrt \
|
||||
--translate
|
||||
--translate;
|
||||
|
||||
msg "Embedding subtitle track..."
|
||||
msg "Embedding subtitle track...";
|
||||
|
||||
ffmpeg -i "${temp_dir}/vod.mp4" \
|
||||
ffmpeg -i "${temp_dir}/${title_name}.vod.mp4" \
|
||||
-hide_banner \
|
||||
-loglevel error \
|
||||
-i "vod-resampled.wav.srt" \
|
||||
-i "${temp_dir}/${title_name}.vod-resampled.wav.srt" \
|
||||
-c copy \
|
||||
-c:s mov_text \
|
||||
-y res.mp4
|
||||
-y "${title_name}-res.mp4";
|
||||
|
||||
cleanup
|
||||
cleanup "${temp_dir}";
|
||||
|
||||
msg "Done! Your finished file is ready: res.mp4"
|
||||
msg "Done! Your finished file is ready: ${title_name}-res.mp4";
|
||||
|
92
whisper.cpp
92
whisper.cpp
@ -592,6 +592,8 @@ struct whisper_context {
|
||||
|
||||
mutable std::mt19937 rng; // used for sampling at t > 0.0
|
||||
|
||||
int lang_id;
|
||||
|
||||
// [EXPERIMENTAL] token-level timestamps data
|
||||
int64_t t_beg;
|
||||
int64_t t_last;
|
||||
@ -2903,7 +2905,7 @@ const char * whisper_print_system_info(void) {
|
||||
|
||||
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
|
||||
struct whisper_full_params result = {
|
||||
/*.strategy =*/ WHISPER_SAMPLING_GREEDY,
|
||||
/*.strategy =*/ strategy,
|
||||
|
||||
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
||||
/*.n_max_text_ctx =*/ 16384,
|
||||
@ -2922,6 +2924,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
||||
/*.thold_pt =*/ 0.01f,
|
||||
/*.thold_ptsum =*/ 0.01f,
|
||||
/*.max_len =*/ 0,
|
||||
/*.split_on_word =*/ false,
|
||||
/*.max_tokens =*/ 0,
|
||||
|
||||
/*.speed_up =*/ false,
|
||||
@ -2933,6 +2936,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
||||
/*.language =*/ "en",
|
||||
|
||||
/*.suppress_blank =*/ true,
|
||||
/*.suppress_non_speech_tokens =*/true,
|
||||
|
||||
/*.temperature =*/ 0.0f,
|
||||
/*.max_initial_ts =*/ 1.0f,
|
||||
@ -2988,9 +2992,35 @@ static void whisper_exp_compute_token_level_timestamps(
|
||||
float thold_pt,
|
||||
float thold_ptsum);
|
||||
|
||||
// trim from start (in place)
// Erases every character of s that precedes the first character for which
// std::isspace() returns false; if no such character exists the whole string
// is erased. The lambda takes its argument as unsigned char so that
// std::isspace() is never called with a negative value.
|
||||
static inline void ltrim(std::string &s) {
|
||||
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
|
||||
return !std::isspace(ch);
|
||||
}));
|
||||
}
|
||||
|
||||
// trim from end (in place)
// Searches s backwards (reverse iterators) for the last character for which
// std::isspace() returns false; .base() converts that reverse iterator into the
// forward iterator one past that character, and everything from there to the
// end of s is erased. If s holds only whitespace, the whole string is erased.
|
||||
static inline void rtrim(std::string &s) {
|
||||
s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
|
||||
return !std::isspace(ch);
|
||||
}).base(), s.end());
|
||||
}
|
||||
|
||||
// trim from both ends (in place)
// Applies rtrim() and then ltrim() to s, removing trailing and leading
// whitespace as classified by std::isspace().
|
||||
static inline void trim(std::string &s) {
|
||||
rtrim(s);
|
||||
ltrim(s);
|
||||
}
|
||||
|
||||
// Tells whisper_wrap_segment() whether it may split the segment right before
// the token whose text is txt.
//   - split_on_word == false: always returns true (any token is a valid split point)
//   - split_on_word == true : returns true only when txt starts with a space
//     (presumably a leading space marks the first token of a new word - confirm
//     against the tokenizer)
// txt is indexed without a null check, so it must point to a valid C string.
static inline bool should_split_on_word(const char * txt, bool split_on_word) {
|
||||
if (!split_on_word) return true;
|
||||
|
||||
return txt[0] == ' ';
|
||||
}
|
||||
|
||||
// wrap the last segment to max_len characters
|
||||
// returns the number of new segments
|
||||
static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
|
||||
static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool split_on_word) {
|
||||
auto segment = ctx.result_all.back();
|
||||
|
||||
int res = 1;
|
||||
@ -3005,11 +3035,14 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
|
||||
}
|
||||
|
||||
const auto txt = whisper_token_to_str(&ctx, token.id);
|
||||
|
||||
const int cur = strlen(txt);
|
||||
|
||||
if (acc + cur > max_len && i > 0) {
|
||||
if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
|
||||
// split here
|
||||
if (split_on_word) {
|
||||
trim(text);
|
||||
}
|
||||
|
||||
ctx.result_all.back().text = std::move(text);
|
||||
ctx.result_all.back().t1 = token.t0;
|
||||
ctx.result_all.back().tokens.resize(i);
|
||||
@ -3037,11 +3070,22 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
|
||||
}
|
||||
}
|
||||
|
||||
if (split_on_word) {
|
||||
trim(text);
|
||||
}
|
||||
ctx.result_all.back().text = std::move(text);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// Symbol strings (brackets, quotes, punctuation runs, musical notes, ...) that
// are masked out during decoding when params.suppress_non_speech_tokens is set:
// whisper_process_logits() looks up each entry, both as-is and prefixed with a
// space, in vocab.token_to_id and sets the logit of every match to -INFINITY.
static const std::vector<std::string> non_speech_tokens
|
||||
{
|
||||
"\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
|
||||
"_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
|
||||
"---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
|
||||
"♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
|
||||
};
|
||||
|
||||
// process the logits for the selected decoder
|
||||
// - applies logit filters
|
||||
// - computes logprobs and probs
|
||||
@ -3102,6 +3146,33 @@ static void whisper_process_logits(
|
||||
logits[vocab.token_translate] = -INFINITY;
|
||||
logits[vocab.token_transcribe] = -INFINITY;
|
||||
|
||||
|
||||
// suppress non-speech tokens
|
||||
// ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
|
||||
if (params.suppress_non_speech_tokens)
|
||||
{
|
||||
for (const std::string &token : non_speech_tokens)
|
||||
{
|
||||
std::string suppress_tokens[] = {token, " " + token};
|
||||
for (const std::string &suppress_token : suppress_tokens)
|
||||
{
|
||||
if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end())
|
||||
{
|
||||
logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
// allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
|
||||
if (vocab.token_to_id.find(" -") != vocab.token_to_id.end())
|
||||
{
|
||||
logits[vocab.token_to_id.at(" -")] = -INFINITY;
|
||||
}
|
||||
if (vocab.token_to_id.find(" '") != vocab.token_to_id.end())
|
||||
{
|
||||
logits[vocab.token_to_id.at(" '")] = -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
// timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
|
||||
// https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L414-L424
|
||||
{
|
||||
@ -3449,7 +3520,7 @@ int whisper_full(
|
||||
fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
|
||||
return -3;
|
||||
}
|
||||
|
||||
ctx->lang_id = lang_id;
|
||||
params.language = whisper_lang_str(lang_id);
|
||||
|
||||
fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
|
||||
@ -3546,6 +3617,7 @@ int whisper_full(
|
||||
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
|
||||
if (whisper_is_multilingual(ctx)) {
|
||||
const int lang_id = whisper_lang_id(params.language);
|
||||
ctx->lang_id = lang_id;
|
||||
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
|
||||
if (params.translate) {
|
||||
prompt_init.push_back(whisper_token_translate());
|
||||
@ -3793,7 +3865,7 @@ int whisper_full(
|
||||
|
||||
auto & cur = beam_candidates[cur_c++];
|
||||
|
||||
while (beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
|
||||
while (beam_candidates.size() > cur_c && beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
|
||||
++cur_c;
|
||||
}
|
||||
|
||||
@ -4069,7 +4141,7 @@ int whisper_full(
|
||||
*ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
|
||||
|
||||
if (params.max_len > 0) {
|
||||
n_new = whisper_wrap_segment(*ctx, params.max_len);
|
||||
n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
|
||||
}
|
||||
}
|
||||
if (params.new_segment_callback) {
|
||||
@ -4113,7 +4185,7 @@ int whisper_full(
|
||||
*ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
|
||||
|
||||
if (params.max_len > 0) {
|
||||
n_new = whisper_wrap_segment(*ctx, params.max_len);
|
||||
n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
|
||||
}
|
||||
}
|
||||
if (params.new_segment_callback) {
|
||||
@ -4266,6 +4338,10 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
|
||||
return ctx->result_all.size();
|
||||
}
|
||||
|
||||
// Returns the language id stored in the context (ctx->lang_id).
// whisper_full() writes this field after language auto-detection and when it
// builds the initial prompt for a multilingual model.
// ctx is dereferenced without a null check.
int whisper_full_lang_id(struct whisper_context * ctx) {
|
||||
return ctx->lang_id;
|
||||
}
|
||||
|
||||
// Returns the t0 (start time) field of segment i_segment in ctx->result_all.
// Neither ctx nor i_segment is validated here; the caller must pass an index
// in the range [0, whisper_full_n_segments(ctx)).
int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
|
||||
return ctx->result_all[i_segment].t0;
|
||||
}
|
||||
|
15
whisper.h
15
whisper.h
@ -113,6 +113,16 @@ extern "C" {
|
||||
int n_samples,
|
||||
int n_threads);
|
||||
|
||||
// Convert RAW PCM audio to a log mel spectrogram, applying a Phase Vocoder to speed up the audio x2.
|
||||
// The resulting spectrogram is stored inside the provided whisper context.
|
||||
// Returns 0 on success
|
||||
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
|
||||
struct whisper_context* ctx,
|
||||
const float* samples,
|
||||
int n_samples,
|
||||
int n_threads);
|
||||
|
||||
|
||||
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
|
||||
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
||||
// n_mel must be 80
|
||||
@ -257,6 +267,7 @@ extern "C" {
|
||||
float thold_pt; // timestamp token probability threshold (~0.01)
|
||||
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
|
||||
int max_len; // max segment length in characters
|
||||
bool split_on_word; // split on word rather than on token (when used with max_len)
|
||||
int max_tokens; // max tokens per segment (0 = no limit)
|
||||
|
||||
// [EXPERIMENTAL] speed-up techniques
|
||||
@ -274,6 +285,7 @@ extern "C" {
|
||||
|
||||
// common decoding parameters:
|
||||
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
|
||||
bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
|
||||
|
||||
float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
|
||||
float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
|
||||
@ -329,6 +341,9 @@ extern "C" {
|
||||
// A segment can be a few words, a sentence, or even a paragraph.
|
||||
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
|
||||
|
||||
// Language id associated with the current context
|
||||
WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
|
||||
|
||||
// Get the start and end time of the specified segment.
|
||||
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
|
||||
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
|
||||
|
Reference in New Issue
Block a user