mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-08-08 04:44:40 +02:00
* stream.wasm : add language selection support This commit adds support for selecting the language in the stream.wasm example. This includes adding the model `base`, which supports multilingual transcription, and allowing the user to select a language from a dropdown menu in the HTML interface. The motivation for this is that it allows users to transcribe audio in various languages. Refs: https://github.com/ggml-org/whisper.cpp/issues/3347 * squash! stream.wasm : add language selection support Remove strdup() for language in stream.wasm and update button text for base (should not be "base.en" but just "base").
491 lines
20 KiB
HTML
491 lines
20 KiB
HTML
<!doctype html>
|
|
<html lang="en-us">
|
|
<head>
    <!-- Declare the encoding up front so the document and its inline script are decoded consistently. -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>stream : Real-time Whisper transcription in WebAssembly</title>

    <style>
        /* Debug console (the <textarea id="output"> at the bottom of the page):
           full-width, white-on-black monospace, no wrapping, horizontal scroll. */
        #output {
            width: 100%;
            height: 100%;
            margin: 0 auto;
            margin-top: 10px;
            border-left: 0px;
            border-right: 0px;
            padding-left: 0px;
            padding-right: 0px;
            display: block;
            background-color: black;
            color: white;
            font-size: 10px;
            font-family: 'Lucida Console', Monaco, monospace;
            outline: none;
            white-space: pre;
            overflow-wrap: normal;
            overflow-x: scroll;
        }
    </style>
    <!-- NOTE(review): presumably registers a service worker that provides cross-origin
         isolation for the WASM build; confirm against coi-serviceworker.js. -->
    <script src="../coi-serviceworker.js"></script>
    <!-- Empty data: URL so the browser does not request /favicon.ico. -->
    <link rel="icon" href="data:,">
</head>
|
|
<body>
|
|
<div id="main-container">
|
|
<b>stream : Real-time Whisper transcription in WebAssembly</b>
|
|
|
|
<br><br>
|
|
|
|
You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">GitHub</a>.
|
|
|
|
<br><br>
|
|
|
|
<b>More examples:</b>
|
|
<a href="../">main</a> |
|
|
<a href="../bench.wasm/">bench</a> |
|
|
<a href="../stream.wasm">stream</a> |
|
|
<a href="../command.wasm/">command</a> |
|
|
|
|
<br><br>
|
|
|
|
<hr>
|
|
|
|
Select the model you would like to use, click the "Start" button and start speaking
|
|
|
|
<br><br>
|
|
|
|
<div id="model-whisper">
|
|
Whisper model: <span id="model-whisper-status"></span>
|
|
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
|
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
|
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
|
|
<br><br>
|
|
Quantized models:<br><br>
|
|
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
|
|
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
|
|
<span id="fetch-whisper-progress"></span>
|
|
|
|
<!--
|
|
<input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
|
-->
|
|
</div>
|
|
|
|
<table>
|
|
<tr>
|
|
<td>
|
|
<!-- Spoken-language selector. The selected value is read once in onStart() and
     passed as the second argument to Module.init(). English is the default. -->
<label for="language">Language:</label>
<select id="language" name="language">
    <option value="ar">Arabic</option>
    <option value="hy">Armenian</option>
    <option value="az">Azerbaijani</option>
    <option value="eu">Basque</option>
    <option value="be">Belarusian</option>
    <option value="bn">Bengali</option>
    <option value="bg">Bulgarian</option>
    <option value="ca">Catalan</option>
    <option value="zh">Chinese</option>
    <option value="hr">Croatian</option>
    <option value="cs">Czech</option>
    <option value="da">Danish</option>
    <option value="nl">Dutch</option>
    <option value="en" selected>English</option>
    <option value="et">Estonian</option>
    <option value="tl">Filipino</option>
    <option value="fi">Finnish</option>
    <option value="fr">French</option>
    <option value="gl">Galician</option>
    <option value="ka">Georgian</option>
    <option value="de">German</option>
    <option value="el">Greek</option>
    <option value="gu">Gujarati</option>
    <!-- Whisper's language table uses "he" for Hebrew; the obsolete ISO code "iw" is not in it. -->
    <option value="he">Hebrew</option>
    <option value="hi">Hindi</option>
    <option value="hu">Hungarian</option>
    <option value="is">Icelandic</option>
    <option value="id">Indonesian</option>
    <!-- NOTE(review): "ga" does not appear in Whisper's published language list;
         confirm that the backend accepts it, or drop this entry. -->
    <option value="ga">Irish</option>
    <option value="it">Italian</option>
    <option value="ja">Japanese</option>
    <option value="kn">Kannada</option>
    <option value="ko">Korean</option>
    <option value="la">Latin</option>
    <option value="lv">Latvian</option>
    <option value="lt">Lithuanian</option>
    <option value="mk">Macedonian</option>
    <option value="ms">Malay</option>
    <option value="mt">Maltese</option>
    <option value="no">Norwegian</option>
    <option value="fa">Persian</option>
    <option value="pl">Polish</option>
    <option value="pt">Portuguese</option>
    <option value="ro">Romanian</option>
    <option value="ru">Russian</option>
    <option value="sr">Serbian</option>
    <option value="sk">Slovak</option>
    <option value="sl">Slovenian</option>
    <option value="es">Spanish</option>
    <option value="sw">Swahili</option>
    <option value="sv">Swedish</option>
    <option value="ta">Tamil</option>
    <option value="te">Telugu</option>
    <option value="th">Thai</option>
    <option value="tr">Turkish</option>
    <option value="uk">Ukrainian</option>
    <option value="ur">Urdu</option>
    <option value="vi">Vietnamese</option>
    <option value="cy">Welsh</option>
    <option value="yi">Yiddish</option>
</select>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<br>
|
|
|
|
<div id="input">
|
|
<button id="start" onclick="onStart()" disabled>Start</button>
|
|
<button id="stop" onclick="onStop()" disabled>Stop</button>
|
|
<button id="clear" onclick="clearCache()">Clear Cache</button>
|
|
</div>
|
|
|
|
<br>
|
|
|
|
<div id="state">
|
|
Status: <b><span id="state-status">not started</span></b>
|
|
|
|
<pre id="state-transcribed">[The transcribed text will be displayed here]</pre>
|
|
</div>
|
|
|
|
<hr>
|
|
|
|
Debug output:
|
|
<textarea id="output" rows="20"></textarea>
|
|
|
|
<br>
|
|
|
|
<b>Troubleshooting</b>
|
|
|
|
<br><br>
|
|
|
|
The page does some heavy computations, so make sure:
|
|
|
|
<ul>
|
|
<li>To use a modern web browser (e.g. Chrome, Firefox)</li>
|
|
<li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
|
|
<li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
|
|
</ul>
|
|
|
|
<div class="cell-version">
|
|
<span>
|
|
|
|
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
|
|
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
|
|
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
|
|
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">Source Code</a> |
|
|
</span>
|
|
</div>
|
|
</div>
|
|
|
|
<script type="text/javascript" src="helpers.js"></script>
|
|
<script type='text/javascript'>
|
|
// web audio context
|
|
var context = null;
|
|
|
|
// audio data
|
|
var audio = null;
|
|
var audio0 = null;
|
|
|
|
// the stream instance
|
|
var instance = null;
|
|
|
|
// model name
|
|
var model_whisper = null;
|
|
|
|
// Configuration object handed to the generated WASM glue script (stream.js,
// loaded at the end of the page). All textual output from the runtime is
// redirected to the on-page debug console via printTextarea.
var Module = {
    print:    printTextarea,
    printErr: printTextarea,

    // Status messages are echoed to the debug console with a "js: " prefix.
    setStatus(text) {
        printTextarea('js: ' + text);
    },

    // Dependency progress is intentionally ignored.
    monitorRunDependencies(left) {},

    preRun() {
        printTextarea('js: Preparing ...');
    },

    postRun() {
        printTextarea('js: Initialized successfully!');
    }
};
|
|
|
|
//
|
|
// fetch models
|
|
//
|
|
|
|
// IndexedDB settings.
// NOTE(review): nothing in this file reads these three names; presumably they are
// consumed by the model download/cache code in helpers.js. Confirm before renaming.
let dbVersion = 1;
let dbName    = 'whisper.ggerganov.com';

// Fall back to vendor-prefixed IndexedDB implementations when the standard one is missing.
let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB;
|
|
|
|
// Write a model buffer into the WASM filesystem and unlock the Start button.
//
//   fname - file name to create in the root ("/") of the WASM filesystem
//   buf   - model bytes; only buf.length is read here, for the log line
//
// Passed to loadRemote() by loadWhisper() as its fifth argument.
function storeFS(fname, buf) {
    // Replace any previous copy; a failing unlink (e.g. nothing to remove) is not an error.
    try {
        Module.FS_unlink(fname);
    } catch (ignored) {
        // ignore
    }

    Module.FS_createDataFile("/", fname, buf, true, true);

    printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);

    var statusEl = document.getElementById('model-whisper-status');
    statusEl.innerHTML = 'loaded "' + model_whisper + '"!';

    // Recording controls are only enabled once a model name has been recorded.
    if (model_whisper == null) {
        return;
    }

    document.getElementById('start').disabled = false;
    document.getElementById('stop').disabled  = true;
}
|
|
|
|
// Start loading the selected Whisper model.
//
//   model - one of the keys of `urls` below ('tiny.en', 'base.en', 'base',
//           'tiny-en-q5_1', 'base-en-q5_1')
//
// Hides the model buttons, shows a "loading" status and delegates the actual
// transfer to loadRemote() (defined outside this file). On completion
// loadRemote() is given storeFS() to call; on cancellation the buttons are
// shown again. An unknown model name is logged and ignored.
function loadWhisper(model) {
    const urls = {
        'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
        'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
        'base'   : 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',

        'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
        'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
    };

    // approximate download sizes in MB
    const sizes = {
        'tiny.en': 75,
        'base.en': 142,
        'base':    142,

        'tiny-en-q5_1': 31,
        'base-en-q5_1': 57,
    };

    // Refuse unknown names instead of passing `undefined` as URL/size to loadRemote().
    if (!Object.prototype.hasOwnProperty.call(urls, model)) {
        printTextarea('js: unknown model "' + model + '"');
        return;
    }

    const url     = urls[model];
    const dst     = 'whisper.bin';
    const size_mb = sizes[model];

    model_whisper = model;

    // ids of all model-selection buttons, hidden while a load is in progress
    const buttonIds = [
        'fetch-whisper-tiny-en',
        'fetch-whisper-base-en',
        'fetch-whisper-base',

        'fetch-whisper-tiny-en-q5_1',
        'fetch-whisper-base-en-q5_1',
    ];

    buttonIds.forEach(function(id) {
        document.getElementById(id).style.display = 'none';
    });

    document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

    // Declared locally: these were previously assigned without a declaration and
    // leaked into the global scope.
    const cbProgress = function(p) {
        // p is multiplied by 100 for display, i.e. treated as a fraction
        let el = document.getElementById('fetch-whisper-progress');
        el.innerHTML = Math.round(100*p) + '%';
    };

    const cbCancel = function() {
        // bring the model buttons back and clear the status text
        buttonIds.forEach(function(id) {
            var el = document.getElementById(id);
            if (el) el.style.display = 'inline-block';
        });

        var status = document.getElementById('model-whisper-status');
        if (status) status.innerHTML = '';
    };

    loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
}
|
|
|
|
//
|
|
// microphone
|
|
//
|
|
|
|
// sample rate (Hz) requested for the AudioContext created in startRecording()
const kSampleRate = 16000;

// once the decoded audio exceeds this many seconds, the recorder is restarted
const kRestartRecording_s = 120;

// pass the recorded audio to the C++ instance at this rate
const kIntervalAudio_ms = 5000;

// recorder state shared between startRecording() and stopRecording()
var mediaRecorder = null;
var doRecording   = false;
var startTime     = 0;

// use the webkit-prefixed constructors where the unprefixed ones are missing
window.AudioContext        = window.AudioContext        || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
|
|
|
|
// Request the end of the current recording session.
//
// Forces the status text to "paused", clears the buffered audio and releases the
// AudioContext. The recorder itself is stopped by the polling timer installed in
// startRecording(), which reacts to `doRecording` becoming false.
function stopRecording() {
    Module.set_status("paused");
    doRecording = false;
    audio0 = null;
    audio = null;

    // Close the context instead of merely dropping the reference: a new
    // AudioContext is created on every start, so unclosed ones would accumulate.
    if (context && typeof context.close === 'function' && context.state !== 'closed') {
        try {
            var closing = context.close();
            if (closing && typeof closing.catch === 'function') {
                closing.catch(function() { /* already closed - nothing to do */ });
            }
        } catch (e) {
            // best effort only
        }
    }
    context = null;
}
|
|
|
|
// Start (or restart) capturing microphone audio and feeding it to the WASM instance.
//
// Flow:
//   1. getUserMedia() provides a microphone stream which is recorded by a MediaRecorder
//      delivering a chunk every kIntervalAudio_ms.
//   2. On every chunk, ALL chunks recorded so far are decoded again, rendered through an
//      OfflineAudioContext, prefixed with `audio0` (audio carried over from before the
//      last restart) and handed to Module.set_audio().
//   3. A 100 ms polling timer stops the recorder when `doRecording` turns false (see
//      stopRecording()) and restarts it once the decoded audio grows beyond
//      kRestartRecording_s seconds; the restart happens via mediaRecorder.onstop.
//
// The timer tolerates the recorder/stream not existing yet (permission prompt still
// open, or getUserMedia rejected), and a stream that resolves after the session was
// stopped is released immediately instead of being recorded.
function startRecording() {
    if (!context) {
        context = new AudioContext({
            sampleRate: kSampleRate,
            // NOTE(review): the keys below look like microphone track constraints rather
            // than AudioContext options; they are kept unchanged here - confirm whether
            // they were meant for getUserMedia().
            channelCount: 1,
            echoCancellation: false,
            autoGainControl:  true,
            noiseSuppression: true,
        });
    }

    Module.set_status("");

    document.getElementById('start').disabled = true;
    document.getElementById('stop').disabled = false;

    doRecording = true;
    startTime = Date.now();

    var chunks = [];
    var stream = null;

    // set once this session has been torn down, so that a late getUserMedia()
    // result belonging to it is discarded
    var released = false;

    // Stop the recorder (if it is running) and release the microphone tracks (if any).
    function releaseCapture() {
        if (mediaRecorder && mediaRecorder.state !== 'inactive') {
            mediaRecorder.stop();
        }
        if (stream) {
            stream.getTracks().forEach(function(track) {
                track.stop();
            });
        }
    }

    navigator.mediaDevices.getUserMedia({audio: true, video: false})
        .then(function(s) {
            if (released || !doRecording) {
                // stopped while waiting for the microphone - do not keep it open
                s.getTracks().forEach(function(track) {
                    track.stop();
                });
                return;
            }

            stream = s;
            mediaRecorder = new MediaRecorder(stream);
            mediaRecorder.ondataavailable = function(e) {
                chunks.push(e.data);

                // decode everything recorded so far, not just the newest chunk
                var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
                var reader = new FileReader();

                reader.onload = function(event) {
                    var buf = new Uint8Array(reader.result);

                    // the context is gone once stopRecording() has run
                    if (!context) {
                        return;
                    }
                    context.decodeAudioData(buf.buffer, function(audioBuffer) {
                        var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
                        var source = offlineContext.createBufferSource();
                        source.buffer = audioBuffer;
                        source.connect(offlineContext.destination);
                        source.start(0);

                        offlineContext.startRendering().then(function(renderedBuffer) {
                            audio = renderedBuffer.getChannelData(0);

                            //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));

                            // audio0 (carried over from before a restart) followed by the new audio
                            var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
                            if (audio0 != null) {
                                audioAll.set(audio0, 0);
                            }
                            audioAll.set(audio, audio0 == null ? 0 : audio0.length);

                            if (instance) {
                                Module.set_audio(instance, audioAll);
                            }
                        });
                    }, function(e) {
                        audio = null;
                    });
                };

                reader.readAsArrayBuffer(blob);
            };

            mediaRecorder.onstop = function(e) {
                // a stop while still recording is a restart request from the timer below
                if (doRecording) {
                    setTimeout(function() {
                        startRecording();
                    });
                }
            };

            mediaRecorder.start(kIntervalAudio_ms);
        })
        .catch(function(err) {
            printTextarea('js: error getting audio stream: ' + err);

            // give up on this session; the polling timer below restores the buttons
            doRecording = false;
        });

    var interval = setInterval(function() {
        if (!doRecording) {
            clearInterval(interval);
            released = true;
            releaseCapture();

            document.getElementById('start').disabled = false;
            document.getElementById('stop').disabled = true;

            mediaRecorder = null;
            return;
        }

        // if audio length is more than kRestartRecording_s seconds, restart recording
        // (only possible while a recorder is actually running; its onstop handler
        // performs the restart)
        if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
            if (mediaRecorder && mediaRecorder.state !== 'inactive') {
                //printTextarea('js: restarting recording');

                clearInterval(interval);
                audio0 = audio;
                audio = null;
                releaseCapture();
            }
        }
    }, 100);
}
|
|
|
|
//
|
|
// main
|
|
//
|
|
|
|
var nLines = 0;
|
|
var intervalUpdate = null;
|
|
var transcribedAll = '';
|
|
|
|
// Click handler of the "Start" button.
//
// Creates the Whisper instance on first use, starts recording and installs a
// 100 ms timer that appends newly transcribed text to the page (keeping at most
// 10 lines) and refreshes the status text.
//
// Note: the language dropdown is only read when the instance is created; while an
// instance exists, changing the selection has no effect on later starts.
function onStart() {
    if (!instance) {
        instance = Module.init('whisper.bin', document.getElementById('language').value);

        if (instance) {
            printTextarea("js: whisper initialized, instance: " + instance);
        }
    }

    if (!instance) {
        printTextarea("js: failed to initialize whisper");
        return;
    }

    startRecording();

    // Start can be pressed again after Stop: drop the previous UI timer so that
    // timers do not accumulate across sessions.
    if (intervalUpdate != null) {
        clearInterval(intervalUpdate);
    }

    intervalUpdate = setInterval(function() {
        var transcribed = Module.get_transcribed();

        if (transcribed != null && transcribed.length > 1) {
            transcribedAll += transcribed + '<br>';
            nLines++;

            // if more than 10 lines, remove the first line
            if (nLines > 10) {
                var i = transcribedAll.indexOf('<br>');
                if (i > 0) {
                    transcribedAll = transcribedAll.substring(i + 4); // 4 == '<br>'.length
                    nLines--;
                }
            }
        }

        document.getElementById('state-status').innerHTML = Module.get_status();
        document.getElementById('state-transcribed').innerHTML = transcribedAll;
    }, 100);
}
|
|
|
|
// Click handler of the "Stop" button: simply delegates to stopRecording().
function onStop() { stopRecording(); }
|
|
|
|
</script>
|
|
<script type="text/javascript" src="stream.js"></script>
|
|
</body>
|
|
</html>
|