<!doctype html>
<html lang="en-us">
    <head>
        <title>Talk - GPT-2 meets Whisper in WebAssembly</title>

        <style>
            #output {
                width: 100%;
                height: 100%;
                margin: 0 auto;
                margin-top: 10px;
                border-left: 0px;
                border-right: 0px;
                padding-left: 0px;
                padding-right: 0px;
                display: block;
                background-color: black;
                color: white;
                font-size: 10px;
                font-family: 'Lucida Console', Monaco, monospace;
                outline: none;
                white-space: pre;
                overflow-wrap: normal;
                overflow-x: scroll;
            }
        </style>
    </head>
    <body>
        <div id="main-container">
            <b>Talk - GPT-2 meets Whisper in WebAssembly</b>

            <br><br>

            On this page you can talk with an AI entity. It uses:

            <ul>
                <li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> model to listen to you as you speak into the microphone</li>
                <li><a href="https://github.com/ggerganov/ggml/tree/master/examples/gpt-2">OpenAI's GPT-2</a> model to generate a text response</li>
                <li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to speak the response to you through the speakers</li>
            </ul>

            All of this runs <b>locally in your browser</b> using WebAssembly.<br>
            You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">GitHub</a>.

            <br><br>

            <hr>

            Select the models you would like to use and click the "Start" button to begin the conversation.

            <br><br>

            <div id="model-whisper">
                <span id="model-whisper-status">Whisper model:</span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <!-- <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" /> -->
            </div>

            <br>

            <div id="model-gpt-2">
                <span id="model-gpt-2-status">GPT-2 model:</span>
                <button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
                <!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
                <span id="fetch-gpt-2-progress"></span>

                <!-- <input type="file" id="file" name="file" onchange="loadFile(event, 'gpt-2.bin')" /> -->
            </div>

            <br>

            <div id="input">
                <button id="start" onclick="onStart()">Start</button>
                <button id="stop" onclick="onStop()" disabled>Stop</button>
                <select id="voice" onchange="onVoiceChange()">
                    <option value="0">Default</option>
                </select>
                <button id="speak-hello"  onclick="onSpeak('Hello')">Say hello</button>
                <button id="speak-random" onclick="onSpeakRandom()">Say something</button>
                <button id="clear-cache"  onclick="clearCache()">Clear Cache</button>
            </div>

            <br>

            <div id="state">
                Status: <b><span id="state-status">idle</span></b>

                <pre id="state-context">[The text context will be displayed here]</pre>
            </div>

            <hr>

            Debug output:
            <textarea id="output" rows="20"></textarea>

            <br>

            <b>Troubleshooting</b>

            <br><br>

            The page does some heavy computations, so make sure:

            <ul>
                <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
                <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
                <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
            </ul>

            <br><br>
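            <!--
                Note: the @GIT_DATE@, @GIT_SHA1@ and @GIT_COMMIT_SUBJECT@ placeholders below
                are expected to be substituted with the actual values at build time
                (e.g. by the project's CMake configuration step).
            -->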
            <div class="cell-version">
                <span>
                    | Build time: <span class="nav-link">@GIT_DATE@</span> |
                    Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
                    Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
                    <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">Source Code</a> |
                </span>
            </div>
        </div>

        <script type='text/javascript'>
            var printTextarea = (function() {
                var element = document.getElementById('output');
                if (element) element.value = ''; // clear the output textarea on page load

                return function(text) {
                    if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
                    console.log(text);
                    if (element) {
                        element.value += text + "\n";
                        element.scrollTop = element.scrollHeight; // keep the bottom in view
                    }
                };
            })();

            const kRestartRecording_s = 15;
            const kSampleRate = 16000;

            window.AudioContext = window.AudioContext || window.webkitAudioContext;
            window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;

            // web audio context
            var context = null;

            // audio data
            var audio = null;
            var audio0 = null;

            // the talk instance
            var instance = null;

            // model names
            var model_whisper = null;
            var model_gpt_2 = null;

            // speech synthesis
            const synth = window.speechSynthesis;

            var voice = null;

            var Module = {
                print: printTextarea,
                printErr: printTextarea,
                setStatus: function(text) {
                    printTextarea('js: ' + text);
                },
                monitorRunDependencies: function(left) {
                },
                preRun: function() {
                    printTextarea('js: Preparing ...');
                },
                postRun: function() {
                    printTextarea('js: Initialized successfully!');

                    // populate the voice list with the available English voices
                    var voices = synth.getVoices();
                    var el = document.getElementById('voice');

                    var n = 0;
                    voices.forEach(function(voice, i) {
                        if (!voice.lang.startsWith('en')) return;
                        var option = document.createElement('option');
                        option.value = i;
                        option.innerHTML = voice.name + ' (' + voice.lang + ')';
                        el.appendChild(option);
                        n++;
                    });

                    // select a random voice, giving preference to Google voices
                    if (n > 0) {
                        for (var k = 0; k < 10; k++) {
                            var i = Math.floor(Math.random() * n);
                            el.selectedIndex = i;
                            voice = voices[document.getElementById('voice').options[i].value];

                            if (voice.name.startsWith('Google')) break;
                        }
                    }
                }
            };

            // helper function: copy the raw bytes of a typed array into a new typed array of a different type
            function convertTypedArray(src, type) {
                var buffer = new ArrayBuffer(src.byteLength);
                new src.constructor(buffer).set(src);
                return new type(buffer);
            }
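            // Note that convertTypedArray() reinterprets the underlying bytes rather than
            // converting the values. An illustrative (hypothetical) example:
            //
            //   var bytes = new Uint8Array([0x10, 0x27]);          // 2 raw bytes
            //   var i16   = convertTypedArray(bytes, Int16Array);  // [10000] on little-endian machines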
            //
            // fetch models
            //

            function storeFS(fname, buf) {
                // write to WASM file using FS_createDataFile
                // if the file exists, delete it
                try {
                    Module.FS_unlink(fname);
                } catch (e) {
                    // ignore
                }

                Module.FS_createDataFile("/", fname, buf, true, true);

                printTextarea('js: stored model: ' + fname + ' size: ' + buf.length);

                if (fname == 'whisper.bin') {
                    document.getElementById('model-whisper').innerHTML = 'Whisper model: loaded "' + model_whisper + '"!';
                } else if (fname == 'gpt-2.bin') {
                    document.getElementById('model-gpt-2').innerHTML = 'GPT-2 model: loaded "' + model_gpt_2 + '"!';
                }
            }

            let dbVersion = 1;
            let dbName    = 'talk.ggerganov.com';
            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB;

            // fetch a file from a remote URL using the Fetch API
            async function fetchRemote(url, elProgress) {
                printTextarea('js: downloading with fetch()...');

                const response = await fetch(
                    url,
                    {
                        method: 'GET',
                        headers: {
                            'Content-Type': 'application/octet-stream',
                        },
                    }
                );

                if (!response.ok) {
                    printTextarea('js: failed to fetch ' + url);
                    return;
                }

                const contentLength = response.headers.get('content-length');
                const total = parseInt(contentLength, 10);
                const reader = response.body.getReader();

                var chunks = [];
                var receivedLength = 0;
                var progressLast = -1;

                while (true) {
                    const { done, value } = await reader.read();

                    if (done) {
                        break;
                    }

                    chunks.push(value);
                    receivedLength += value.length;

                    if (contentLength) {
                        // update the progress element with the new percentage
                        elProgress.innerHTML = Math.round((receivedLength / total) * 100) + '%';

                        var progressCur = Math.round((receivedLength / total) * 10);
                        if (progressCur != progressLast) {
                            printTextarea('js: fetching ' + 10*progressCur + '% ...');
                            progressLast = progressCur;
                        }
                    }
                }

                var chunksAll = new Uint8Array(receivedLength);
                var position = 0;
                for (var chunk of chunks) {
                    chunksAll.set(chunk, position);
                    position += chunk.length;
                }

                return chunksAll;
            }

            // load remote data
            // - check if the data is already in the IndexedDB
            // - if not, fetch it from the remote URL and store it in the IndexedDB
            // - store it in WASM memory
            function loadRemote(url, dst, elProgress, size_mb) {
                // query the storage quota and print it
                navigator.storage.estimate().then(function (estimate) {
                    printTextarea('js: storage quota: ' + estimate.quota + ' bytes');
                    printTextarea('js: storage usage: ' + estimate.usage + ' bytes');
                });

                // check if the data is already in the IndexedDB
                var request = indexedDB.open(dbName, dbVersion);

                request.onupgradeneeded = function (event) {
                    var db = event.target.result;
                    if (db.version == 1) {
                        var objectStore = db.createObjectStore('models', { autoIncrement: false });
                        printTextarea('js: created IndexedDB ' + db.name + ' version ' + db.version);
                    } else {
                        // clear the database
                        var objectStore = event.currentTarget.transaction.objectStore('models');
                        objectStore.clear();
                        printTextarea('js: cleared IndexedDB ' + db.name + ' version ' + db.version);
                    }
                };

                request.onsuccess = function (event) {
                    var db = event.target.result;
                    var transaction = db.transaction(['models'], 'readonly');
                    var objectStore = transaction.objectStore('models');
                    var request = objectStore.get(url);

                    request.onsuccess = function (event) {
                        if (request.result) {
                            printTextarea('js: "' + url + '" is already in the IndexedDB');
                            storeFS(dst, request.result);
                        } else {
                            // data is not in the IndexedDB
                            printTextarea('js: "' + url + '" is not in the IndexedDB');

                            // alert and ask the user to confirm
                            if (!confirm(
                                'You are about to download ' + size_mb + ' MB of data.\n' +
                                'The model data will be cached in the browser for future use.\n\n' +
                                'Press OK to continue.')) {
                                document.getElementById('fetch-whisper-tiny-en').style.display = 'inline-block';
                                document.getElementById('fetch-whisper-base-en').style.display = 'inline-block';
                                document.getElementById('fetch-gpt-2-small').style.display = 'inline-block';
                                return;
                            }

                            fetchRemote(url, elProgress).then(function (data) {
                                if (data) {
                                    // store the data in the IndexedDB
                                    var request = indexedDB.open(dbName, dbVersion);
                                    request.onsuccess = function (event) {
                                        var db = event.target.result;
                                        var transaction = db.transaction(['models'], 'readwrite');
                                        var objectStore = transaction.objectStore('models');
                                        var request = objectStore.put(data, url);

                                        request.onsuccess = function (event) {
                                            printTextarea('js: "' + url + '" stored in the IndexedDB');
                                            storeFS(dst, data);
                                        };

                                        request.onerror = function (event) {
                                            printTextarea('js: failed to store "' + url + '" in the IndexedDB');
                                        };
                                    };
                                }
                            });
                        }
                    };

                    request.onerror = function (event) {
                        printTextarea('js: failed to get data from the IndexedDB');
                    };
                };

                request.onerror = function (event) {
                    printTextarea('js: failed to open IndexedDB');
                };

                request.onblocked = function (event) {
                    printTextarea('js: failed to open IndexedDB: blocked');
                };

                request.onabort = function (event) {
                    printTextarea('js: failed to open IndexedDB: abort');
                };
            }
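            // Models are cached in the 'models' object store, keyed by URL. For
            // illustration, an additional model could be cached and loaded the same
            // way (the URL below is made up):
            //
            //   loadRemote('https://example.com/my-model.bin', 'whisper.bin',
            //              document.getElementById('fetch-whisper-progress'), 100);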
            function loadWhisper(model) {
                let urls = {
                    'tiny.en': 'https://talk.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://talk.ggerganov.com/ggml-model-whisper-base.en.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
                };

                let url     = urls[model];
                let dst     = 'whisper.bin';
                let el      = document.getElementById('fetch-whisper-progress');
                let size_mb = sizes[model];

                model_whisper = model;

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
                document.getElementById('model-whisper-status').innerHTML = 'Whisper model: loading "' + model + '" ... ';

                loadRemote(url, dst, el, size_mb);
            }

            function loadGPT2(model) {
                let urls = {
                    'small':  'https://talk.ggerganov.com/ggml-model-gpt-2-117M.bin',
                    'medium': 'https://talk.ggerganov.com/ggml-model-gpt-2-345M.bin',
                };

                let sizes = {
                    'small':  240,
                    'medium': 712,
                };

                let url     = urls[model];
                let dst     = 'gpt-2.bin';
                let el      = document.getElementById('fetch-gpt-2-progress');
                let size_mb = sizes[model];

                model_gpt_2 = model;

                document.getElementById('fetch-gpt-2-small').style.display = 'none';
                document.getElementById('model-gpt-2-status').innerHTML = 'GPT-2 model: loading "' + model + '" ... ';

                loadRemote(url, dst, el, size_mb);
            }

            //
            // microphone
            //

            var mediaRecorder = null;
            var doRecording = false;
            var startTime = 0;

            function stopRecording() {
                Module.set_status("paused");
                doRecording = false;
                audio0 = null;
                audio = null;
            }

            function startRecording() {
                if (!context) {
                    context = new AudioContext({sampleRate: kSampleRate});
                }

                Module.set_status("");

                document.getElementById('start').disabled = true;
                document.getElementById('stop').disabled = false;

                doRecording = true;
                startTime = Date.now();

                var chunks = [];
                var stream = null;

                navigator.mediaDevices.getUserMedia({audio: true, video: false})
                    .then(function(s) {
                        stream = s;
                        mediaRecorder = new MediaRecorder(stream);
                        mediaRecorder.ondataavailable = function(e) {
                            chunks.push(e.data);

                            var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
                            var reader = new FileReader();

                            reader.onload = function(event) {
                                var buf = new Uint8Array(reader.result);

                                context.decodeAudioData(buf.buffer, function(audioBuffer) {
                                    var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
                                    var source = offlineContext.createBufferSource();
                                    source.buffer = audioBuffer;
                                    source.connect(offlineContext.destination);
                                    source.start(0);

                                    offlineContext.startRendering().then(function(renderedBuffer) {
                                        audio = renderedBuffer.getChannelData(0);

                                        //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));

                                        // append the new samples to the previously accumulated audio, if any
                                        var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
                                        if (audio0 != null) {
                                            audioAll.set(audio0, 0);
                                        }
                                        audioAll.set(audio, audio0 == null ? 0 : audio0.length);

                                        if (instance) {
                                            Module.set_audio(instance, audioAll);
                                        }
                                    });
                                }, function(e) {
                                    audio = null;
                                });
                            }

                            reader.readAsArrayBuffer(blob);
                        };

                        mediaRecorder.onstop = function(e) {
                            if (doRecording) {
                                setTimeout(function() {
                                    startRecording();
                                });
                            }
                        };

                        mediaRecorder.start(250);
                    })
                    .catch(function(err) {
                        printTextarea('js: error getting audio stream: ' + err);
                    });

                var interval = setInterval(function() {
                    if (!doRecording) {
                        clearInterval(interval);
                        mediaRecorder.stop();
                        stream.getTracks().forEach(function(track) {
                            track.stop();
                        });

                        document.getElementById('start').disabled = false;
                        document.getElementById('stop').disabled = true;

                        mediaRecorder = null;
                    }

                    // if the recorded audio is longer than kRestartRecording_s seconds, restart recording
                    if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
                        if (doRecording) {
                            //printTextarea('js: restarting recording');

                            clearInterval(interval);
                            audio0 = audio;
                            audio = null;
                            mediaRecorder.stop();
                            stream.getTracks().forEach(function(track) {
                                track.stop();
                            });
                        }
                    }
                }, 250);
            }
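            //
            // native bindings
            //
            // The calls in this script assume the following functions are exported by
            // the WASM module loaded via talk.js; the descriptions are inferred from
            // how the functions are used above and below:
            //
            //   Module.init(model_path)            - initialize the talk instance
            //   Module.set_audio(instance, audio)  - pass captured audio samples (16 kHz, Float32Array)
            //   Module.set_status(text)            - set the status text
            //   Module.get_status()                - get the current status
            //   Module.get_text_context()          - get the current text context
            //   Module.get_text_to_speak()         - get the generated response, if any
            //   Module.force_speak(instance)       - force the entity to say something
            //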
            //
            // speak
            //

            function onSpeak(text) {
                var voices = synth.getVoices();
                var msg = new SpeechSynthesisUtterance(text);

                if (voice == null) {
                    voice = voices[0];
                }

                msg.voice = voice;
                synth.speak(msg);

                if (doRecording) {
                    Module.set_status("speaking ...");
                    printTextarea('js: speaking');
                    stopRecording();

                    var interval = setInterval(function() {
                        if (!synth.speaking) {
                            printTextarea('js: done speaking');
                            clearInterval(interval);
                            startRecording();
                        } else {
                            Module.set_status("");
                        }
                    }, 100);
                }
            }

            function onSpeakRandom() {
                Module.force_speak(instance);
            }

            async function clearCache() {
                if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
                    indexedDB.deleteDatabase(dbName);
                }
            }

            //
            // main
            //

            var intervalUpdate = null;

            function onStart() {
                if (!instance) {
                    instance = Module.init('whisper.bin');

                    if (instance) {
                        printTextarea("js: whisper initialized, instance: " + instance);
                    }
                }

                if (!instance) {
                    printTextarea("js: failed to initialize whisper");
                    return;
                }

                startRecording();

                intervalUpdate = setInterval(function() {
                    var textToSpeak = Module.get_text_to_speak();

                    if (textToSpeak != null && textToSpeak.length > 1) {
                        onSpeak(textToSpeak);
                    }

                    document.getElementById('state-status').innerHTML = Module.get_status();
                    document.getElementById('state-context').innerHTML = Module.get_text_context();
                }, 100);
            }

            function onStop() {
                stopRecording();
            }

            function onVoiceChange() {
                printTextarea('js: voice changed to: ' + document.getElementById('voice').value);
                voice = synth.getVoices()[document.getElementById('voice').value];
            }
        </script>

        <script type="text/javascript" src="talk.js"></script>
    </body>
</html>