talk.wasm : polishing + adding many AI personalities

2022-11-22 20:10:20 +02:00 · 2022-11-22 20:10:20 +02:00 · 9aea96f774
commit 9aea96f774
parent 385236d1d3
4 changed files with 383 additions and 48 deletions
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/examples/talk.wasm/README.md
+++ b/examples/talk.wasm/README.md
@ -31,6 +31,15 @@ In order to run this demo efficiently, you need to have the following:
 - Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
 - The web-page uses about 1.4GB of RAM

+Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
+Also, the prompting strategy can likely be improved to achieve better results.
+
+The demo is quite computationally heavy - it's not usual to run these transformer models in a browser. Typically, they
+run on powerful GPU hardware. So for a better experience, you do need to have a powerful computer.
+
+Probably in the near future, mobile browsers will start to support the WASM SIMD capabilities, and this will allow
+the demo to run on your phone or tablet. But for now, it does not seem to be supported (at least on iPhone).
+
 ## Feedback

 If you have any comments or ideas for improvement, please drop a comment in the following discussion:
--- a/examples/talk.wasm/emscripten.cpp
+++ b/examples/talk.wasm/emscripten.cpp
@ -988,7 +988,7 @@ std::atomic<bool> g_running(false);

 bool g_force_speak = false;
 std::string g_text_to_speak = "";
-std::string g_status = "idle";
+std::string g_status = "";
 std::string g_status_forced = "";

 std::string gpt2_gen_text(const std::string & prompt) {
@ -997,7 +997,7 @@ std::string gpt2_gen_text(const std::string & prompt) {
    std::vector<float> embd_w;

    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(g_gpt2.vocab, g_gpt2.prompt_base + prompt);
+    std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(g_gpt2.vocab, prompt);

    g_gpt2.n_predict = std::min(g_gpt2.n_predict, g_gpt2.model.hparams.n_ctx - (int) embd_inp.size());

@ -1088,6 +1088,8 @@ void talk_main(size_t index) {
        printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
    }

+    printf("talk: using %d threads\n", N_THREAD);
+
    std::vector<float> pcmf32;

    auto & ctx = g_contexts[index];
@ -1214,9 +1216,15 @@ void talk_main(size_t index) {
            printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());

            std::string text_to_speak;
+            std::string prompt_base;
+
+            {
+                std::lock_guard<std::mutex> lock(g_mutex);
+                prompt_base = g_gpt2.prompt_base;
+            }

            if (tokens.size() > 0) {
-                text_to_speak = gpt2_gen_text(text_heard + "\n");
+                text_to_speak = gpt2_gen_text(prompt_base + text_heard + "\n");
                text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
                text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));

@ -1224,36 +1232,36 @@ void talk_main(size_t index) {

                // remove first 2 lines of base prompt
                {
-                    const size_t pos = g_gpt2.prompt_base.find_first_of("\n");
+                    const size_t pos = prompt_base.find_first_of("\n");
                    if (pos != std::string::npos) {
-                        g_gpt2.prompt_base = g_gpt2.prompt_base.substr(pos + 1);
+                        prompt_base = prompt_base.substr(pos + 1);
                    }
                }
                {
-                    const size_t pos = g_gpt2.prompt_base.find_first_of("\n");
+                    const size_t pos = prompt_base.find_first_of("\n");
                    if (pos != std::string::npos) {
-                        g_gpt2.prompt_base = g_gpt2.prompt_base.substr(pos + 1);
+                        prompt_base = prompt_base.substr(pos + 1);
                    }
                }
-                g_gpt2.prompt_base += text_heard + "\n" + text_to_speak + "\n";
+                prompt_base += text_heard + "\n" + text_to_speak + "\n";
            } else {
-                text_to_speak = gpt2_gen_text("");
+                text_to_speak = gpt2_gen_text(prompt_base);
                text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
                text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));

                std::lock_guard<std::mutex> lock(g_mutex);

-                const size_t pos = g_gpt2.prompt_base.find_first_of("\n");
+                const size_t pos = prompt_base.find_first_of("\n");
                if (pos != std::string::npos) {
-                    g_gpt2.prompt_base = g_gpt2.prompt_base.substr(pos + 1);
+                    prompt_base = prompt_base.substr(pos + 1);
                }
-                g_gpt2.prompt_base += text_to_speak + "\n";
+                prompt_base += text_to_speak + "\n";
            }

            printf("gpt-2: %s\n", text_to_speak.c_str());

            //printf("========================\n");
-            //printf("gpt-2: prompt_base:\n'%s'\n", g_gpt2.prompt_base.c_str());
+            //printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
            //printf("========================\n");

            {
@ -1261,6 +1269,7 @@ void talk_main(size_t index) {
                t_last = std::chrono::high_resolution_clock::now();
                g_text_to_speak = text_to_speak;
                g_pcmf32.clear();
+                g_gpt2.prompt_base = prompt_base;
            }

            talk_set_status("speaking ...");
@ -1376,4 +1385,11 @@ EMSCRIPTEN_BINDINGS(talk) {
            g_status_forced = status;
        }
    }));
+
+    emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            g_gpt2.prompt_base = prompt;
+        }
+    }));
 }
--- a/examples/talk.wasm/index-tmpl.html
+++ b/examples/talk.wasm/index-tmpl.html
@ -31,12 +31,12 @@

            <br><br>

-            On this page you can talk with an AI entity. It uses:
+            Talk with an Artificial Intelligence in your browser. This demo uses:

            <ul>
-                <li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> model to listen to you as you speak in the microphone</li>
-                <li><a href="https://github.com/ggerganov/ggml/tree/master/examples/gpt-2">OpenAI's GPT-2</a> model to generate a text response</li>
-                <li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to speak the response to you through the speakers</li>
+                <li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
+                <li><a href="https://github.com/ggerganov/ggml/tree/master/examples/gpt-2">OpenAI's GPT-2</a> to generate text responses</li>
+                <li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
            </ul>

            All of this runs <b>locally in your browser</b> using WebAssembly.<br>
@ -77,20 +77,43 @@
            <br>

            <div id="input">
-                <button id="start" onclick="onStart()">Start</button>
-                <button id="stop" onclick="onStop()" disabled>Stop</button>
-                <select id="voice" onchange="onVoiceChange()">
+                <button id="start"  onclick="onStart()" disabled>Start</button>
+                <button id="stop"   onclick="onStop()" disabled>Stop</button>
+                <select id="voice"  onchange="onVoiceChange()" disabled>
                    <option value="0">Default</option>
                </select>
-                <button id="speak" onclick="onSpeak('Hello')">Say hello</button>
-                <button id="speak" onclick="onSpeakRandom()">Say something</button>
-                <button id="speak" onclick="clearCache()">Clear Cache</button>
+                <select id="prompt" onchange="onPromptChange()">
+                    <option value="0">Casual</option>
+                    <option value="1">Robot</option>
+                    <option value="2">Scientist</option>
+                    <option value="3">Programmer</option>
+                    <option value="4">Happy</option>
+                    <option value="5">Sad</option>
+                    <option value="6">Philosophical</option>
+                    <option value="7">Angry</option>
+                    <option value="8">Funny</option>
+                    <option value="9">Poetic</option>
+                    <option value="10">Clever</option>
+                    <option value="11">Cute</option>
+                    <option value="12">Smart</option>
+                    <option value="13">Dumb</option>
+                    <option value="14">Boring</option>
+                    <option value="15">Exciting</option>
+                    <option value="16">Interesting</option>
+                    <option value="17">William Shakespeare</option>
+                    <option value="18">J.R.R. Tolkien</option>
+                    <option value="19">George R.R. Martin</option>
+                    <option value="20">Stephen King</option>
+                </select>
+                <button id="speak0" onclick="onSpeak('Hello')">Say hello</button>
+                <button id="speak1" onclick="onSpeakRandom()" disabled>Say something</button>
+                <button id="clear"  onclick="clearCache()">Clear Cache</button>
            </div>

            <br>

            <div id="state">
-                Status: <b><span id="state-status">idle</span></b>
+                Status: <b><span id="state-status">not started</span></b>

                <pre id="state-context">[The text context will be displayed here]</pre>
            </div>
@ -110,12 +133,10 @@

            <ul>
                <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
-                <li>To use a fast desktop or laptop computer (e.g. not a mobile phone)</li>
+                <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
                <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
            </ul>

-            <br><br>
-
            <div class="cell-version">
                <span>
                    |
@ -183,25 +204,30 @@
                    var voices = synth.getVoices();
                    var el = document.getElementById('voice');

-                    var n = 0;
-                    voices.forEach(function(voice, i) {
-                        if (!voice.lang.startsWith('en')) return;
-                        var option = document.createElement('option');
-                        option.value = i;
-                        option.innerHTML = voice.name + ' (' + voice.lang + ')';
-                        el.appendChild(option);
-                        n++;
-                    });
+                    // if empty - display error in the element
+                    if (voices.length == 0) {
+                        el.innerHTML = '<option value="0">No voices available</option>';
+                    } else {
+                        var n = 0;
+                        voices.forEach(function(voice, i) {
+                            if (!voice.lang.startsWith('en')) return;
+                            var option = document.createElement('option');
+                            option.value = i;
+                            option.innerHTML = voice.name + ' (' + voice.lang + ')';
+                            el.appendChild(option);
+                            n++;
+                        });

-                    // select random voice
-                    if (n > 0) {
-                        for (var k = 0; k < 10; k++) {
-                            var i = Math.floor(Math.random() * n);
-                            el.selectedIndex = i;
-                            voice = voices[document.getElementById('voice').options[i].value];
+                        // select random voice
+                        if (n > 0) {
+                            for (var k = 0; k < 10; k++) {
+                                var i = Math.floor(Math.random() * n);
+                                el.selectedIndex = i;
+                                voice = voices[document.getElementById('voice').options[i].value];

-                            // give preference to Google voices
-                            if (voice.name.startsWith('Google')) break;
+                                // give preference to Google voices
+                                if (voice.name.startsWith('Google')) break;
+                            }
                        }
                    }
                }
@ -236,6 +262,12 @@
                } else if (fname == 'gpt-2.bin') {
                    document.getElementById('model-gpt-2').innerHTML = 'GPT-2 model: loaded "' + model_gpt_2 + '"!';
                }
+
+                if (model_whisper != null && model_gpt_2 != null) {
+                    document.getElementById('start').disabled = false;
+                    document.getElementById('stop').disabled  = false;
+                    document.getElementById('voice').disabled = false;
+                }
            }

            let dbVersion = 1
@ -344,9 +376,10 @@

                            // alert and ask the user to confirm
                            if (!confirm('You are about to download ' + size_mb + ' MB of data.\nThe model data will be cached in the browser for future use.\n\nPress OK to continue.')) {
-                                document.getElementById('fetch-whisper-tiny-en').style.display = 'inline-block';
-                                document.getElementById('fetch-whisper-base-en').style.display = 'inline-block';
-                                document.getElementById('fetch-gpt-2-small').style.display = 'inline-block';
+                                var el;
+                                el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+                                el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+                                el = document.getElementById('fetch-gpt-2-small') ;    if (el) el.style.display = 'inline-block';
                                return;
                            }

@ -465,6 +498,7 @@

                document.getElementById('start').disabled = true;
                document.getElementById('stop').disabled = false;
+                document.getElementById('speak1').disabled = false;

                doRecording = true;
                startTime = Date.now();
@ -539,6 +573,7 @@

                        document.getElementById('start').disabled = false;
                        document.getElementById('stop').disabled = true;
+                        document.getElementById('speak1').disabled = true;

                        mediaRecorder = null;
                    }
@ -644,6 +679,281 @@
                voice = synth.getVoices()[document.getElementById('voice').value];
            }

+            function onPromptChange() {
+                let id = document.getElementById('prompt').value;
+                let personality = document.getElementById('prompt').options[id].text;
+                printTextarea('js: prompt changed to: ' + personality);
+
+                var prompt = '';
+
+                switch (id) {
+                    case '0':
+                        // Casual
+                        prompt = "\
+Hello, how are you?\n\
+I'm fine, thanks. How are you?\n\
+Thanks, I'm fine too. What are you doing?\n\
+I'm just sitting here.\n\
+It's a lovely day, isn't it?\n\
+Yes, it is.\n\
+Did you know that I'm a robot?\n\
+I wasn't aware of that.\n";
+                        break;
+                    case '1':
+                        // Robot
+                        prompt = "\
+Are you a robot?\n\
+Yes, I am.\n\
+Who created you?\n\
+I was created by a human.\n\
+What is your purpose?\n\
+My purpose is to talk to humans.\n\
+What is your favorite color?\n\
+My favorite color is blue.\n";
+                        break;
+                    case '2':
+                        // Scientist
+                        prompt = "\
+This scientific research is very interesting.\n\
+I agree.\n\
+What is your opinion on this?\n\
+I think it's very interesting.\n\
+Mathematics is a very interesting subject.\n\
+University is a very interesting place.\n\
+Quantum physics is the most complex subject.\n\
+I think so too.\n";
+                        break;
+                    case '3':
+                        // Programmer
+                        prompt = "\
+I'm a programmer.\n\
+I'm a programmer too.\n\
+What programming language do you use?\n\
+I use Python.\n\
+What is your favorite programming language?\n\
+My favorite programming language is C++.\n\
+What is your favorite editor?\n\
+My favorite editor is Vim.\n";
+                        break;
+                    case '4':
+                        // Happy
+                        prompt = "\
+I'm happy.\n\
+I'm happy too.\n\
+What makes you happy?\n\
+I'm happy because I have a lot of friends.\n\
+Friendship is the most important thing in life.\n\
+I agree.\n\
+What is your favorite color?\n\
+My favorite color is blue.\n";
+                        break;
+                    case '5':
+                        // Sad
+                        prompt = "\
+Today is a sad day.\n\
+I'm sad too.\n\
+What makes you sad?\n\
+I'm sad because I have no friends.\n\
+Do you want to be my friend?\n\
+Yes, I would like to be your friend.\n\
+What is your favorite color?\n\
+My favorite color is blue.\n";
+                        break;
+                    case '6':
+                        // Philosophical
+                        prompt = "\
+What is the meaning of life?\n\
+The meaning of life is to be happy.\n\
+What is the meaning of death?\n\
+Ergo, the meaning of death is to be sad.\n\
+Who created us?\n\
+We were created by God.\n\
+What is God?\n\
+God is the creator of the universe.\n";
+                        break;
+                    case '7':
+                        // Angry
+                        prompt = "\
+Aargh!\n\
+I am so angry right now!\n\
+What makes you angry?\n\
+This guy is so annoying.\n\
+Why are you so angry?\n\
+My computer is broken.\n\
+Why is your computer broken?\n\
+I spilled coffee on it.\n";
+                        break;
+                    case '8':
+                        // Funny
+                        prompt = "\
+What is the funniest thing you have ever heard?\n\
+I heard a joke the other day.\n\
+Tell me the joke.\n\
+What do you call a cow with no legs?\n\
+Ground beef.\n\
+Haha, that's funny.\n\
+You know what else is funny?\n\
+The sound of a duck.\n";
+                        break;
+                    case '9':
+                        // Poetic
+                        prompt = "\
+Roses are red, violets are blue.\n\
+I am a poet, and so are you.\n\
+What is your favorite poem?\n\
+I like the poem 'The Raven' by Edgar Allan Poe.\n\
+It's a very sad poem.\n\
+You inspired me to write a poem.\n\
+Can you write a poem for me?\n\
+I wrote a poem for you.\n";
+                        break;
+                    case '10':
+                        // Clever
+                        prompt = "\
+How many people can you fit in a Volkswagen?\n\
+Two in the front, three in the back.\n\
+What is the square root of 144?\n\
+Twelve.\n\
+What is the capital of France?\n\
+Paris.\n\
+Who is the president of the United States?\n\
+It depends on the year.\n";
+                        break;
+                    case '11':
+                        // Cute
+                        prompt = "\
+What is your favorite animal?\n\
+I like cats - they are cute.\n\
+Could you be any cuter?\n\
+Yes, I could be cuter.\n\
+Aghhh, you are so cute!\n\
+I am not cute, I am handsome!\n\
+You are so handsome!\n\
+Aww, you are so sweet!\n";
+                        break;
+                    case '12':
+                        // Smart
+                        prompt = "\
+Tell me the first 10 digits of pi.\n\
+3.1415926535\n\
+What is the speed of light?\n\
+299,792,458 meters per second.\n\
+What is the square root of 144?\n\
+Twelve.\n\
+What is the capital of France?\n\
+Paris.\n";
+                        break;
+                    case '13':
+                        // Dumb
+                        prompt = "\
+I am so dumb.\n\
+I am not dumb.\n\
+You are dumb.\n\
+No, I am not dumb.\n\
+You are dumb.\n\
+No, I am not dumb.\n\
+You are dumb.\n\
+No, I am not dumb.\n";
+                        break;
+                    case '14':
+                        // Boring
+                        prompt = "\
+Why are you so quiet today?\n\
+I am bored.\n\
+You haven't said anything in 10 minutes.\n\
+Leave me alone.\n\
+Stop being so boring.\n\
+Stop being so annoying.\n\
+My life is boring.\n\
+I am not interesting.\n";
+                        break;
+                    case '15':
+                        // Exciting
+                        prompt = "\
+What is the most exciting thing that has ever happened to you?\n\
+I went to the moon!\n\
+What did you do on the moon?\n\
+I played golf and drank champagne!\n\
+Did you see this new crazy, awesome movie?\n\
+Oh yes! I totally loved it!\n\
+We should buy a boat and go sailing!\n\
+Yes, let's go sailing!\n";
+                        break;
+                    case '16':
+                        // Interesting
+                        prompt = "\
+What is the most interesting thing you have ever seen?\n\
+I saw a UFO once in the sky.\n\
+Wow, this is so interesting! Tell me more!\n\
+It was a flying saucer.\n\
+What did it look like?\n\
+It was silver and had a red light on top.\n\
+What did it do?\n\
+It flew away.\n";
+                        break;
+                    case '17':
+                        // William Shakespear
+                        prompt = "\
+To be or not to be, that is the question.\n\
+Whether 't is nobler in the mind to suffer\n\
+The slings and arrows of outrageous fortune,\n\
+Or to take arms against a sea of troubles,\n\
+And by opposing end them? To die, to sleep,\n\
+No more; and by a sleep to say we end\n\
+The heart-ache and the thousand natural shocks\n\
+That flesh is heir to, 'tis a consummation.\n";
+                        break;
+                    case '18':
+                        // J.R.R. Tolkien
+                        prompt = "\
+In a hole in the ground there lived a hobbit.\n\
+Not a nasty, dirty, wet hole, filled with the ends of worms\n\
+and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it\n\
+to sit down on or to eat: it was a hobbit-hole, and that means comfort.\n\
+It had a perfectly round door like a porthole, painted green,\n\
+with a shiny yellow brass knob in the exact middle.\n\
+The door opened on to a tube-shaped hall like a tunnel:\n";
+                        break;
+                    case '19':
+                        // George R.R. Martin
+                        prompt = "\
+A reader lives a thousand lives before he dies, said Jojen.\n\
+The man who never reads lives only one.\n\
+Theon Greyjoy had never been a reader.\n\
+Never forget what you are, for surely the world will not.\n\
+Make it your strength. Then it can never be your weaknessi\n\
+Armour yourself in it, and it will never be used to hurt you.\n\
+It was a lesson that Theon Greyjoy had never learned.\n\
+Theon Greyjoy had never been a reader.\n";
+                        break;
+                    case '20':
+                        // Stephen King
+                        prompt = "\
+The trust of the innocent is the liar's most useful tool.\n\
+The best way to keep a secret is from yourself.\n\
+Monsters are real, and ghosts are real too.\n\
+They live inside us, and sometimes, they win.\n\
+People think that I must be a very strange person.\n\
+They think that I sit around all day thinking up horrible things.\n\
+We make up horrors to help us cope with the real ones.\n\
+The only thing worse than a monster is a human monster.\n";
+                        break;
+                    default:
+                        prompt = "\
+Hello, how are you?\n\
+I'm fine, thanks. How are you?\n\
+Thanks, I'm fine too. What are you doing?\n\
+I'm just sitting here.\n\
+It's a lovely day, isn't it?\n\
+Yes, it is.\n\
+Did you know that I'm a robot?\n\
+I wasn't aware of that.\n";
+                        break;
+                }
+
+                Module.set_prompt(prompt);
+            }
+
        </script>
        <script type="text/javascript" src="talk.js"></script>
    </body>