sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) (#1422)

* sync : ggml (backend v2, k-quants, CUDA opts, Metal opts, etc.) * metal : allow env metal variable to override resource path (#1415) * Allow env variable to override resource path * Update ggml-metal.m --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * sync : restore common / main from `master` * sync : restore whisper from `master` * talk-llama : update to latest llama.cpp * ruby : fix build * ggml : fix 32-bit ARM build * ggml : fix MIN / MAX macro collisions + update ios bindings * ggml : fix ifdefs and MIN / MAX again * examples : fix Obj-C and Swift examples * ggml : fix 32-bit ARM compatibility * ggml : one more attempt to fix 32-bit ARM compat * whisper : fix support for larger graphs --------- Co-authored-by: Chris Raethke <codesoda@users.noreply.github.com>
2023-11-03 21:35:05 +02:00
parent 8a2bee6717
commit f96e1c5b78
38 changed files with 30875 additions and 7843 deletions
--- a/examples/common.h
+++ b/examples/common.h
@@ -17,10 +17,15 @@
 //

 struct gpt_params {
-    int32_t seed      = -1;  // RNG seed
-    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict = 200; // new tokens to predict
-    int32_t n_batch   = 8;   // batch size for prompt processing
+    int32_t seed         = -1;   // RNG seed
+    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict    = 200;  // new tokens to predict
+    int32_t n_parallel   = 1;    // number of parallel streams
+    int32_t n_batch      = 8;    // batch size for prompt processing
+    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
+    int32_t n_gpu_layers = 0;    // number of layers to offload to the GPU
+
+    bool ignore_eos = false; // ignore EOS token when generating text

    // sampling parameters
    int32_t top_k          = 40;
@@ -35,8 +40,6 @@ struct gpt_params {

    bool    interactive      = false;
    int32_t interactive_port = -1;
-
-    int32_t n_gpu_layers     = 0;
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);