diff --git a/examples/talk-llama/llama-util.h b/examples/talk-llama/llama-util.h index ca4dd162..3cac9f68 100644 --- a/examples/talk-llama/llama-util.h +++ b/examples/talk-llama/llama-util.h @@ -14,6 +14,7 @@ #include #include +#include #ifdef __has_include #if __has_include() @@ -74,7 +75,7 @@ struct llama_file { llama_file(const char * fname, const char * mode) { fp = std::fopen(fname, mode); if (fp == NULL) { - throw format("failed to open %s: %s", fname, std::strerror(errno)); + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } seek(0, SEEK_END); size = tell(); @@ -100,17 +101,17 @@ struct llama_file { LLAMA_ASSERT(ret == 0); // same } - void read_raw(void * ptr, size_t size) { - if (size == 0) { + void read_raw(void * ptr, size_t len) const { + if (len == 0) { return; } errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); + std::size_t ret = std::fread(ptr, len, 1, fp); if (ferror(fp)) { - throw format("read error: %s", strerror(errno)); + throw std::runtime_error(format("read error: %s", strerror(errno))); } if (ret != 1) { - throw std::string("unexpectedly reached end of file"); + throw std::runtime_error(std::string("unexpectedly reached end of file")); } } @@ -126,14 +127,14 @@ struct llama_file { return std::string(chars.data(), len); } - void write_raw(const void * ptr, size_t size) { - if (size == 0) { + void write_raw(const void * ptr, size_t len) const { + if (len == 0) { return; } errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); + size_t ret = std::fwrite(ptr, len, 1, fp); if (ret != 1) { - throw format("write error: %s", strerror(errno)); + throw std::runtime_error(format("write error: %s", strerror(errno))); } } @@ -171,7 +172,7 @@ struct llama_mmap { #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, bool prefetch = true) { + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) { size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; @@ -180,12 +181,12 @@ struct llama_mmap { #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { - throw format("mmap failed: %s", strerror(errno)); + throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } - if (prefetch) { + if (prefetch > 0) { // Advise the kernel to preload the mapped memory - if (madvise(addr, file->size, MADV_WILLNEED)) { + if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) { fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno)); } @@ -207,7 +208,7 @@ struct llama_mmap { DWORD error = GetLastError(); if (hMapping == NULL) { - throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); + throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); } addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); @@ -215,7 +216,7 @@ struct llama_mmap { CloseHandle(hMapping); if (addr == NULL) { - throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); + throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 @@ -243,8 +244,9 @@ struct llama_mmap { #else static constexpr bool SUPPORTED = false; - llama_mmap(struct llama_file *) { - throw std::string("mmap not supported"); + llama_mmap(struct llama_file *, bool prefetch = true) { + (void)prefetch; + throw std::runtime_error(std::string("mmap not supported")); } #endif }; @@ -265,9 +267,9 @@ struct llama_mlock { } } - void init(void * addr) { - LLAMA_ASSERT(this->addr == NULL && this->size == 0); - this->addr = addr; + void init(void * ptr) { + LLAMA_ASSERT(addr == NULL && size == 0); + addr = ptr; } void grow_to(size_t target_size) { @@ -338,14 +340,14 @@ struct llama_mlock { return (size_t) si.dwPageSize; } - bool raw_lock(void * addr, size_t size) { + bool raw_lock(void * ptr, size_t len) { for (int tries = 1; ; tries++) { - if (VirtualLock(addr, size)) { + if (VirtualLock(ptr, len)) { return true; } if (tries == 2) { fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", - size, this->size, llama_format_win_err(GetLastError()).c_str()); + len, size, llama_format_win_err(GetLastError()).c_str()); return false; } @@ -361,7 +363,7 @@ struct llama_mlock { // is equal to the number of pages in its minimum working set minus // a small overhead." // Hopefully a megabyte is enough overhead: - size_t increment = size + 1048576; + size_t increment = len + 1048576; // The minimum must be <= the maximum, so we need to increase both: min_ws_size += increment; max_ws_size += increment; @@ -373,8 +375,8 @@ struct llama_mlock { } } - void raw_unlock(void * addr, size_t size) { - if (!VirtualUnlock(addr, size)) { + void raw_unlock(void * ptr, size_t len) { + if (!VirtualUnlock(ptr, len)) { fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", llama_format_win_err(GetLastError()).c_str()); } @@ -382,11 +384,16 @@ struct llama_mlock { #else static constexpr bool SUPPORTED = false; - void raw_lock(const void * addr, size_t size) { - fprintf(stderr, "warning: mlock not supported on this system\n"); + size_t lock_granularity() { + return (size_t) 65536; } - void raw_unlock(const void * addr, size_t size) {} + bool raw_lock(const void * addr, size_t len) { + fprintf(stderr, "warning: mlock not supported on this system\n"); + return false; + } + + void raw_unlock(const void * addr, size_t len) {} #endif }; @@ -395,36 +402,70 @@ struct llama_buffer { uint8_t * addr = NULL; size_t size = 0; - void resize(size_t size) { + llama_buffer() = default; + + void resize(size_t len) { delete[] addr; - addr = new uint8_t[size]; - this->size = size; + addr = new uint8_t[len]; + size = len; } ~llama_buffer() { delete[] addr; } + + // disable copy and move + llama_buffer(const llama_buffer&) = delete; + llama_buffer(llama_buffer&&) = delete; + llama_buffer& operator=(const llama_buffer&) = delete; + llama_buffer& operator=(llama_buffer&&) = delete; }; #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" struct llama_ctx_buffer { uint8_t * addr = NULL; + bool is_cuda; size_t size = 0; + llama_ctx_buffer() = default; + void resize(size_t size) { - if (addr) { - ggml_cuda_host_free(addr); - } + free(); + addr = (uint8_t *) ggml_cuda_host_malloc(size); + if (addr) { + is_cuda = true; + } + else { + // fall back to pageable memory + addr = new uint8_t[size]; + is_cuda = false; + } this->size = size; } - ~llama_ctx_buffer() { + void free() { if (addr) { - ggml_cuda_host_free(addr); + if (is_cuda) { + ggml_cuda_host_free(addr); + } + else { + delete[] addr; + } } + addr = NULL; } + + ~llama_ctx_buffer() { + free(); + } + + // disable copy and move + llama_ctx_buffer(const llama_ctx_buffer&) = delete; + llama_ctx_buffer(llama_ctx_buffer&&) = delete; + llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete; + llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete; }; #else typedef llama_buffer llama_ctx_buffer; diff --git a/examples/talk-llama/llama.cpp b/examples/talk-llama/llama.cpp index 98f49abd..4cbc8d6b 100644 --- a/examples/talk-llama/llama.cpp +++ b/examples/talk-llama/llama.cpp @@ -1,6 +1,7 @@ // Defines fileno on msys: #ifndef _GNU_SOURCE #define _GNU_SOURCE +#include #include #include #endif @@ -45,6 +46,7 @@ enum e_model { MODEL_65B, }; + static const size_t MB = 1024*1024; // computed for n_ctx == 2048 @@ -110,7 +112,7 @@ struct llama_hparams { enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { - return memcmp(this, &other, sizeof(llama_hparams)); + return static_cast(memcmp(this, &other, sizeof(llama_hparams))); } }; @@ -406,6 +408,7 @@ enum llama_file_version { LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab LLAMA_FILE_VERSION_GGJT_V1, // added padding LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format + LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format }; struct llama_file_loader { @@ -424,24 +427,30 @@ struct llama_file_loader { } void read_magic() { uint32_t magic = file.read_u32(); - uint32_t version = 0; - if (magic != 'ggml') { - version = file.read_u32(); - } - - if (magic == 'ggml' && version == 0) { + if (magic == LLAMA_FILE_MAGIC_GGML) { file_version = LLAMA_FILE_VERSION_GGML; - } else if (magic == 'ggmf' && version == 1) { - file_version = LLAMA_FILE_VERSION_GGMF_V1; - } else if (magic == 'ggjt' && version == 1) { - file_version = LLAMA_FILE_VERSION_GGJT_V1; - } else if (magic == 'ggjt' && version == 2) { - file_version = LLAMA_FILE_VERSION_GGJT_V2; - } else { - throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", - magic, version); + return; } + + uint32_t version = file.read_u32(); + + switch (magic) { + case LLAMA_FILE_MAGIC_GGMF: + switch (version) { + case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return; + } + break; + case LLAMA_FILE_MAGIC_GGJT: + switch (version) { + case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return; + case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return; + case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return; + } + } + + throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", + magic, version); } void read_hparams() { hparams.n_vocab = file.read_u32(); @@ -499,7 +508,7 @@ struct llama_file_loader { if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) { // skip to the next multiple of 32 bytes - file.seek(-file.tell() & 31, SEEK_CUR); + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); } shard.file_idx = file_idx; shard.file_off = file.tell(); @@ -574,7 +583,7 @@ struct llama_file_saver { file.write_u32(new_type); file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size()); file.write_raw(tensor.name.data(), tensor.name.size()); - file.seek(-file.tell() & 31, SEEK_CUR); + file.seek(-static_cast(file.tell()) & 31, SEEK_CUR); LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type)); file.write_raw(new_data, new_size); } @@ -641,7 +650,7 @@ struct llama_model_loader { } } - struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne) { + struct ggml_tensor * get_tensor(const std::string & name, const std::vector & ne, ggml_backend backend) { auto it = tensors_map.name_to_idx.find(name); if (it == tensors_map.name_to_idx.end()) { throw format("llama.cpp: tensor '%s' is missing from model", name.c_str()); @@ -652,10 +661,10 @@ struct llama_model_loader { name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()); } - return get_tensor_for(lt); + return get_tensor_for(lt, backend); } - struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) { + struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) { struct ggml_tensor * tensor; if (lt.ne.size() == 2) { tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1)); @@ -665,6 +674,7 @@ struct llama_model_loader { } ggml_set_name(tensor, lt.name.c_str()); LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor + tensor->backend = backend; lt.ggml_tensor = tensor; num_ggml_tensors_created++; return tensor; @@ -678,12 +688,16 @@ struct llama_model_loader { void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; + size_t prefetch_size = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { data_size += lt.size; + if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { + prefetch_size += lt.size; + } } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loaders.at(0)->file)); + mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size)); if (!lmlock) { // Don't call the callback since the actual loading will be lazy // and we can't measure it. @@ -696,6 +710,9 @@ struct llama_model_loader { size_t done_size = 0; for (llama_load_tensor & lt : tensors_map.tensors) { + if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) { + continue; + } if (progress_callback) { progress_callback((float) done_size / data_size, progress_callback_user_data); } @@ -708,9 +725,6 @@ struct llama_model_loader { lmlock->grow_to(done_size); } } - if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); - } } void load_data_for(llama_load_tensor & lt) { @@ -812,10 +826,9 @@ static bool kv_cache_init( struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, - /*.n_parts =*/ -1, /*.gpu_layers =*/ 0, /*.seed =*/ -1, - /*.f16_kv =*/ false, + /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, @@ -836,6 +849,21 @@ bool llama_mlock_supported() { return llama_mlock::SUPPORTED; } +void llama_init_backend() { + ggml_time_init(); + + // needed to initialize f16 tables + { + struct ggml_init_params params = { 0, NULL, false }; + struct ggml_context * ctx = ggml_init(params); + ggml_free(ctx); + } +} + +int64_t llama_time_us() { + return ggml_time_us(); +} + // // model loading // @@ -845,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) { case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; - case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; + case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; + case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; } return "unknown"; @@ -925,11 +954,19 @@ static void llama_model_load_internal( fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } - if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { + if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { if (hparams.ftype != LLAMA_FTYPE_ALL_F32 && hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 && hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { - throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); + throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"); + } + } + + if (file_version < LLAMA_FILE_VERSION_GGJT_V3) { + if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || + hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 || + hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { + throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"); } } @@ -942,27 +979,7 @@ static void llama_model_load_internal( size_t ctx_size; size_t mmapped_size; ml->calc_sizes(&ctx_size, &mmapped_size); - fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0); - - // print memory requirements - { - const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; - - // this is the total memory required to run the inference - const size_t mem_required = - ctx_size + - mmapped_size + - MEM_REQ_SCRATCH0().at(model.type) + - MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at(model.type); - - // this is the memory required by one llama_state - const size_t mem_required_state = - scale*MEM_REQ_KV_SELF().at(model.type); - - fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); - } + fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0); // create the ggml context { @@ -984,7 +1001,14 @@ static void llama_model_load_internal( } } +#ifdef GGML_USE_CUBLAS +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA +#else +#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU +#endif + // prepare memory for the weights + size_t vram_total = 0; { const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; @@ -992,33 +1016,87 @@ static void llama_model_load_internal( ml->ggml_ctx = ctx; - model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}); - model.norm = ml->get_tensor("norm.weight", {n_embd}); - model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}); + model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU); + + // "output" tensor + { + ggml_backend backend_output; + if (n_gpu_layers > int(n_layer)) { // NOLINT + backend_output = LLAMA_BACKEND_OFFLOAD; + } else { + backend_output = GGML_BACKEND_CPU; + } + + model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); + } + + const int i_gpu_start = n_layer - n_gpu_layers; model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + auto & layer = model.layers[i]; std::string layers_i = "layers." + std::to_string(i); - layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}); + layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend); - layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}); + layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}); + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend); + + if (backend == GGML_BACKEND_CUDA) { + vram_total += + ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) + + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + } } } ml->done_getting_tensors(); + // print memory requirements + { + const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; + + // this is the total memory required to run the inference + const size_t mem_required = + ctx_size + + mmapped_size - vram_total + // weights in VRAM not in memory + MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH1().at(model.type) + + MEM_REQ_EVAL().at(model.type); + + // this is the memory required by one llama_state + const size_t mem_required_state = + scale*MEM_REQ_KV_SELF().at(model.type); + + fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + +#ifdef GGML_USE_CUBLAS + const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + + fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu); + if (n_gpu_layers > (int) hparams.n_layer) { + fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__); + } + fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); +#else + (void) n_gpu_layers; +#endif + } + // populate `tensors_by_name` for (llama_load_tensor & lt : ml->tensors_map.tensors) { model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); @@ -1026,36 +1104,34 @@ static void llama_model_load_internal( ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); - model.mapping = std::move(ml->mapping); #ifdef GGML_USE_CUBLAS { - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - - fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu); - - size_t vram_total = 0; - - for (int i = 0; i < n_gpu; ++i) { - const auto & layer = model.layers[i]; - - ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq); - ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk); - ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv); - ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo); - ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1); - ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2); - ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3); + size_t done_size = 0; + size_t data_size = 0; + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + data_size += lt.size; + if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) { + done_size += lt.size; + } } - if (n_gpu_layers > (int) hparams.n_layer) { - fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__); - ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output); + for (llama_load_tensor & lt : ml->tensors_map.tensors) { + if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) { + continue; + } + if (progress_callback) { + progress_callback((float) done_size / data_size, progress_callback_user_data); + } + ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off); + done_size += lt.size; } - - fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); } -#else - (void) n_gpu_layers; -#endif +#endif // GGML_USE_CUBLAS + + if (progress_callback) { + progress_callback(1.0f, progress_callback_user_data); + } + + model.mapping = std::move(ml->mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration @@ -1154,10 +1230,8 @@ static bool llama_eval_internal( { cur = ggml_rms_norm(ctx0, inpL); - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].attention_norm, cur), - cur); + // cur = cur*attention_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm); } // self-attention @@ -1264,10 +1338,8 @@ static bool llama_eval_internal( { cur = ggml_rms_norm(ctx0, inpFF); - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), - cur); + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, @@ -1304,10 +1376,8 @@ static bool llama_eval_internal( inpL = ggml_rms_norm(ctx0, inpL); - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model.norm, inpL), - inpL); + // inpL = inpL*norm(broadcasted) + inpL = ggml_mul(ctx0, inpL, model.norm); embeddings = inpL; } @@ -2131,7 +2201,7 @@ struct llama_context * llama_init_from_file( unsigned * cur_percentage_p = (unsigned *) ctx; unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { - ++*cur_percentage_p; + *cur_percentage_p = percentage; fprintf(stderr, "."); fflush(stderr); if (percentage >= 100) { @@ -2224,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * { uint32_t magic; fin.read((char *) &magic, sizeof(magic)); - if (magic != 'ggla') { + if (magic != LLAMA_FILE_MAGIC_GGLA) { fprintf(stderr, "%s: bad file magic\n", __func__); return 1; } @@ -2288,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * // maybe this should in llama_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false)); + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0)); } } @@ -2381,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * } size_t idx = model_loader->tensors_map.name_to_idx[base_name]; llama_load_tensor & lt = model_loader->tensors_map.tensors[idx]; - base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }); + base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); lt.data = (uint8_t *) lt.ggml_tensor->data; model_loader->load_data_for(lt); lt.ggml_tensor->data = lt.data; @@ -2607,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { } // Sets the state reading from the specified source address -size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { - const uint8_t * inp = src; +size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { + uint8_t * inp = src; // set rng { diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h index 21cba8cf..37bae535 100644 --- a/examples/talk-llama/llama.h +++ b/examples/talk-llama/llama.h @@ -19,10 +19,16 @@ # define LLAMA_API #endif -#define LLAMA_FILE_VERSION 2 -#define LLAMA_FILE_MAGIC 'ggjt' -#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' -#define LLAMA_SESSION_MAGIC 'ggsn' +#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' + +#define LLAMA_FILE_VERSION 3 +#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT +#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML +#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 1 #ifdef __cplusplus @@ -40,9 +46,9 @@ extern "C" { typedef int llama_token; typedef struct llama_token_data { - llama_token id; // token id - float logit; // log-odds of the token - float p; // probability of the token + llama_token id; // token id + float logit; // log-odds of the token + float p; // probability of the token } llama_token_data; typedef struct llama_token_data_array { @@ -55,7 +61,6 @@ extern "C" { struct llama_context_params { int n_ctx; // text context - int n_parts; // -1 for default int n_gpu_layers; // number of layers to store in VRAM int seed; // RNG seed, -1 for random @@ -74,16 +79,16 @@ extern "C" { // model file types enum llama_ftype { - LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_ALL_F32 = 0, + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors }; LLAMA_API struct llama_context_params llama_context_default_params(); @@ -91,6 +96,13 @@ extern "C" { LLAMA_API bool llama_mmap_supported(); LLAMA_API bool llama_mlock_supported(); + // TODO: not great API - very likely to change + // Initialize the llama + ggml backend + // Call once at the start of the program + LLAMA_API void llama_init_backend(); + + LLAMA_API int64_t llama_time_us(); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. // Return NULL on failure @@ -139,7 +151,7 @@ extern "C" { // Set the state reading from the specified address // Returns the number of bytes read - LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src); + LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); // Save/load session file LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp index 45b8cb73..cdeb2d9b 100644 --- a/examples/talk-llama/talk-llama.cpp +++ b/examples/talk-llama/talk-llama.cpp @@ -33,8 +33,6 @@ struct whisper_params { int32_t max_tokens = 32; int32_t audio_ctx = 0; - int32_t n_parts_llama = -1; - float vad_thold = 0.6f; float freq_thold = 100.0f; @@ -72,7 +70,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } - else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); } else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } @@ -123,7 +120,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str()); - fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama); fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str()); fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", ""); fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n"); @@ -239,13 +235,14 @@ int main(int argc, char ** argv) { // llama init + llama_init_backend(); + auto lparams = llama_context_default_params(); // tune these to your liking lparams.n_ctx = 2048; lparams.seed = 1; lparams.f16_kv = true; - lparams.n_parts = params.n_parts_llama; struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);