mirror of https://github.com/ggerganov/whisper.cpp.git
synced 2025-02-28 16:12:46 +01:00
talk-llama : update to latest llama.cpp
This commit is contained in:
parent 80c1512fd5
commit 1ca4041b86
@@ -19,7 +19,7 @@ brew install sdl2
 make talk-llama
 
 # Run it
-./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
+./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
 ```
 
 - The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
@@ -36,7 +36,7 @@ This feature is especially helpful for maintaining context in long conversations
 Example usage:
 
 ```bash
-./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
+./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
 ```
 
 ## TTS
@@ -1,474 +0,0 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.

#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H

#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>

#include <string>
#include <vector>
#include <stdexcept>

#ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
        #endif
    #endif
#endif

#if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <io.h>
    #include <stdio.h> // for _fseeki64
#endif

#define LLAMA_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    LLAMA_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
        seek(0, SEEK_SET);
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        LLAMA_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, len, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
            throw std::runtime_error(std::string("unexpectedly reached end of file"));
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t len) const {
        if (len == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, len, 1, fp);
        if (ret != 1) {
            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

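To make the removed helper concrete, here is a minimal sketch of how `llama_file` could be driven: open a file, read the leading 32-bit magic and version, then rewind. The file path and the assumption that the first eight bytes hold magic/version are illustrative only, not part of the commit.

```cpp
// Hypothetical usage of the removed llama_file helper.
#include <cstdio>

static void inspect_header(const char * fname) {
    llama_file file(fname, "rb");                  // throws on failure to open
    const std::uint32_t magic   = file.read_u32(); // first 4 bytes of the file
    const std::uint32_t version = file.read_u32(); // next 4 bytes
    std::printf("size=%zu magic=0x%08x version=%u\n", file.size, magic, version);
    file.seek(0, SEEK_SET);                        // back to the start for the real loader
}
```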
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
    LPSTR buf;
    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
    if (!size) {
        return "FormatMessageA failed";
    }
    std::string ret(buf, size);
    LocalFree(buf);
    return ret;
}
#endif

struct llama_mmap {
    void * addr;
    size_t size;

    llama_mmap(const llama_mmap &) = delete;

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
#ifdef __linux__
        flags |= MAP_POPULATE;
#endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }

        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory
            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }
    }

    ~llama_mmap() {
        munmap(addr, size);
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();

        if (hMapping == NULL) {
            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
        if (prefetch) {
            // Advise the kernel to preload the mapped memory
            WIN32_MEMORY_RANGE_ENTRY range;
            range.VirtualAddress = addr;
            range.NumberOfBytes = (SIZE_T)size;
            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
            }
        }
#else
        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    llama_mmap(struct llama_file *, bool prefetch = true) {
        (void)prefetch;
        throw std::runtime_error(std::string("mmap not supported"));
    }
#endif
};

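A minimal sketch (not part of the commit) of how the two helpers above combine: map a file read-only and peek at its first byte. The path is a placeholder.

```cpp
// Sketch: map a file read-only via the removed helpers.
// "model.bin" is a placeholder path, not a file shipped with the project.
#include <cstdio>

static void map_and_peek(const char * path) {
    llama_file file(path, "rb");
    llama_mmap mapping(&file);                   // throws if mmap/MapViewOfFile fails
    if (mapping.size > 0) {
        const uint8_t * data = (const uint8_t *) mapping.addr;
        std::printf("mapped %zu bytes, first byte = 0x%02x\n", mapping.size, data[0]);
    }
}   // the mapping is unmapped and the file is closed automatically here
```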
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
    void * addr = NULL;
    size_t size = 0;
    bool failed_already = false;

    llama_mlock() {}
    llama_mlock(const llama_mlock &) = delete;

    ~llama_mlock() {
        if (size) {
            raw_unlock(addr, size);
        }
    }

    void init(void * ptr) {
        LLAMA_ASSERT(addr == NULL && size == 0);
        addr = ptr;
    }

    void grow_to(size_t target_size) {
        LLAMA_ASSERT(addr);
        if (failed_already) {
            return;
        }
        size_t granularity = lock_granularity();
        target_size = (target_size + granularity - 1) & ~(granularity - 1);
        if (target_size > size) {
            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
                size = target_size;
            } else {
                failed_already = true;
            }
        }
    }

#ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        return (size_t) sysconf(_SC_PAGESIZE);
    }

    #ifdef __APPLE__
        #define MLOCK_SUGGESTION \
            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
            "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
    #else
        #define MLOCK_SUGGESTION \
            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
    #endif

    bool raw_lock(const void * addr, size_t size) {
        if (!mlock(addr, size)) {
            return true;
        } else {
            char* errmsg = std::strerror(errno);
            bool suggest = (errno == ENOMEM);

            // Check if the resource limit is fine after all
            struct rlimit lock_limit;
            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
                suggest = false;
            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
                suggest = false;

            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
            return false;
        }
    }

    #undef MLOCK_SUGGESTION

    void raw_unlock(void * addr, size_t size) {
        if (munlock(addr, size)) {
            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
        }
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        return (size_t) si.dwPageSize;
    }

    bool raw_lock(void * ptr, size_t len) {
        for (int tries = 1; ; tries++) {
            if (VirtualLock(ptr, len)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                        len, size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

            // It failed but this was only the first try; increase the working
            // set size and try again.
            SIZE_T min_ws_size, max_ws_size;
            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
            // Per MSDN: "The maximum number of pages that a process can lock
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
            size_t increment = len + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += increment;
            max_ws_size += increment;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
        }
    }

    void raw_unlock(void * ptr, size_t len) {
        if (!VirtualUnlock(ptr, len)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    size_t lock_granularity() {
        return (size_t) 65536;
    }

    bool raw_lock(const void * addr, size_t len) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
        return false;
    }

    void raw_unlock(const void * addr, size_t len) {}
#endif
};

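A hedged sketch of how this class was meant to be driven, assuming a buffer allocated elsewhere; the buffer and its size below are invented for illustration (llama.cpp used it to pin model weights).

```cpp
// Sketch: pin an existing allocation in RAM so it cannot be swapped out.
#include <vector>

static void pin_buffer_example() {
    std::vector<uint8_t> weights(16 * 1024 * 1024); // stand-in for model weights

    llama_mlock lock;
    lock.init(weights.data());    // remember the base address (may only be done once)
    lock.grow_to(weights.size()); // lock the whole region, rounded up to page granularity
    // ... use the pinned memory ...
}   // destructor calls munlock/VirtualUnlock on the locked range
```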
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    llama_buffer() = default;

    void resize(size_t len) {
        delete[] addr;
        addr = new uint8_t[len];
        size = len;
    }

    ~llama_buffer() {
        delete[] addr;
    }

    // disable copy and move
    llama_buffer(const llama_buffer&) = delete;
    llama_buffer(llama_buffer&&) = delete;
    llama_buffer& operator=(const llama_buffer&) = delete;
    llama_buffer& operator=(llama_buffer&&) = delete;
};

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
    uint8_t * addr = NULL;
    bool is_cuda;
    size_t size = 0;

    llama_ctx_buffer() = default;

    void resize(size_t size) {
        free();

        addr = (uint8_t *) ggml_cuda_host_malloc(size);
        if (addr) {
            is_cuda = true;
        }
        else {
            // fall back to pageable memory
            addr = new uint8_t[size];
            is_cuda = false;
        }
        this->size = size;
    }

    void free() {
        if (addr) {
            if (is_cuda) {
                ggml_cuda_host_free(addr);
            }
            else {
                delete[] addr;
            }
        }
        addr = NULL;
    }

    ~llama_ctx_buffer() {
        free();
    }

    // disable copy and move
    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif

#endif
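For contrast with `std::vector<uint8_t>`, a short sketch of the point of `llama_buffer`: `resize()` leaves the contents uninitialized, which is exactly why it existed. The zeroing call is shown only to make the contrast explicit.

```cpp
// Sketch: llama_buffer allocates without zero-filling, unlike std::vector<uint8_t>.
#include <cstring>

static void scratch_example() {
    llama_buffer scratch;
    scratch.resize(64 * 1024 * 1024);      // 64 MiB, contents are indeterminate
    // The caller is expected to overwrite the memory before reading it,
    // e.g. by loading tensor data into scratch.addr.
    memset(scratch.addr, 0, scratch.size); // only if zeroing is actually needed
}   // memory released in the destructor
```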
File diff suppressed because it is too large
@@ -1,8 +1,16 @@
 #ifndef LLAMA_H
 #define LLAMA_H
 
+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdbool.h>
 
 #ifdef LLAMA_SHARED
@@ -19,18 +27,26 @@
 # define LLAMA_API
 #endif
 
-#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+#ifdef __GNUC__
+#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define DEPRECATED(func, hint) func
+#endif
+
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define LLAMA_FILE_VERSION 3
-#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -41,10 +57,57 @@ extern "C" {
     // TODO: show sample usage
     //
 
+    struct llama_model;
     struct llama_context;
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -60,67 +123,152 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx;        // text context
-        int n_gpu_layers; // number of layers to store in VRAM
-        int seed;         // RNG seed, -1 for random
+        uint32_t seed;         // RNG seed, -1 for random
+        int32_t  n_ctx;        // text context
+        int32_t  n_batch;      // prompt processing batch size
+        int32_t  n_gpu_layers; // number of layers to store in VRAM
+        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_scale; // RoPE frequency scaling factor
+
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };
 
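To show how the reshuffled `llama_context_params` is meant to be filled in, here is a small sketch against the updated header; the concrete values are illustrative choices, not recommendations from the commit.

```cpp
// Sketch: configure a context with the new fields (values are illustrative).
#include "llama.h"

static struct llama_context_params make_params() {
    struct llama_context_params params = llama_context_default_params();

    params.seed            = LLAMA_DEFAULT_SEED; // 0xFFFFFFFF -> pick a random seed
    params.n_ctx           = 2048;               // text context size
    params.n_batch         = 512;                // prompt processing batch size
    params.n_gpu_layers    = 0;                  // CPU only in this sketch
    params.rope_freq_base  = 10000.0f;           // plain RoPE, no frequency scaling
    params.rope_freq_scale = 1.0f;
    params.use_mmap        = true;
    params.f16_kv          = true;

    return params;
}
```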
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
+
+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;      // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+    } llama_model_quantize_params;
+
+    // grammar types
+    struct llama_grammar;
+
+    // grammar element type
+    enum llama_gretype {
+        // end of rule definition
+        LLAMA_GRETYPE_END            = 0,
+
+        // start of alternate definition for rule
+        LLAMA_GRETYPE_ALT            = 1,
+
+        // non-terminal element: reference to rule
+        LLAMA_GRETYPE_RULE_REF       = 2,
+
+        // terminal element: character (code point)
+        LLAMA_GRETYPE_CHAR           = 3,
+
+        // inverse char(s) ([^a], [^a-b] [^abc])
+        LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+        // be an inclusive range ([a-z])
+        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+        // modifies a preceding LLAMA_GRETYPE_CHAR or
+        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+        LLAMA_GRETYPE_CHAR_ALT       = 6,
     };
 
-    LLAMA_API struct llama_context_params llama_context_default_params();
+    typedef struct llama_grammar_element {
+        enum llama_gretype type;
+        uint32_t           value; // Unicode code point or rule ID
+    } llama_grammar_element;
 
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
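The new `llama_model_quantize_params` replaces the old (ftype, nthread) argument pair; a minimal sketch of a quantization call against the updated API follows, with placeholder file names.

```cpp
// Sketch: quantize an F16 GGUF model to Q4_K_M with the new params struct.
// The input/output paths are placeholders for this example.
#include "llama.h"

static int quantize_example() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target file type
    qparams.nthread = 8;                         // <= 0 would use hardware_concurrency()

    // returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
}
```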
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_backend_init(bool numa);
 
-    LLAMA_API int64_t llama_time_us();
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
+    LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,
             struct llama_context_params params);
 
+    LLAMA_API void llama_free_model(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_new_context_with_model(
+            struct llama_model * model,
+            struct llama_context_params params);
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    // TODO: not great API - very likely to change
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices   (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab    (const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd     (const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx      (const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd     (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
+
     // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype,
-            int nthread);
+            const llama_model_quantize_params * params);
 
     // Apply a LoRA adapter to a loaded model
     // path_base_model is the path to a higher quality model to use as a base for
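The loading API is now split into a model object and a context created from it; a minimal end-to-end sketch against the updated header (the model path is a placeholder):

```cpp
// Sketch: the new two-step load — model first, then a context on top of it.
// "ggml-model-q4_0.gguf" is a placeholder path.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init(false); // no NUMA optimizations in this sketch

    struct llama_context_params params = llama_context_default_params();

    struct llama_model * model = llama_load_model_from_file("ggml-model-q4_0.gguf", params);
    if (model == NULL) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    struct llama_context * ctx = llama_new_context_with_model(model, params);

    std::printf("n_vocab = %d, n_ctx = %d\n", llama_n_vocab(ctx), llama_n_ctx(ctx));

    // teardown mirrors the creation order
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```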
@@ -128,8 +276,15 @@ extern "C" {
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
     // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads),
+            "please use llama_model_apply_lora_from_file instead");
+
+    LLAMA_API int llama_model_apply_lora_from_file(
+            const struct llama_model * model,
             const char * path_lora,
             const char * path_base_model,
             int n_threads);
@@ -138,7 +293,7 @@ extern "C" {
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
@@ -168,21 +323,19 @@ extern "C" {
             int n_past,
             int n_threads);
 
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    LLAMA_API int llama_tokenize(
+    // Same as llama_eval, but use float matrix input directly.
+    LLAMA_API int llama_eval_embd(
             struct llama_context * ctx,
-            const char * text,
-            llama_token * tokens,
-            int n_max_tokens,
-            bool add_bos);
+            const float * embd,
+            int n_tokens,
+            int n_past,
+            int n_threads);
 
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    // parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
@@ -195,15 +348,75 @@ extern "C" {
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+
+    //
+    // Tokenization
+    //
+
+    // Convert the provided text into tokens.
+    // The tokens pointer must be large enough to hold the resulting tokens.
+    // Returns the number of tokens on success, no more than n_max_tokens
+    // Returns a negative number on failure - the number of tokens that would have been returned
+    LLAMA_API int llama_tokenize(
+            struct llama_context * ctx,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
+
+    LLAMA_API int llama_tokenize_with_model(
+            const struct llama_model * model,
+            const char * text,
+            llama_token * tokens,
+            int n_max_tokens,
+            bool add_bos);
+
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
+            const struct llama_context * ctx,
+            llama_token token,
+            char * buf,
+            int length);
+
+    LLAMA_API int llama_token_to_piece_with_model(
+            const struct llama_model * model,
+            llama_token token,
+            char * buf,
+            int length);
+
+    //
+    // Grammar
+    //
+
+    LLAMA_API struct llama_grammar * llama_grammar_init(
+            const llama_grammar_element ** rules,
+            size_t n_rules,
+            size_t start_rule_index);
+
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
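Since `llama_token_to_str` is gone, decoding now goes through `llama_token_to_piece`. A short sketch of a tokenize/detokenize round trip against the updated declarations above (the prompt and buffer size are arbitrary choices for the example):

```cpp
// Sketch: tokenize a prompt and print each token's piece with the new API.
#include "llama.h"
#include <cstdio>
#include <string>
#include <vector>

static void dump_tokens(struct llama_context * ctx, const std::string & prompt) {
    // generous upper bound: one token per byte, plus a slot for the BOS token
    std::vector<llama_token> tokens(prompt.size() + 1);
    const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(), /*add_bos=*/true);
    if (n < 0) {
        std::fprintf(stderr, "tokenization failed\n");
        return;
    }
    tokens.resize(n);

    for (const llama_token tok : tokens) {
        char buf[64];
        const int len = llama_token_to_piece(ctx, tok, buf, sizeof(buf)); // no null terminator written
        if (len >= 0) {
            std::printf("%6d -> '%.*s'\n", tok, len, buf);
        }
    }
}
```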
@@ -211,6 +424,16 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            struct llama_context * guidance_ctx,
+            float scale);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
@@ -227,6 +450,9 @@ extern "C" {
     LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
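A rough, heavily hedged sketch of how the new grammar entry points fit together: build a one-rule grammar by hand, constrain the candidates, sample, and advance the grammar. In practice the rule arrays are produced by llama.cpp's GBNF grammar parser rather than written by hand, and a real generation loop would keep the grammar object alive across tokens; everything here is illustrative.

```cpp
// Rough sketch (hand-built rules): constrain sampling to the literals "yes" or "no".
#include "llama.h"
#include <vector>

static llama_token sample_yes_no(struct llama_context * ctx, llama_token_data_array * candidates) {
    // rule 0 (root):  'y' 'e' 's'  |  'n' 'o'
    const std::vector<llama_grammar_element> root = {
        { LLAMA_GRETYPE_CHAR, 'y' }, { LLAMA_GRETYPE_CHAR, 'e' }, { LLAMA_GRETYPE_CHAR, 's' },
        { LLAMA_GRETYPE_ALT,   0  },
        { LLAMA_GRETYPE_CHAR, 'n' }, { LLAMA_GRETYPE_CHAR, 'o' },
        { LLAMA_GRETYPE_END,   0  },
    };
    const llama_grammar_element * rules[] = { root.data() };

    struct llama_grammar * grammar = llama_grammar_init(rules, 1, /*start_rule_index=*/0);

    llama_sample_grammar(ctx, candidates, grammar);            // mask tokens the grammar forbids
    const llama_token id = llama_sample_token(ctx, candidates);
    llama_grammar_accept_token(ctx, grammar, id);              // advance the grammar state

    llama_grammar_free(grammar);                               // a real loop would keep this alive
    return id;
}
```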
@@ -248,13 +474,60 @@ extern "C" {
     /// @details Randomly selects a token from the candidates based on their probabilities.
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
+    /// @details Accepts the sampled token into the grammar
+    LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float  p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob; // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;              // Number of elements in beam_views[].
+        size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+        bool   last_call;            // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
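The new beam-search entry point is callback driven. A hedged sketch of a callback that collects the shared prefix and marks beams that end in EOS; the surrounding state struct and the usage line are invented for this example, not part of the commit.

```cpp
// Sketch: a beam_search callback matching llama_beam_search_callback_fn_t.
#include "llama.h"
#include <vector>

struct beam_cb_state {
    struct llama_context * ctx;
    std::vector<llama_token> prefix; // tokens common to all beams so far
};

static void beam_callback(void * callback_data, struct llama_beams_state beams_state) {
    beam_cb_state & state = *static_cast<beam_cb_state *>(callback_data);

    // Tokens in the common prefix are final and will be shifted out of every beam,
    // so copy them now (the pointers are only valid during this callback).
    if (beams_state.common_prefix_length > 0) {
        const llama_beam_view & v = beams_state.beam_views[0];
        state.prefix.insert(state.prefix.end(), v.tokens, v.tokens + beams_state.common_prefix_length);
    }

    // Tell the search which beams are done: here, any beam whose last token is EOS.
    for (size_t i = 0; i < beams_state.n_beams; ++i) {
        llama_beam_view & beam = beams_state.beam_views[i];
        if (!beam.eob && beam.n_tokens > 0 && beam.tokens[beam.n_tokens - 1] == llama_token_eos(state.ctx)) {
            beam.eob = true;
        }
    }
}

// possible call site:
// llama_beam_search(ctx, beam_callback, &state, /*n_beams=*/4, n_past, /*n_predict=*/64, n_threads);
```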
@@ -264,10 +537,11 @@ extern "C" {
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
examples/talk-llama/speak — Normal file → Executable file (0 lines changed)
@@ -25,6 +25,20 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
     return res;
 }
 
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
 // command-line parameters
 struct whisper_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -235,7 +249,7 @@ int main(int argc, char ** argv) {
 
     // llama init
 
-    llama_init_backend();
+    llama_backend_init(true);
 
     auto lparams = llama_context_default_params();
 
@@ -244,7 +258,9 @@ int main(int argc, char ** argv) {
     lparams.seed = 1;
     lparams.f16_kv = true;
 
-    struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
+    struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lparams);
+
+    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lparams);
 
     // print some info about the processing
     {
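The diff only touches initialization; with the split model/context API an explicit teardown would plausibly look like the sketch below. talk-llama itself simply exits, so this helper is an assumption for illustration, not part of the commit.

```cpp
// Hypothetical teardown for the objects created above — not part of this commit.
static void shutdown_llama(struct llama_context * ctx_llama, struct llama_model * model_llama) {
    llama_print_timings(ctx_llama); // optional: report performance counters
    llama_free(ctx_llama);          // free the context first...
    llama_free_model(model_llama);  // ...then the model it was created from
    llama_backend_free();           // and finally the backend
}
```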
@@ -267,7 +283,6 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "\n");
     }
 
-
     // init audio
 
     audio_async audio(30*1000);
@@ -278,8 +293,6 @@ int main(int argc, char ** argv) {
 
     audio.resume();
 
-    int n_iter = 0;
-
     bool is_running  = true;
     bool force_speak = false;
 
@@ -514,7 +527,7 @@ int main(int argc, char ** argv) {
                 //printf("\n---\n");
                 //printf("resetting: '");
                 //for (int i = 0; i < (int) embd.size(); i++) {
-                //    printf("%s", llama_token_to_str(ctx_llama, embd[i]));
+                //    printf("%s", llama_token_to_piece(ctx_llama, embd[i]));
                 //}
                 //printf("'\n");
                 //printf("\n---\n");
@@ -582,7 +595,7 @@ int main(int argc, char ** argv) {
                 auto logits  = llama_get_logits(ctx_llama);
                 auto n_vocab = llama_n_vocab(ctx_llama);
 
-                logits[llama_token_eos()] = 0;
+                logits[llama_token_eos(ctx_llama)] = 0;
 
                 std::vector<llama_token_data> candidates;
                 candidates.reserve(n_vocab);
@@ -593,13 +606,13 @@ int main(int argc, char ** argv) {
                 llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
                 // apply repeat penalty
-                const float nl_logit = logits[llama_token_nl()];
+                const float nl_logit = logits[llama_token_nl(ctx_llama)];
 
                 llama_sample_repetition_penalty(ctx_llama, &candidates_p,
                         embd_inp.data() + std::max(0, n_past - repeat_last_n),
                         repeat_last_n, repeat_penalty);
 
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx_llama)] = nl_logit;
 
                 if (temp <= 0) {
                     // Greedy sampling
@@ -613,22 +626,22 @@ int main(int argc, char ** argv) {
                     }
                 }
 
-                if (id != llama_token_eos()) {
+                if (id != llama_token_eos(ctx_llama)) {
                     // add it to the context
                     embd.push_back(id);
 
-                    text_to_speak += llama_token_to_str(ctx_llama, id);
+                    text_to_speak += llama_token_to_piece(ctx_llama, id);
 
-                    printf("%s", llama_token_to_str(ctx_llama, id));
+                    printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
                 }
             }
 
             {
                 std::string last_output;
                 for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
-                    last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
+                    last_output += llama_token_to_piece(ctx_llama, embd_inp[i]);
                 }
-                last_output += llama_token_to_str(ctx_llama, embd[0]);
+                last_output += llama_token_to_piece(ctx_llama, embd[0]);
 
                 for (std::string & antiprompt : antiprompts) {
                     if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
@@ -655,8 +668,6 @@ int main(int argc, char ** argv) {
             }
 
             audio.clear();
-
-            ++n_iter;
         }
     }
 }