mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-11-07 08:34:37 +01:00
ggml : alloc ggml_contexts on the heap (#2525)
* whisper : reduce ggml_context usage * ggml : allocate contexts on the heap (v2) * ggml : aligned malloc -> malloc
This commit is contained in:
parent
19dca2bb14
commit
aa037a60f3
@ -217,7 +217,6 @@
|
|||||||
|
|
||||||
#define GGML_MAX_DIMS 4
|
#define GGML_MAX_DIMS 4
|
||||||
#define GGML_MAX_PARAMS 2048
|
#define GGML_MAX_PARAMS 2048
|
||||||
#define GGML_MAX_CONTEXTS 64
|
|
||||||
#define GGML_MAX_SRC 10
|
#define GGML_MAX_SRC 10
|
||||||
#define GGML_MAX_N_THREADS 512
|
#define GGML_MAX_N_THREADS 512
|
||||||
#define GGML_MAX_OP_PARAMS 64
|
#define GGML_MAX_OP_PARAMS 64
|
||||||
@ -657,6 +656,7 @@ extern "C" {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// scratch buffer
|
// scratch buffer
|
||||||
|
// TODO: deprecate and remove
|
||||||
struct ggml_scratch {
|
struct ggml_scratch {
|
||||||
size_t offs;
|
size_t offs;
|
||||||
size_t size;
|
size_t size;
|
||||||
@ -761,6 +761,7 @@ extern "C" {
|
|||||||
// main
|
// main
|
||||||
|
|
||||||
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
|
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
|
||||||
|
GGML_API void ggml_reset(struct ggml_context * ctx);
|
||||||
GGML_API void ggml_free (struct ggml_context * ctx);
|
GGML_API void ggml_free (struct ggml_context * ctx);
|
||||||
|
|
||||||
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||||
|
@ -3129,7 +3129,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||||||
|
|
||||||
// default buffer
|
// default buffer
|
||||||
static id<MTLDevice> g_backend_device = nil;
|
static id<MTLDevice> g_backend_device = nil;
|
||||||
static int g_backend_device_ref_count = 0;
|
static int g_backend_device_ref_count = 0; // TODO: make thread-safe
|
||||||
|
|
||||||
static id<MTLDevice> ggml_backend_metal_get_device(void) {
|
static id<MTLDevice> ggml_backend_metal_get_device(void) {
|
||||||
if (g_backend_device == nil) {
|
if (g_backend_device == nil) {
|
||||||
|
@ -308,6 +308,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define GGML_DEBUG 0
|
#define GGML_DEBUG 0
|
||||||
|
|
||||||
#define GGML_GELU_FP16
|
#define GGML_GELU_FP16
|
||||||
#define GGML_GELU_QUICK_FP16
|
#define GGML_GELU_QUICK_FP16
|
||||||
|
|
||||||
@ -3234,7 +3235,6 @@ struct ggml_numa_nodes {
|
|||||||
//
|
//
|
||||||
|
|
||||||
struct ggml_state {
|
struct ggml_state {
|
||||||
struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
|
|
||||||
struct ggml_numa_nodes numa;
|
struct ggml_numa_nodes numa;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -3816,17 +3816,12 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||||||
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
||||||
|
|
||||||
g_state = (struct ggml_state) {
|
g_state = (struct ggml_state) {
|
||||||
/*.contexts =*/ { { 0 } },
|
|
||||||
/*.numa =*/ {
|
/*.numa =*/ {
|
||||||
.n_nodes = 0,
|
.n_nodes = 0,
|
||||||
.total_cpus = 0,
|
.total_cpus = 0,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
|
|
||||||
g_state.contexts[i].used = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
||||||
@ -3839,26 +3834,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||||||
is_first_call = false;
|
is_first_call = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// find non-used context in g_state
|
|
||||||
struct ggml_context * ctx = NULL;
|
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
|
|
||||||
if (!g_state.contexts[i].used) {
|
|
||||||
g_state.contexts[i].used = true;
|
|
||||||
ctx = &g_state.contexts[i].context;
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx == NULL) {
|
|
||||||
GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
|
|
||||||
|
|
||||||
ggml_critical_section_end();
|
ggml_critical_section_end();
|
||||||
|
|
||||||
return NULL;
|
struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
|
||||||
}
|
|
||||||
|
|
||||||
// allow to call ggml_init with 0 size
|
// allow to call ggml_init with 0 size
|
||||||
if (params.mem_size == 0) {
|
if (params.mem_size == 0) {
|
||||||
@ -3886,42 +3864,31 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||||||
|
|
||||||
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
|
||||||
|
|
||||||
ggml_critical_section_end();
|
|
||||||
|
|
||||||
return ctx;
|
return ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_reset(struct ggml_context * ctx) {
|
||||||
|
if (ctx == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx->n_objects = 0;
|
||||||
|
ctx->objects_begin = NULL;
|
||||||
|
ctx->objects_end = NULL;
|
||||||
|
ctx->scratch = (struct ggml_scratch) { 0, 0, NULL, };
|
||||||
|
ctx->scratch_save = (struct ggml_scratch) { 0, 0, NULL, };
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_free(struct ggml_context * ctx) {
|
void ggml_free(struct ggml_context * ctx) {
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// make this function thread safe
|
|
||||||
ggml_critical_section_start();
|
|
||||||
|
|
||||||
bool found = false;
|
|
||||||
|
|
||||||
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
|
|
||||||
if (&g_state.contexts[i].context == ctx) {
|
|
||||||
g_state.contexts[i].used = false;
|
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
|
|
||||||
__func__, i, ggml_used_mem(ctx));
|
|
||||||
|
|
||||||
if (ctx->mem_buffer_owned) {
|
if (ctx->mem_buffer_owned) {
|
||||||
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
found = true;
|
GGML_FREE(ctx);
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!found) {
|
|
||||||
GGML_PRINT_DEBUG("%s: context not found\n", __func__);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_critical_section_end();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_used_mem(const struct ggml_context * ctx) {
|
size_t ggml_used_mem(const struct ggml_context * ctx) {
|
||||||
|
@ -699,9 +699,9 @@ struct whisper_kv_cache {
|
|||||||
struct ggml_tensor * k;
|
struct ggml_tensor * k;
|
||||||
struct ggml_tensor * v;
|
struct ggml_tensor * v;
|
||||||
|
|
||||||
struct ggml_context * ctx = nullptr;
|
|
||||||
|
|
||||||
ggml_backend_buffer_t buffer = nullptr;
|
ggml_backend_buffer_t buffer = nullptr;
|
||||||
|
|
||||||
|
std::vector<uint8_t> ctx_buf;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct whisper_model {
|
struct whisper_model {
|
||||||
@ -941,9 +941,11 @@ static bool whisper_kv_cache_init(
|
|||||||
const int64_t n_mem = n_text_layer*n_ctx;
|
const int64_t n_mem = n_text_layer*n_ctx;
|
||||||
const int64_t n_elements = n_text_state*n_mem;
|
const int64_t n_elements = n_text_state*n_mem;
|
||||||
|
|
||||||
|
cache.ctx_buf.resize(2*ggml_tensor_overhead());
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/*.mem_size =*/ 2*ggml_tensor_overhead(),
|
/*.mem_size =*/ cache.ctx_buf.size(),
|
||||||
/*.mem_buffer =*/ nullptr,
|
/*.mem_buffer =*/ cache.ctx_buf.data(),
|
||||||
/*.no_alloc =*/ true,
|
/*.no_alloc =*/ true,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -953,17 +955,17 @@ static bool whisper_kv_cache_init(
|
|||||||
cache.cells.clear();
|
cache.cells.clear();
|
||||||
cache.cells.resize(n_ctx);
|
cache.cells.resize(n_ctx);
|
||||||
|
|
||||||
cache.ctx = ggml_init(params);
|
struct ggml_context * ctx = ggml_init(params);
|
||||||
|
|
||||||
if (!cache.ctx) {
|
if (!ctx) {
|
||||||
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
|
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
cache.k = ggml_new_tensor_1d(ctx, wtype, n_elements);
|
||||||
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
cache.v = ggml_new_tensor_1d(ctx, wtype, n_elements);
|
||||||
|
|
||||||
cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
|
cache.buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||||
if (!cache.buffer) {
|
if (!cache.buffer) {
|
||||||
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
|
WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
@ -971,13 +973,13 @@ static bool whisper_kv_cache_init(
|
|||||||
|
|
||||||
ggml_backend_buffer_clear(cache.buffer, 0);
|
ggml_backend_buffer_clear(cache.buffer, 0);
|
||||||
|
|
||||||
|
ggml_free(ctx);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void whisper_kv_cache_free(struct whisper_kv_cache & cache) {
|
static void whisper_kv_cache_free(struct whisper_kv_cache & cache) {
|
||||||
ggml_free(cache.ctx);
|
|
||||||
ggml_backend_buffer_free(cache.buffer);
|
ggml_backend_buffer_free(cache.buffer);
|
||||||
cache.ctx = nullptr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool whisper_kv_cache_find_slot(
|
static bool whisper_kv_cache_find_slot(
|
||||||
@ -2002,7 +2004,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
|
|||||||
|
|
||||||
auto & kv_pad = wstate.kv_pad;
|
auto & kv_pad = wstate.kv_pad;
|
||||||
|
|
||||||
WHISPER_ASSERT(!!kv_pad.ctx);
|
WHISPER_ASSERT(!!kv_pad.buffer);
|
||||||
|
|
||||||
const int n_ctx_pad = GGML_PAD(n_ctx, 256);
|
const int n_ctx_pad = GGML_PAD(n_ctx, 256);
|
||||||
|
|
||||||
@ -2416,7 +2418,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
|
|||||||
|
|
||||||
auto & kv_self = wstate.kv_self;
|
auto & kv_self = wstate.kv_self;
|
||||||
|
|
||||||
WHISPER_ASSERT(!!kv_self.ctx);
|
WHISPER_ASSERT(!!kv_self.buffer);
|
||||||
|
|
||||||
const int n_ctx = kv_self.size;
|
const int n_ctx = kv_self.size;
|
||||||
const int n_state = hparams.n_text_state;
|
const int n_state = hparams.n_text_state;
|
||||||
|
Loading…
Reference in New Issue
Block a user