ggml : upgrade init_tensor API to return a ggml_status (llama/11854)

* Upgrade init_tensor API to return a ggml_status

To prepare for an 'abort-free' ggml
(ggml not to abort on OOMs but return a OOM status),
as agreeed with Diego in the ggml repo,
upgrade the init_tensor() and view_init() APIs
to return a ggml_status.

* misc fixes

---------

Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
William Tambellini 2025-02-28 05:41:47 -08:00 committed by Georgi Gerganov
parent 3bab804981
commit c98681e6d5
13 changed files with 72 additions and 52 deletions

View File

@ -19,7 +19,7 @@ struct ggml_tallocr {
};
GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
// Graph allocator
/*

View File

@ -56,7 +56,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
@ -342,8 +342,8 @@ extern "C" {
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor);
// CPU buffer types are always available
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

View File

@ -89,7 +89,7 @@ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
return talloc;
}
void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
size = GGML_PAD(size, talloc->alignment);
@ -104,7 +104,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
assert(((uintptr_t)addr % talloc->alignment) == 0);
ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
return ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
}
// dynamic tensor allocator
@ -933,42 +933,51 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
// utils
static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
}
free(*buffers);
}
static bool alloc_tensor_range(struct ggml_context * ctx,
struct ggml_tensor * first, struct ggml_tensor * last,
ggml_backend_buffer_type_t buft, size_t size,
ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
}
free(*buffers);
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
free_buffers(buffers, n_buffers);
return false;
}
struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) {
ggml_backend_view_init(t);
}
} else {
if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
ggml_backend_view_init(t);
}
}
}
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
(*buffers)[(*n_buffers)++] = buffer;
struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
enum ggml_status status = GGML_STATUS_SUCCESS;
if (t->data == NULL) {
if (t->view_src == NULL) {
status = ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) {
status = ggml_backend_view_init(t);
}
} else {
if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
status = ggml_backend_view_init(t);
}
}
if (status != GGML_STATUS_SUCCESS) {
GGML_LOG_ERROR("%s: failed to initialize tensor %s\n", __func__, t->name);
free_buffers(buffers, n_buffers);
return false;
}
}
return true;
}

View File

@ -44,7 +44,7 @@ extern "C" {
// base address of the buffer
void * (*get_base) (ggml_backend_buffer_t buffer);
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
enum ggml_status (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
// tensor data access
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);

View File

@ -126,11 +126,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
return base;
}
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
// init_tensor is optional
if (buffer->iface.init_tensor) {
buffer->iface.init_tensor(buffer, tensor);
return buffer->iface.init_tensor(buffer, tensor);
}
return GGML_STATUS_SUCCESS;
}
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@ -1641,7 +1642,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
// utils
void ggml_backend_view_init(struct ggml_tensor * tensor) {
enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
@ -1649,10 +1650,10 @@ void ggml_backend_view_init(struct ggml_tensor * tensor) {
tensor->buffer = tensor->view_src->buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
return ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
}
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->data == NULL);
GGML_ASSERT(tensor->view_src == NULL);
@ -1662,7 +1663,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor
tensor->buffer = buffer;
tensor->data = addr;
ggml_backend_buffer_init_tensor(buffer, tensor);
return ggml_backend_buffer_init_tensor(buffer, tensor);
}
static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
@ -1708,7 +1709,8 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
ggml_backend_view_init(dst);
enum ggml_status status = ggml_backend_view_init(dst);
GGML_ASSERT(status == GGML_STATUS_SUCCESS);
}
else {
ggml_backend_tensor_copy(src, dst);
@ -1823,7 +1825,6 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
assert(g1->n_nodes == g2->n_nodes);
for (int i = 0; i < g1->n_nodes; i++) {
//printf("eval %d/%d\n", i, g1->n_nodes);
struct ggml_tensor * t1 = g1->nodes[i];
struct ggml_tensor * t2 = g2->nodes[i];

View File

@ -796,11 +796,11 @@ static bool need_transform(ggml_type type) {
* @param buffer The CANN buffer from which to initialize the tensor.
* @param tensor Pointer to the tensor to be initialized.
*/
static void ggml_backend_cann_buffer_init_tensor(
static enum ggml_status ggml_backend_cann_buffer_init_tensor(
ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
if (tensor->view_src != NULL && tensor->view_offs == 0) {
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
return;
return GGML_STATUS_SUCCESS;
}
// TODO: can backend doesn't support quantized yet. Just leave the code
@ -817,6 +817,7 @@ static void ggml_backend_cann_buffer_init_tensor(
memset_size, 0, memset_size));
}
}
return GGML_STATUS_SUCCESS;
}
// TODO: need handle tensor which has paddings.

View File

@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *) (buffer->context);
}
static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);
GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,

View File

@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
return nullptr;
}
static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
static enum ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));
GGML_UNUSED(buffer);
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,

View File

@ -540,12 +540,12 @@ static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
return ctx->dev_ptr;
}
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft);
return;
return GGML_STATUS_SUCCESS;
}
if (ggml_is_quantized(tensor->type) && tensor->view_src == nullptr && ggml_backend_buffer_get_usage(buffer) != GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
@ -558,6 +558,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
}
}
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@ -792,7 +793,7 @@ static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buff
GGML_UNUSED(buffer);
}
static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
static enum ggml_status ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@ -838,6 +839,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
}
}
tensor->extra = extra;
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

View File

@ -1211,7 +1211,7 @@ static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer)
GGML_UNUSED(buffer);
}
static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
ggml_cl2_init(buffer->buft->device);
@ -1251,6 +1251,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
tensor->extra = extra;
}
}
return GGML_STATUS_SUCCESS;
}
// The optimized gemm and gemv kernels are used for large matrices without batch.

View File

@ -464,7 +464,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
return result;
}
static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
// CUDA backend on the server pads everything to 512 due to CUDA limitations.
@ -478,6 +478,7 @@ static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, gg
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
}
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

View File

@ -323,14 +323,14 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
return ctx->dev_ptr;
}
static void
static enum ggml_status
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor) try {
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft);
return;
return GGML_STATUS_SUCCESS;
}
ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
@ -348,6 +348,7 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
padded_size - original_size).wait()));
}
}
return GGML_STATUS_SUCCESS;
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@ -729,7 +730,7 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
GGML_UNUSED(buffer);
}
static void
static enum ggml_status
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor) try {
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
@ -804,6 +805,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
}
}
tensor->extra = extra;
return GGML_STATUS_SUCCESS;
}
catch (sycl::exception const &exc) {
std::cerr << exc.what() << "Exception caught at file:" << __FILE__

View File

@ -7845,11 +7845,12 @@ static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
UNUSED(buffer);
}
static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
static enum ggml_status ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
VK_LOG_DEBUG("ggml_backend_vk_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")");
if (tensor->view_src != nullptr) {
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
}
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_vk_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {