move BLAS to a separate backend (llama/6210)

* move BLAS to a separate backend

* rename GGML_USE_OPENBLAS to GGML_USE_BLAS

* alloc : reuse the same buffer when the same buffer type is used multiple times

* set number of threads automatically for openblas and blis

* sched : print assignments when GGML_SCHED_DEBUG env variable is set

* sched : allow ops with weights on an incompatible buffer type

This will cause the weight to be copied to a backend that supports the
op, which is very costly. The weight should have been stored in a buffer
of a backend that can run the op, but llama.cpp cannot do this
automatically at the moment.
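
For reference, the check the scheduler now performs before taking that costly path is simply the conjunction of the two queries this change introduces. A minimal sketch, assuming an initialized backend and the weight's buffer type are at hand (the helper name is hypothetical, not part of this commit):

    // hypothetical helper: true when `backend` can run `op` without first
    // copying a weight stored in a buffer of type `buft`
    static bool can_run_without_copy(ggml_backend_t backend,
                                     const struct ggml_tensor * op,
                                     ggml_backend_buffer_type_t buft) {
        return ggml_backend_supports_op(backend, op) &&
               ggml_backend_supports_buft(backend, buft);
    }

When this is false for every backend, the scheduler falls back to copying the weight to a backend that does support the op.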

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
slaren, 2024-06-13 03:11:35 +02:00, committed by Georgi Gerganov
parent d8b7a24bc9
commit 174a461fc6
11 changed files with 373 additions and 353 deletions

ggml-alloc.c

@@ -339,6 +339,7 @@ struct hash_node {
 };
 
 struct tensor_alloc {
+    int buffer_id;
     size_t offset;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };
@@ -349,7 +350,6 @@ struct leaf_alloc {
 };
 
 struct node_alloc {
-    int buffer_id;
     struct tensor_alloc dst;
     struct tensor_alloc src[GGML_MAX_SRC];
 };
@@ -386,9 +386,20 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     for (int i = 0; i < n_bufs; i++) {
         galloc->bufts[i] = bufts[i];
         galloc->buffers[i] = NULL;
+
+        // check if the same buffer type is used multiple times and reuse the same allocator
+        for (int j = 0; j < i; j++) {
+            if (bufts[i] == bufts[j]) {
+                galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+                break;
+            }
+        }
+
+        if (galloc->buf_tallocs[i] == NULL) {
             size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
             galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+        }
     }
     galloc->n_buffers = n_bufs;
 
     return galloc;
@@ -405,12 +416,32 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers != NULL) {
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buffers[j] == galloc->buffers[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
                 ggml_backend_buffer_free(galloc->buffers[i]);
+            }
         }
         if (galloc->buf_tallocs != NULL) {
+            // skip if already freed
+            bool freed = false;
+            for (int j = 0; j < i; j++) {
+                if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                    freed = true;
+                    break;
+                }
+            }
+            if (!freed) {
                 ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+            }
         }
     }
 
     free(galloc->hash_set.keys);
     free(galloc->hash_values);
@@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
     }
 }
 
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
     // graph outputs are never freed
     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
         AT_PRINTF("not freeing output %s\n", node->name);
         return;
     }
 
-    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
-    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
     size_t offset = hn->offset;
+    int buffer_id = hn->buffer_id;
+    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
     hn->allocated = false;
@@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
                 AT_PRINTF("view_src %s: %d children, %d views\n",
                     view_src->name, view_src_hn->n_children, view_src_hn->n_views);
                 if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
-                    ggml_gallocr_free_node(galloc, view_src, buffer_id);
+                    ggml_gallocr_free_node(galloc, view_src);
                 }
             }
             else if (p_hn->allocated) {
-                ggml_gallocr_free_node(galloc, parent, buffer_id);
+                ggml_gallocr_free_node(galloc, parent);
             }
         }
         AT_PRINTF("\n");
@@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
-        node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
         if (node->view_src || node->data) {
+            node_alloc->dst.buffer_id = -1;
             node_alloc->dst.offset = SIZE_MAX;
            node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+            node_alloc->dst.buffer_id = hn->buffer_id;
             node_alloc->dst.offset = hn->offset;
             node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
+                node_alloc->src[j].buffer_id = -1;
                 node_alloc->src[j].offset = SIZE_MAX;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+                node_alloc->src[j].buffer_id = hn->buffer_id;
                 node_alloc->src[j].offset = hn->offset;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
@@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
+            galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
+            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
             galloc->leaf_allocs[i].leaf.offset = hn->offset;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
@@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
+        // if the buffer type is used multiple times, we reuse the same buffer
+        for (int j = 0; j < i; j++) {
+            if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+                galloc->buffers[i] = galloc->buffers[j];
+                break;
+            }
+        }
+
         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
@@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
 #ifndef NDEBUG
             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+
             ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
@@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
 }
 
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+    int buffer_id = tensor_alloc->buffer_id;
     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
 
     if (tensor->view_src != NULL) {
@@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
     }
 }
 
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
@@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
 
-        if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+        if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
 #endif
@@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
             if (src == NULL) {
                 continue;
             }
 
-            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+            if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
@@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
-        ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+        ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
     }
 
     // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 continue;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
     }
 
     return true;
@@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
     if (galloc->buffers[buffer_id] == NULL) {
         return 0;
     }
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+            // this buffer is the same as a previous one due to the same buffer type being used multiple times
+            // only return the buffer size the first time it appears to avoid double counting
+            return 0;
+        }
+    }
+
     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }
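
The buffer reuse above is observable through the public ggml-alloc API. A small sketch, assuming a graph has already been built and reserved; the duplicated CPU buffer type is just an illustration:

    // the same buffer type passed twice now shares one allocator and one buffer
    ggml_backend_buffer_type_t bufts[2] = {
        ggml_backend_cpu_buffer_type(),
        ggml_backend_cpu_buffer_type(), // duplicate on purpose
    };
    ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);
    // ... ggml_gallocr_reserve(galloc, graph) ...
    size_t s0 = ggml_gallocr_get_buffer_size(galloc, 0); // size of the shared buffer
    size_t s1 = ggml_gallocr_get_buffer_size(galloc, 1); // 0, to avoid double counting
    ggml_gallocr_free(galloc); // the shared buffer is freed only once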

ggml-backend-impl.h

@@ -17,13 +17,15 @@ extern "C" {
     struct ggml_backend_buffer_type_i {
         const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        // allocate a buffer of this type
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // tensor alignment
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
+        // max buffer size that can be allocated
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
+        // data size needed to allocate the tensor, including padding
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
         // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
         bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
     };
@@ -92,27 +94,37 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
+        // create a new plan for a graph
         ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph with a plan
+        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        // compute the graph with the plan
         enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan (async)
         enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // check if the backend supports an operation
+        // check if the backend can compute an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
         // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
         // these should be expensive operations with large batch sizes that may benefit from running on this backend
         // even if the weight has to be copied from the CPU temporarily
         bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
 
         // (optional) event synchronization
+        // create a new event that can record events on this backend instance
         ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
         void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
+        // record an event on the backend instance that created it
         void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
+        // wait for an event on a different backend instance
         void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
+        // block until an event is recorded
         void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
     };
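
For backend implementers, the per-buffer-type supports_backend callback is gone; each backend now answers for itself via supports_buft. A sketch of the implementation pattern used by the backends below, with hypothetical "mybackend" names:

    static bool ggml_backend_mybackend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
        // a buffer type is identified by its get_name function pointer, so a
        // pointer comparison is enough to recognize this backend's own buffers
        return buft->iface.get_name == ggml_backend_mybackend_buffer_type_get_name;
        GGML_UNUSED(backend);
    }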

ggml-backend.c

@@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
     return ggml_nbytes(tensor);
 }
 
-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return buft->iface.supports_backend(buft, backend);
-}
-
 bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
     if (buft->iface.is_host) {
         return buft->iface.is_host(buft);
@@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }
 
+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return backend->iface.supports_buft(backend, buft);
+}
+
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     if (backend->iface.offload_op != NULL) {
         return backend->iface.offload_op(backend, op);
@@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
     GGML_UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
@@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-           /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
@@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(backend);
+}
+
 static struct ggml_backend_i cpu_backend_i = {
     /* .get_name                = */ ggml_backend_cpu_name,
     /* .free                    = */ ggml_backend_cpu_free,
@@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ ggml_backend_cpu_graph_plan_create,
     /* .graph_plan_free         = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .supports_buft           = */ ggml_backend_cpu_supports_buft,
     /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
@@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
 
+    int * prev_node_backend_ids; // [graph_size]
+    int * prev_leaf_backend_ids; // [graph_size]
+
     // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
@@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;
 
+    bool debug;
+
     // align context_buffer to GGML_MEM_ALIGN
 #ifdef _MSC_VER
     __declspec(align(GGML_MEM_ALIGN))
@@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
     return -1;
 }
 
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
     ggml_backend_buffer_t buffer = tensor->buffer;
     if (buffer == NULL) {
         return -1;
     }
 
-    // find highest prio backend that supports the buffer type
+    // find highest prio backend that supports the buffer type and the op
     for (int i = 0; i < sched->n_backends; i++) {
-        if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+        if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+            ggml_backend_supports_op(sched->backends[i], op)) {
            return i;
         }
     }
 
-    fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
-            __func__, ggml_backend_buffer_name(buffer), tensor->name);
-    GGML_ASSERT(false);
+#ifndef NDEBUG
+    fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+            __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif
 
     return -1;
 }
@@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op
 
     // assign pre-allocated nodes to their backend
-    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
     if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
         return cur_backend_id;
@@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 
     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
         if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
             return cur_backend_id;
@@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
             if (src_backend_id == sched->n_backends - 1) {
                 for (int b = 0; b < src_backend_id; b++) {
@@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     }
 }
 
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+    ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+    ggml_backend_buffer_type_t buft = NULL;
+
+    if (buf) {
+        // the tensor is already allocated
+        buft = buf->buft;
+    } else {
+        // see if the tensor already has a backend assigned, and use the buffer type of that backend
+        int tensor_backend_id = tensor_backend_id(t);
+        if (tensor_backend_id == -1 && t->view_src) {
+            tensor_backend_id = tensor_backend_id(t->view_src);
+        }
+        if (tensor_backend_id != -1) {
+            buft = sched->bufts[tensor_backend_id];
+        }
+    }
+
+    return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+}
+
+static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+    if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+        *node_backend_id = cur_backend_id;
+        SET_CAUSE(node, "2.sup");
+    }
+}
 
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             }
         }
     }
-#ifdef DEBUG_PASS1
-    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 2: expand current backend assignments
     // assign the same backend to adjacent nodes
     // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
 
-    // pass 2.2 expand gpu down
+    // expand gpu down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.2");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
 
-    // pass 2.1 expand gpu up
+    // expand gpu up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 } else {
                     cur_backend_id = *node_backend_id;
                 }
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.1");
+            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
 
-    // pass 2.4 expand rest down
+    // expand rest down
     {
         int cur_backend_id = -1;
         for (int i = 0; i < graph->n_nodes; i++) {
@@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.4");
+            } else if (cur_backend_id != -1) {
                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
 
-    // pass 2.3 expand rest up
+    // expand rest up
     {
         int cur_backend_id = -1;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
@@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             int * node_backend_id = &tensor_backend_id(node);
             if (*node_backend_id != -1) {
                 cur_backend_id = *node_backend_id;
-            } else {
-                *node_backend_id = cur_backend_id;
-                SET_CAUSE(node, "2.3");
+            } else if (cur_backend_id != -1) {
+                ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
             }
         }
     }
-#ifdef DEBUG_PASS2
-    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
-    // pass 3: assign backends to remaining src from dst and view_src
+    // pass 3: upgrade nodes to higher prio backends with compatible buffer types
+    // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+    // however, we also need to verify that the sources are in compatible buffer types
+    // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+    // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
+    // additionally, set remaining unassigned nodes to the backend with the most supported inputs
+    // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id == -1) {
+            // unassigned node: find the backend with the most supported inputs
+            int n_supported_best = -1;
+            for (int b = 0; b < sched->n_backends; b++) {
+                if (ggml_backend_supports_op(sched->backends[b], node)) {
+                    int n_supported = 0;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            n_supported++;
+                        }
+                    }
+                    if (n_supported > n_supported_best) {
+                        n_supported_best = n_supported;
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.best");
+                    }
+                }
+            }
+        } else {
+            // assigned node: upgrade to higher prio backend if possible
+            for (int b = 0; b < *node_backend_id; b++) {
+                if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+                    bool supported = true;
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * src = node->src[j];
+                        if (src == NULL) {
+                            continue;
+                        }
+                        if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+                            supported = false;
+                            break;
+                        }
+                    }
+                    if (supported) {
+                        *node_backend_id = b;
+                        SET_CAUSE(node, "3.upg");
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    // pass 4: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * cur_backend_id = &tensor_backend_id(node);
         if (node->view_src != NULL && *cur_backend_id == -1) {
             *cur_backend_id = tensor_backend_id(node->view_src);
-            SET_CAUSE(node, "3.vsrc");
+            SET_CAUSE(node, "4.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
@@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src->view_src != NULL) {
                 // views are always on the same backend as the source
                 *src_backend_id = tensor_backend_id(src->view_src);
-                SET_CAUSE(src, "3.vsrc");
+                SET_CAUSE(src, "4.vsrc");
             } else {
                 *src_backend_id = *cur_backend_id;
-                SET_CAUSE(src, "3.cur");
+                SET_CAUSE(src, "4.cur");
             }
         }
     }
-#ifdef DEBUG_PASS3
-    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 }
             }
 
             // check if the split has too many inputs
+            // FIXME: count the number of inputs instead of only checking when full
             if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                 const size_t id = hash_id(src);
                 int src_backend_id = sched->tensor_backend_id[id];
-                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+                if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
                     //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                     need_new_split = true;
                     break;
@@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }
 
-                if (src_backend_id != node_backend_id) {
+                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+                if (src_backend_id != cur_backend_id && !supported) {
                     // create a copy of the input in the split's backend
                     const size_t id = hash_id(src);
                     if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
@@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         split->i_end = graph->n_nodes;
         sched->n_splits = i_split + 1;
     }
-#ifdef DEBUG_PASS4
-    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+
+    if (sched->debug) {
+        ggml_backend_sched_print_assignments(sched, graph);
+    }
+
+    // swap node_backend_ids and leaf_backend_ids and prevs
+    {
+        int * tmp = sched->node_backend_ids;
+        sched->node_backend_ids = sched->prev_node_backend_ids;
+        sched->prev_node_backend_ids = tmp;
+
+        tmp = sched->leaf_backend_ids;
+        sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+        sched->prev_leaf_backend_ids = tmp;
+    }
 
     // create copies of the graph for each split
     // TODO: avoid this copy
@@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 }
 
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+    bool backend_ids_changed = false;
+    for (int i = 0; i < sched->graph->n_nodes; i++) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+            backend_ids_changed = true;
+            break;
+        }
+    }
+    if (!backend_ids_changed) {
+        for (int i = 0; i < sched->graph->n_leafs; i++) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+                backend_ids_changed = true;
+                break;
+            }
+        }
+    }
+
     // allocate graph
-    if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
@@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
+    sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
     // initialize hash table
     sched->hash_set          = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
@@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+    sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+    sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
@@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
         sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
-        GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+        GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
         if (sched->n_copies > 1) {
             for (int c = 0; c < sched->n_copies; c++) {
                 sched->events[b][c] = ggml_backend_event_new(backends[b]);
@@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     free(sched->tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
+    free(sched->prev_node_backend_ids);
+    free(sched->prev_leaf_backend_ids);
     free(sched);
 }
@@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
+    SET_CAUSE(node, "usr");
 }
 
 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
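
The compile-time DEBUG_PASSn macros are gone; assignment dumps are now enabled at runtime through the GGML_SCHED_DEBUG environment variable. A sketch, assuming backends, n_backends and graph_size are already set up (the five-argument ggml_backend_sched_new signature is an assumption based on the API at the time of this commit):

    // must be set before ggml_backend_sched_new(), which reads it once
    setenv("GGML_SCHED_DEBUG", "1", 1);
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, NULL /* default buffer types */, n_backends, graph_size, false /* parallel */);

Equivalently, launch the process with GGML_SCHED_DEBUG set in the environment.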

ggml-backend.h

@@ -23,7 +23,6 @@ extern "C" {
     GGML_API           size_t ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
     GGML_API           size_t ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
     GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
     GGML_API           bool   ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
 
     // buffer
@@ -74,6 +73,7 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
     // tensor copy between different backends
@@ -90,7 +90,7 @@ extern "C" {
     GGML_API void ggml_backend_event_free        (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_record      (ggml_backend_event_t event);
     GGML_API void ggml_backend_event_synchronize (ggml_backend_event_t event);
-    GGML_API void ggml_backend_event_wait        (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
+    GGML_API void ggml_backend_event_wait        (ggml_backend_t backend, ggml_backend_event_t event);
 
     //
     // CPU backend
@@ -119,7 +119,7 @@ extern "C" {
     GGML_API size_t         ggml_backend_reg_get_count(void);
     GGML_API size_t         ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
     GGML_API const char *   ggml_backend_reg_get_name(size_t i);
     GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
     GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);

ggml-cuda.cu

@@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
     return ctx->name.c_str();
 }
 
+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
@@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
     GGML_UNUSED(buft);
 }
 
-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    if (!ggml_backend_is_cuda(backend)) {
-        return false;
-    }
-
-    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
-    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
-    return buft_ctx->device == cuda_ctx->device;
-}
-
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
@@ -863,6 +855,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
     GGML_UNUSED(buft);
 }
 
+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
@@ -906,12 +902,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }
 
-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    GGML_UNUSED(buft);
-}
-
 GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
@@ -924,7 +914,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .get_alignment    = */ ggml_backend_cuda_split_buffer_type_get_alignment,
     /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
     /* .get_alloc_size   = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
@@ -1024,7 +1013,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
             /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
         /* .context = */ nullptr,
@@ -2879,6 +2867,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     GGML_UNUSED(backend);
 }
 
+GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (ggml_backend_buft_is_cuda_split(buft)) {
+        return true;
+    }
+
+    if (ggml_backend_buft_is_cuda(buft)) {
+        ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+        ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+        return buft_ctx->device == cuda_ctx->device;
+    }
+
+    return false;
+}
+
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
@@ -2951,9 +2953,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .synchronize             = */ ggml_backend_cuda_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
+    /* .supports_buft           = */ ggml_backend_cuda_supports_buft,
     /* .offload_op              = */ ggml_backend_cuda_offload_op,
     /* .event_new               = */ ggml_backend_cuda_event_new,
     /* .event_free              = */ ggml_backend_cuda_event_free,

ggml-kompute.cpp

@@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
     return ctx->max_alloc;
 }
 
-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    GGML_UNUSED(buft);
-    return ggml_backend_is_kompute(backend);
-}
-
 static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_kompute_buffer_type_get_name,
     /* .alloc_buffer     = */ ggml_backend_kompute_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_kompute_buffer_type_get_alignment,
     /* .get_max_size     = */ ggml_backend_vk_buffer_type_get_max_size,
     /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
     /* .is_host          = */ NULL,
 };
@@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
     return ggml_vk_supports_op(op);
 }
 
+static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(backend);
+    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+}
+
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name                = */ ggml_backend_kompute_name,
     /* .free                    = */ ggml_backend_kompute_free,
@@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .synchronize             = */ NULL,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
+    /* .supports_buft           = */ ggml_backend_kompute_supports_buft,
     /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,


@ -3044,12 +3044,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
UNUSED(buft); UNUSED(buft);
} }
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
UNUSED(buft);
}
GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true; return true;
@ -3064,7 +3058,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host, /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
}, },
/* .context = */ NULL, /* .context = */ NULL,
@ -3179,6 +3172,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
return ggml_metal_supports_op(metal_ctx, op); return ggml_metal_supports_op(metal_ctx, op);
} }
GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
UNUSED(backend);
}
static struct ggml_backend_i ggml_backend_metal_i = { static struct ggml_backend_i ggml_backend_metal_i = {
/* .get_name = */ ggml_backend_metal_name, /* .get_name = */ ggml_backend_metal_name,
/* .free = */ ggml_backend_metal_free, /* .free = */ ggml_backend_metal_free,
@ -3189,9 +3188,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
/* .synchronize = */ NULL, /* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL, /* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL, /* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute, /* .graph_compute = */ ggml_backend_metal_graph_compute,
/* .supports_op = */ ggml_backend_metal_supports_op, /* .supports_op = */ ggml_backend_metal_supports_op,
/* .supports_buft = */ ggml_backend_metal_supports_buft,
/* .offload_op = */ NULL, /* .offload_op = */ NULL,
/* .event_new = */ NULL, /* .event_new = */ NULL,
/* .event_free = */ NULL, /* .event_free = */ NULL,


@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
return ggml_nbytes(tensor); return ggml_nbytes(tensor);
} }
GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
if (!ggml_backend_is_rpc(backend)) {
return false;
}
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
return buft_ctx->endpoint == rpc_ctx->endpoint;
}
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
/* .get_name = */ ggml_backend_rpc_buffer_type_name, /* .get_name = */ ggml_backend_rpc_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_rpc_get_max_size, /* .get_max_size = */ ggml_backend_rpc_get_max_size,
/* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
/* .is_host = */ NULL, /* .is_host = */ NULL,
}; };
@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const
return false; return false;
} }
GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
return false;
}
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
return buft_ctx->endpoint == rpc_ctx->endpoint;
}
static ggml_backend_i ggml_backend_rpc_interface = { static ggml_backend_i ggml_backend_rpc_interface = {
/* .get_name = */ ggml_backend_rpc_name, /* .get_name = */ ggml_backend_rpc_name,
/* .free = */ ggml_backend_rpc_free, /* .free = */ ggml_backend_rpc_free,
@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
/* .synchronize = */ ggml_backend_rpc_synchronize, /* .synchronize = */ ggml_backend_rpc_synchronize,
/* .graph_plan_create = */ NULL, /* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL, /* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .graph_compute = */ ggml_backend_rpc_graph_compute,
/* .supports_op = */ ggml_backend_rpc_supports_op, /* .supports_op = */ ggml_backend_rpc_supports_op,
/* .supports_buft = */ ggml_backend_rpc_supports_buft,
/* .offload_op = */ NULL, /* .offload_op = */ NULL,
/* .event_new = */ NULL, /* .event_new = */ NULL,
/* .event_free = */ NULL, /* .event_free = */ NULL,


@ -16575,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
UNUSED(buft); UNUSED(buft);
} }
GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
if (!ggml_backend_is_sycl(backend)) {
return false;
}
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
return buft_ctx->device == sycl_ctx->device;
}
static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .get_name = */ ggml_backend_sycl_buffer_type_name, /* .get_name = */ ggml_backend_sycl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
/* .is_host = */ nullptr, /* .is_host = */ nullptr,
}; };
@ -16942,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
return total_size; return total_size;
} }
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_sycl(backend);
UNUSED(buft);
}
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false; return false;
@ -16960,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
/* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
}; };
@ -17046,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
}, },
/* .context = */ nullptr, /* .context = */ nullptr,
@ -17311,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
GGML_UNUSED(backend); GGML_UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
return false;
}
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
return buft_ctx->device == sycl_ctx->device;
}
static ggml_backend_i ggml_backend_sycl_interface = { static ggml_backend_i ggml_backend_sycl_interface = {
/* .get_name = */ ggml_backend_sycl_name, /* .get_name = */ ggml_backend_sycl_name,
@ -17322,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .synchronize = */ ggml_backend_sycl_synchronize, /* .synchronize = */ ggml_backend_sycl_synchronize,
/* .graph_plan_create = */ NULL, /* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL, /* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .graph_compute = */ ggml_backend_sycl_graph_compute,
/* .supports_op = */ ggml_backend_sycl_supports_op, /* .supports_op = */ ggml_backend_sycl_supports_op,
/* .supports_buft = */ ggml_backend_sycl_supports_buft,
/* .offload_op = */ ggml_backend_sycl_offload_op, /* .offload_op = */ ggml_backend_sycl_offload_op,
/* .event_new = */ NULL, /* .event_new = */ NULL,
/* .event_free = */ NULL, /* .event_free = */ NULL,


@ -6142,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
UNUSED(buft); UNUSED(buft);
} }
GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
if (!ggml_backend_is_vk(backend)) {
return false;
}
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
return buft_ctx->ctx->idx == ctx->idx;
}
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
/* .get_name = */ ggml_backend_vk_buffer_type_name, /* .get_name = */ ggml_backend_vk_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
/* .is_host = */ NULL, /* .is_host = */ NULL,
}; };
@ -6235,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
}, },
/* .context = */ nullptr, /* .context = */ nullptr,
@ -6551,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
UNUSED(backend); UNUSED(backend);
} }
GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
return false;
}
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
return buft_ctx->ctx->idx == ctx->idx;
}
// TODO: enable async and synchronize // TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = { static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name, /* .get_name = */ ggml_backend_vk_name,
@ -6562,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize, /* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL, /* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL, /* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL, /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute, /* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op, /* .supports_op = */ ggml_backend_vk_supports_op,
/* .supports_buft = */ ggml_backend_vk_supports_buft,
/* .offload_op = */ ggml_backend_vk_offload_op, /* .offload_op = */ ggml_backend_vk_offload_op,
/* .event_new = */ NULL, /* .event_new = */ NULL,
/* .event_free = */ NULL, /* .event_free = */ NULL,

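The ggml.c hunks below delete the inline BLAS path from mul_mat and out_prod (it now lives in the separate BLAS backend). For reference, the CBLAS call shape the removed branch relied on — row-major D = Y·Xᵀ via cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ...) — as a standalone example, assuming an OpenBLAS-style cblas.h (link with e.g. -lopenblas):

#include <cblas.h>
#include <stdio.h>

int main(void) {
    // Y is 2x3, X is 4x3; with TransB the product is D = Y * X^T, a 2x4 matrix,
    // mirroring cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne1, ne01, ne10, ...).
    float Y[2*3] = { 1, 2, 3,
                     4, 5, 6 };
    float X[4*3] = { 1, 0, 0,
                     0, 1, 0,
                     0, 0, 1,
                     1, 1, 1 };
    float D[2*4] = { 0 };
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                /*M=*/2, /*N=*/4, /*K=*/3,
                1.0f, Y, /*lda=*/3,
                      X, /*ldb=*/3,
                0.0f, D, /*ldc=*/4);
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 4; j++) {
            printf("%5.1f", D[i*4 + j]); // expect: 1 2 3 6 / 4 5 6 15
        }
        printf("\n");
    }
    return 0;
}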
ggml.c

@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#if defined(GGML_USE_ACCELERATE) #if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h> #include <Accelerate/Accelerate.h>
#elif defined(GGML_USE_OPENBLAS)
#if defined(GGML_BLAS_USE_MKL)
#include <mkl.h>
#else
#include <cblas.h>
#endif
#endif #endif
// floating point type used to accumulate sums // floating point type used to accumulate sums
@ -12179,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
// ggml_compute_forward_mul_mat // ggml_compute_forward_mul_mat
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
//const int64_t ne00 = src0->ne[0];
//const int64_t ne01 = src0->ne[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
// all the experts for each batch element and the processing would become incredibly slow
// TODO: find the optimal values for these
if (dst->op != GGML_OP_MUL_MAT_ID &&
ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
//src0->type == GGML_TYPE_F32 &&
src1->type == GGML_TYPE_F32 &&
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
}
return false;
}
#endif
static void ggml_compute_forward_mul_mat_one_chunk( static void ggml_compute_forward_mul_mat_one_chunk(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst, struct ggml_tensor * dst,
@ -12349,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
// nb01 >= nb00 - src0 is not transposed // nb01 >= nb00 - src0 is not transposed
// compute by src0 rows // compute by src0 rows
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
const int64_t ne_plane = ne01*ne00;
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
UNUSED(desired_wsize);
if (params->type == GGML_TASK_TYPE_INIT) {
if (type != GGML_TYPE_F32) {
assert(params->wsize >= desired_wsize);
// parallelize by src0 rows
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
// broadcast src0 into src1 across 2nd,3rd dimension
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
ggml_to_float_t const to_float = type_traits[type].to_float;
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
}
}
}
}
return;
}
if (params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
// perform sgemm, parallelization controlled by blas lib
if (ith != 0) {
return;
}
//const int64_t tgemm0 = ggml_perf_time_us();
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
const int64_t i03 = i13/r3;
const int64_t i02 = i12/r2;
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
if (type != GGML_TYPE_F32) {
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
ne1, ne01, ne10,
1.0f, y, ne10,
x, ne00,
0.0f, d, ne01);
}
}
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
return;
}
#endif
#if GGML_USE_LLAMAFILE #if GGML_USE_LLAMAFILE
const bool src1_cont = ggml_is_contiguous(src1); const bool src1_cont = ggml_is_contiguous(src1);
@ -12796,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
// nb01 >= nb00 - src0 is not transposed // nb01 >= nb00 - src0 is not transposed
// compute by src0 rows // compute by src0 rows
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
bool use_blas = ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
#endif
if (params->type == GGML_TASK_TYPE_INIT) { if (params->type == GGML_TASK_TYPE_INIT) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
if (use_blas) {
return;
}
#endif
if (ith != 0) { if (ith != 0) {
return; return;
} }
@ -12820,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
return; return;
} }
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (use_blas) {
if (params->ith != 0) { // All threads other than the first do no work.
return;
}
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
// src0: (k,n)
// src1: (k,m)
// dst: (m,n)
//
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
// Also expressed as (major,minor)
// a: (m,k): so src1 transposed
// b: (k,n): so src0
// c: (m,n)
//
// However, if ggml_is_transposed(src1) is true, then
// src1->data already contains a transposed version, so sgemm mustn't
// transpose it further.
int n = src0->ne[0];
int k = src0->ne[1];
int m = src1->ne[0];
int transposeA, lda;
if (!ggml_is_transposed(src1)) {
transposeA = CblasTrans;
lda = m;
} else {
transposeA = CblasNoTrans;
lda = k;
}
float * a = (float *) ((char *) src1->data);
float * b = (float *) ((char *) src0->data);
float * c = (float *) ((char *) dst->data);
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
return;
}
#endif
// dst[:,:,:,:] = 0 // dst[:,:,:,:] = 0
// for i2,i3: // for i2,i3:
// for i1: // for i1:
@ -12993,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
// nb01 >= nb00 - src0 is not transposed // nb01 >= nb00 - src0 is not transposed
// compute by src0 rows // compute by src0 rows
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (params->type == GGML_TASK_TYPE_INIT) { if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) { if (ith != 0) {
return; return;
@ -13391,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
dequantize_row_q( dequantize_row_q(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@ -13434,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
ggml_fp16_to_fp32_row( ggml_fp16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@ -13477,6 +13317,8 @@ static void ggml_compute_forward_get_rows_bf16(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
ggml_bf16_to_fp32_row( ggml_bf16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@ -13520,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
assert(i01 >= 0 && i01 < ne01);
ggml_vec_cpy_f32(nc, ggml_vec_cpy_f32(nc,
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@ -18893,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
switch (node->op) { switch (node->op) {
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CONT:
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_ADD1: case GGML_OP_ADD1:
case GGML_OP_ACC: case GGML_OP_ACC:
@ -18977,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
} break; } break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_SET: case GGML_OP_SET:
case GGML_OP_CONT:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:
@ -19138,7 +18982,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
} }
*node_n = atomic_load(&state->shared->node_n); *node_n = atomic_load(&state->shared->node_n);
if (* node_n != last_node_n) break; if (*node_n != last_node_n) {
break;
}
#if defined(__SSE3__) #if defined(__SSE3__)
// Tell the processor we're spinning. It's a processor hint for spinlocks. // Tell the processor we're spinning. It's a processor hint for spinlocks.
_mm_pause(); _mm_pause();
@ -19156,7 +19003,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
} }
*task_phase = atomic_load(&state->shared->node_task); *task_phase = atomic_load(&state->shared->node_task);
if (* task_phase != last_task_phase) break; if (*task_phase != last_task_phase) {
break;
}
#if defined(__SSE3__) #if defined(__SSE3__)
// Tell the processor we're spinning. It's a processor hint for spinlocks. // Tell the processor we're spinning. It's a processor hint for spinlocks.
_mm_pause(); _mm_pause();
@ -19356,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
{ {
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
if (ggml_compute_forward_mul_mat_use_blas(node)) {
if (node->src[0]->type != GGML_TYPE_F32) {
// here we need memory for fully dequantized matrix from src0
// take into account that src0 can be broadcasted into src1[2,3]
cur = ggml_type_size(GGML_TYPE_F32)
* node->src[0]->ne[0]*node->src[0]->ne[1]
* node->src[1]->ne[2]*node->src[1]->ne[3];
}
} else
#endif
if (node->src[1]->type != vec_dot_type) { if (node->src[1]->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
} }
@ -22664,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
} }
int ggml_cpu_has_blas(void) { int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) #if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
return 1; return 1;
#else #else
return 0; return 0;