mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-02-28 16:12:46 +01:00
move BLAS to a separate backend (llama/6210)
* move BLAS to a separate backend * rename GGML_USE_OPENBLAS to GGML_USE_BLAS * alloc : reuse same buffer when the same buffer type is used multiple times * set number of threads automatically for openblas and blis * sched : print assignments when GGML_SCHED_DEBUG env variable is set * sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
d8b7a24bc9
commit
174a461fc6
86
ggml-alloc.c
86
ggml-alloc.c
@ -339,6 +339,7 @@ struct hash_node {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct tensor_alloc {
|
struct tensor_alloc {
|
||||||
|
int buffer_id;
|
||||||
size_t offset;
|
size_t offset;
|
||||||
size_t size_max; // 0 = pre-allocated, unused, or view
|
size_t size_max; // 0 = pre-allocated, unused, or view
|
||||||
};
|
};
|
||||||
@ -349,7 +350,6 @@ struct leaf_alloc {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct node_alloc {
|
struct node_alloc {
|
||||||
int buffer_id;
|
|
||||||
struct tensor_alloc dst;
|
struct tensor_alloc dst;
|
||||||
struct tensor_alloc src[GGML_MAX_SRC];
|
struct tensor_alloc src[GGML_MAX_SRC];
|
||||||
};
|
};
|
||||||
@ -386,9 +386,20 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
|
|||||||
for (int i = 0; i < n_bufs; i++) {
|
for (int i = 0; i < n_bufs; i++) {
|
||||||
galloc->bufts[i] = bufts[i];
|
galloc->bufts[i] = bufts[i];
|
||||||
galloc->buffers[i] = NULL;
|
galloc->buffers[i] = NULL;
|
||||||
|
|
||||||
|
// check if the same buffer type is used multiple times and reuse the same allocator
|
||||||
|
for (int j = 0; j < i; j++) {
|
||||||
|
if (bufts[i] == bufts[j]) {
|
||||||
|
galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (galloc->buf_tallocs[i] == NULL) {
|
||||||
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
|
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
|
||||||
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
|
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
galloc->n_buffers = n_bufs;
|
galloc->n_buffers = n_bufs;
|
||||||
|
|
||||||
return galloc;
|
return galloc;
|
||||||
@ -405,12 +416,32 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
|
|||||||
|
|
||||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||||
if (galloc->buffers != NULL) {
|
if (galloc->buffers != NULL) {
|
||||||
|
// skip if already freed
|
||||||
|
bool freed = false;
|
||||||
|
for (int j = 0; j < i; j++) {
|
||||||
|
if (galloc->buffers[j] == galloc->buffers[i]) {
|
||||||
|
freed = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!freed) {
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
if (galloc->buf_tallocs != NULL) {
|
if (galloc->buf_tallocs != NULL) {
|
||||||
|
// skip if already freed
|
||||||
|
bool freed = false;
|
||||||
|
for (int j = 0; j < i; j++) {
|
||||||
|
if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
|
||||||
|
freed = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!freed) {
|
||||||
ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
|
ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
free(galloc->hash_set.keys);
|
free(galloc->hash_set.keys);
|
||||||
free(galloc->hash_values);
|
free(galloc->hash_values);
|
||||||
@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
|
||||||
// graph outputs are never freed
|
// graph outputs are never freed
|
||||||
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
|
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
|
||||||
AT_PRINTF("not freeing output %s\n", node->name);
|
AT_PRINTF("not freeing output %s\n", node->name);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
|
||||||
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
|
||||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
||||||
size_t offset = hn->offset;
|
size_t offset = hn->offset;
|
||||||
|
int buffer_id = hn->buffer_id;
|
||||||
|
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
|
||||||
|
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
|
||||||
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
|
||||||
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
|
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
|
||||||
hn->allocated = false;
|
hn->allocated = false;
|
||||||
@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|||||||
AT_PRINTF("view_src %s: %d children, %d views\n",
|
AT_PRINTF("view_src %s: %d children, %d views\n",
|
||||||
view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
view_src->name, view_src_hn->n_children, view_src_hn->n_views);
|
||||||
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
|
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
|
||||||
ggml_gallocr_free_node(galloc, view_src, buffer_id);
|
ggml_gallocr_free_node(galloc, view_src);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (p_hn->allocated) {
|
else if (p_hn->allocated) {
|
||||||
ggml_gallocr_free_node(galloc, parent, buffer_id);
|
ggml_gallocr_free_node(galloc, parent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
AT_PRINTF("\n");
|
AT_PRINTF("\n");
|
||||||
@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
||||||
node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
|
|
||||||
if (node->view_src || node->data) {
|
if (node->view_src || node->data) {
|
||||||
|
node_alloc->dst.buffer_id = -1;
|
||||||
node_alloc->dst.offset = SIZE_MAX;
|
node_alloc->dst.offset = SIZE_MAX;
|
||||||
node_alloc->dst.size_max = 0;
|
node_alloc->dst.size_max = 0;
|
||||||
} else {
|
} else {
|
||||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
||||||
|
node_alloc->dst.buffer_id = hn->buffer_id;
|
||||||
node_alloc->dst.offset = hn->offset;
|
node_alloc->dst.offset = hn->offset;
|
||||||
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
|
||||||
}
|
}
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
struct ggml_tensor * src = node->src[j];
|
struct ggml_tensor * src = node->src[j];
|
||||||
if (!src || src->view_src || src->data) {
|
if (!src || src->view_src || src->data) {
|
||||||
|
node_alloc->src[j].buffer_id = -1;
|
||||||
node_alloc->src[j].offset = SIZE_MAX;
|
node_alloc->src[j].offset = SIZE_MAX;
|
||||||
node_alloc->src[j].size_max = 0;
|
node_alloc->src[j].size_max = 0;
|
||||||
} else {
|
} else {
|
||||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
|
||||||
|
node_alloc->src[j].buffer_id = hn->buffer_id;
|
||||||
node_alloc->src[j].offset = hn->offset;
|
node_alloc->src[j].offset = hn->offset;
|
||||||
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
|
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
|
||||||
}
|
}
|
||||||
@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
||||||
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
||||||
if (leaf->view_src || leaf->data) {
|
if (leaf->view_src || leaf->data) {
|
||||||
|
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
||||||
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
||||||
galloc->leaf_allocs[i].leaf.size_max = 0;
|
galloc->leaf_allocs[i].leaf.size_max = 0;
|
||||||
} else {
|
} else {
|
||||||
|
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
|
||||||
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
||||||
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
||||||
}
|
}
|
||||||
@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
|
|
||||||
// reallocate buffers if needed
|
// reallocate buffers if needed
|
||||||
for (int i = 0; i < galloc->n_buffers; i++) {
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
||||||
|
// if the buffer type is used multiple times, we reuse the same buffer
|
||||||
|
for (int j = 0; j < i; j++) {
|
||||||
|
if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
|
||||||
|
galloc->buffers[i] = galloc->buffers[j];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
|
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
|
||||||
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
||||||
|
|
||||||
@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||||
if (galloc->buffers[i] == NULL) {
|
if (galloc->buffers[i] == NULL) {
|
||||||
@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
|||||||
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
|
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
|
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
|
||||||
|
int buffer_id = tensor_alloc->buffer_id;
|
||||||
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
|
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
|
||||||
|
|
||||||
if (tensor->view_src != NULL) {
|
if (tensor->view_src != NULL) {
|
||||||
@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
|
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
|
||||||
ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
|
ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
|
||||||
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
|
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
|
||||||
return talloc->size_max >= node_size;
|
return talloc->size_max >= node_size;
|
||||||
}
|
}
|
||||||
@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
struct node_alloc * node_alloc = &galloc->node_allocs[i];
|
||||||
|
|
||||||
if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
|
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
|
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
|
||||||
#endif
|
#endif
|
||||||
@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
|
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
||||||
#endif
|
#endif
|
||||||
@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|||||||
for (int i = 0; i < graph->n_leafs; i++) {
|
for (int i = 0; i < graph->n_leafs; i++) {
|
||||||
struct ggml_tensor * leaf = graph->leafs[i];
|
struct ggml_tensor * leaf = graph->leafs[i];
|
||||||
struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
||||||
ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
|
ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
|
||||||
}
|
}
|
||||||
// nodes
|
// nodes
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|||||||
if (src == NULL) {
|
if (src == NULL) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
|
ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
|
||||||
}
|
}
|
||||||
ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
|
ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
|
|||||||
if (galloc->buffers[buffer_id] == NULL) {
|
if (galloc->buffers[buffer_id] == NULL) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < buffer_id; i++) {
|
||||||
|
if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
|
||||||
|
// this buffer is the same as a previous one due to the same buffer type being used multiple times
|
||||||
|
// only return the buffer size the first time it appears to avoid double counting
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,13 +17,15 @@ extern "C" {
|
|||||||
|
|
||||||
struct ggml_backend_buffer_type_i {
|
struct ggml_backend_buffer_type_i {
|
||||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
||||||
|
// allocate a buffer of this type
|
||||||
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||||
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
// tensor alignment
|
||||||
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
|
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
|
||||||
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
// max buffer size that can be allocated
|
||||||
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
|
||||||
|
// data size needed to allocate the tensor, including padding
|
||||||
|
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||||
// check if tensor data is in host memory
|
// check if tensor data is in host memory
|
||||||
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
|
||||||
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -92,27 +94,37 @@ extern "C" {
|
|||||||
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
||||||
|
|
||||||
// compute graph with a plan (not used currently)
|
// compute graph with a plan (not used currently)
|
||||||
|
// create a new plan for a graph
|
||||||
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||||
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
|
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
||||||
// compute graph with a plan
|
void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
||||||
|
// compute the graph with the plan
|
||||||
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
|
|
||||||
// compute graph without a plan (async)
|
// compute graph without a plan (async)
|
||||||
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
|
||||||
// check if the backend supports an operation
|
// check if the backend can compute an operation
|
||||||
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
|
|
||||||
|
// check if the backend can use tensors allocated in a buffer type
|
||||||
|
bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
||||||
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
||||||
// even if the weight has to be copied from the CPU temporarily
|
// even if the weight has to be copied from the CPU temporarily
|
||||||
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
|
|
||||||
// (optional) event synchronization
|
// (optional) event synchronization
|
||||||
|
// create a new event that can record events on this backend instance
|
||||||
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
||||||
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
||||||
|
// record an event on the backend instance that created it
|
||||||
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
||||||
|
// wait for an event on a different backend instance
|
||||||
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||||
|
// block until an event is recorded
|
||||||
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
240
ggml-backend.c
240
ggml-backend.c
@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf
|
|||||||
return ggml_nbytes(tensor);
|
return ggml_nbytes(tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
||||||
return buft->iface.supports_backend(buft, backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
if (buft->iface.is_host) {
|
if (buft->iface.is_host) {
|
||||||
return buft->iface.is_host(buft);
|
return buft->iface.is_host(buft);
|
||||||
@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
|
|||||||
return backend->iface.supports_op(backend, op);
|
return backend->iface.supports_op(backend, op);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||||
|
return backend->iface.supports_buft(backend, buft);
|
||||||
|
}
|
||||||
|
|
||||||
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
if (backend->iface.offload_op != NULL) {
|
if (backend->iface.offload_op != NULL) {
|
||||||
return backend->iface.offload_op(backend, op);
|
return backend->iface.offload_op(backend, op);
|
||||||
@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
||||||
return ggml_backend_is_cpu(backend);
|
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
|
|||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||||
|
return ggml_backend_buft_is_host(buft);
|
||||||
|
|
||||||
|
GGML_UNUSED(backend);
|
||||||
|
}
|
||||||
|
|
||||||
static struct ggml_backend_i cpu_backend_i = {
|
static struct ggml_backend_i cpu_backend_i = {
|
||||||
/* .get_name = */ ggml_backend_cpu_name,
|
/* .get_name = */ ggml_backend_cpu_name,
|
||||||
/* .free = */ ggml_backend_cpu_free,
|
/* .free = */ ggml_backend_cpu_free,
|
||||||
@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = {
|
|||||||
/* .synchronize = */ NULL,
|
/* .synchronize = */ NULL,
|
||||||
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||||||
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_cpu_supports_buft,
|
||||||
/* .offload_op = */ NULL,
|
/* .offload_op = */ NULL,
|
||||||
/* .event_new = */ NULL,
|
/* .event_new = */ NULL,
|
||||||
/* .event_free = */ NULL,
|
/* .event_free = */ NULL,
|
||||||
@ -1055,6 +1055,9 @@ struct ggml_backend_sched {
|
|||||||
int * node_backend_ids; // [graph_size]
|
int * node_backend_ids; // [graph_size]
|
||||||
int * leaf_backend_ids; // [graph_size]
|
int * leaf_backend_ids; // [graph_size]
|
||||||
|
|
||||||
|
int * prev_node_backend_ids; // [graph_size]
|
||||||
|
int * prev_leaf_backend_ids; // [graph_size]
|
||||||
|
|
||||||
// copy of the graph with modified inputs
|
// copy of the graph with modified inputs
|
||||||
struct ggml_cgraph * graph;
|
struct ggml_cgraph * graph;
|
||||||
|
|
||||||
@ -1075,6 +1078,8 @@ struct ggml_backend_sched {
|
|||||||
ggml_backend_sched_eval_callback callback_eval;
|
ggml_backend_sched_eval_callback callback_eval;
|
||||||
void * callback_eval_user_data;
|
void * callback_eval_user_data;
|
||||||
|
|
||||||
|
bool debug;
|
||||||
|
|
||||||
// align context_buffer to GGML_MEM_ALIGN
|
// align context_buffer to GGML_MEM_ALIGN
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
__declspec(align(GGML_MEM_ALIGN))
|
__declspec(align(GGML_MEM_ALIGN))
|
||||||
@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
|
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
||||||
ggml_backend_buffer_t buffer = tensor->buffer;
|
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||||
if (buffer == NULL) {
|
if (buffer == NULL) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// find highest prio backend that supports the buffer type
|
// find highest prio backend that supports the buffer type and the op
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
for (int i = 0; i < sched->n_backends; i++) {
|
||||||
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
|
if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
|
||||||
|
ggml_backend_supports_op(sched->backends[i], op)) {
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
|
#ifndef NDEBUG
|
||||||
__func__, ggml_backend_buffer_name(buffer), tensor->name);
|
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
||||||
GGML_ASSERT(false);
|
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
||||||
|
#endif
|
||||||
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|||||||
// TODO: use supports_op to check if the backend supports the op
|
// TODO: use supports_op to check if the backend supports the op
|
||||||
|
|
||||||
// assign pre-allocated nodes to their backend
|
// assign pre-allocated nodes to their backend
|
||||||
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
|
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
||||||
if (cur_backend_id != -1) {
|
if (cur_backend_id != -1) {
|
||||||
SET_CAUSE(tensor, "1.dst");
|
SET_CAUSE(tensor, "1.dst");
|
||||||
return cur_backend_id;
|
return cur_backend_id;
|
||||||
@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|||||||
|
|
||||||
// view_src
|
// view_src
|
||||||
if (tensor->view_src != NULL) {
|
if (tensor->view_src != NULL) {
|
||||||
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
|
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
|
||||||
if (cur_backend_id != -1) {
|
if (cur_backend_id != -1) {
|
||||||
SET_CAUSE(tensor, "1.vsrc");
|
SET_CAUSE(tensor, "1.vsrc");
|
||||||
return cur_backend_id;
|
return cur_backend_id;
|
||||||
@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
|
||||||
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
|
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
|
||||||
// check if a backend with higher prio wants to offload the op
|
// check if a backend with higher prio wants to offload the op
|
||||||
if (src_backend_id == sched->n_backends - 1) {
|
if (src_backend_id == sched->n_backends - 1) {
|
||||||
for (int b = 0; b < src_backend_id; b++) {
|
for (int b = 0; b < src_backend_id; b++) {
|
||||||
@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//#define DEBUG_PASS1
|
static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
|
||||||
//#define DEBUG_PASS2
|
ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
|
||||||
//#define DEBUG_PASS3
|
ggml_backend_buffer_type_t buft = NULL;
|
||||||
//#define DEBUG_PASS4
|
|
||||||
|
if (buf) {
|
||||||
|
// the tensor is already allocated
|
||||||
|
buft = buf->buft;
|
||||||
|
} else {
|
||||||
|
// see if the tensor already has a backend assigned, and use the buffer type of that backend
|
||||||
|
int tensor_backend_id = tensor_backend_id(t);
|
||||||
|
if (tensor_backend_id == -1 && t->view_src) {
|
||||||
|
tensor_backend_id = tensor_backend_id(t->view_src);
|
||||||
|
}
|
||||||
|
if (tensor_backend_id != -1) {
|
||||||
|
buft = sched->bufts[tensor_backend_id];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
|
||||||
|
if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
|
||||||
|
*node_backend_id = cur_backend_id;
|
||||||
|
SET_CAUSE(node, "2.sup");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
||||||
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef DEBUG_PASS1
|
|
||||||
fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// pass 2: expand current backend assignments
|
// pass 2: expand current backend assignments
|
||||||
// assign the same backend to adjacent nodes
|
// assign the same backend to adjacent nodes
|
||||||
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
||||||
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
||||||
|
// ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known
|
||||||
|
// expand gpu down
|
||||||
// pass 2.2 expand gpu down
|
|
||||||
{
|
{
|
||||||
int cur_backend_id = -1;
|
int cur_backend_id = -1;
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
} else {
|
} else {
|
||||||
cur_backend_id = *node_backend_id;
|
cur_backend_id = *node_backend_id;
|
||||||
}
|
}
|
||||||
} else {
|
} else if (cur_backend_id != -1) {
|
||||||
*node_backend_id = cur_backend_id;
|
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
||||||
SET_CAUSE(node, "2.2");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// pass 2.1 expand gpu up
|
// expand gpu up
|
||||||
{
|
{
|
||||||
int cur_backend_id = -1;
|
int cur_backend_id = -1;
|
||||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||||
@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
} else {
|
} else {
|
||||||
cur_backend_id = *node_backend_id;
|
cur_backend_id = *node_backend_id;
|
||||||
}
|
}
|
||||||
} else {
|
} else if (cur_backend_id != -1) {
|
||||||
*node_backend_id = cur_backend_id;
|
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
||||||
SET_CAUSE(node, "2.1");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// pass 2.4 expand rest down
|
// expand rest down
|
||||||
{
|
{
|
||||||
int cur_backend_id = -1;
|
int cur_backend_id = -1;
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
int * node_backend_id = &tensor_backend_id(node);
|
int * node_backend_id = &tensor_backend_id(node);
|
||||||
if (*node_backend_id != -1) {
|
if (*node_backend_id != -1) {
|
||||||
cur_backend_id = *node_backend_id;
|
cur_backend_id = *node_backend_id;
|
||||||
} else {
|
} else if (cur_backend_id != -1) {
|
||||||
*node_backend_id = cur_backend_id;
|
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
||||||
SET_CAUSE(node, "2.4");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// pass 2.3 expand rest up
|
// expand rest up
|
||||||
{
|
{
|
||||||
int cur_backend_id = -1;
|
int cur_backend_id = -1;
|
||||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||||
@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
int * node_backend_id = &tensor_backend_id(node);
|
int * node_backend_id = &tensor_backend_id(node);
|
||||||
if (*node_backend_id != -1) {
|
if (*node_backend_id != -1) {
|
||||||
cur_backend_id = *node_backend_id;
|
cur_backend_id = *node_backend_id;
|
||||||
|
} else if (cur_backend_id != -1) {
|
||||||
|
ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pass 3: upgrade nodes to higher prio backends with compatible buffer types
|
||||||
|
// if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
|
||||||
|
// however, we also need to verify that the sources are in compatible buffer types
|
||||||
|
// (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
|
||||||
|
// however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
|
||||||
|
// this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU)
|
||||||
|
// additionally, set remaining unassigned nodes to the backend with the most supported inputs
|
||||||
|
// only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
|
||||||
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
|
if (ggml_is_view_op(node->op)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int * node_backend_id = &tensor_backend_id(node);
|
||||||
|
if (*node_backend_id == -1) {
|
||||||
|
// unassigned node: find the backend with the most supported inputs
|
||||||
|
int n_supported_best = -1;
|
||||||
|
for (int b = 0; b < sched->n_backends; b++) {
|
||||||
|
if (ggml_backend_supports_op(sched->backends[b], node)) {
|
||||||
|
int n_supported = 0;
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * src = node->src[j];
|
||||||
|
if (src == NULL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
|
||||||
|
n_supported++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (n_supported > n_supported_best) {
|
||||||
|
n_supported_best = n_supported;
|
||||||
|
*node_backend_id = b;
|
||||||
|
SET_CAUSE(node, "3.best");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
*node_backend_id = cur_backend_id;
|
// assigned node: upgrade to higher prio backend if possible
|
||||||
SET_CAUSE(node, "2.3");
|
for (int b = 0; b < *node_backend_id; b++) {
|
||||||
|
if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
|
||||||
|
bool supported = true;
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * src = node->src[j];
|
||||||
|
if (src == NULL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
|
||||||
|
supported = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (supported) {
|
||||||
|
*node_backend_id = b;
|
||||||
|
SET_CAUSE(node, "3.upg");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUG_PASS2
|
// pass 4: assign backends to remaining src from dst and view_src
|
||||||
fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// pass 3: assign backends to remaining src from dst and view_src
|
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
int * cur_backend_id = &tensor_backend_id(node);
|
int * cur_backend_id = &tensor_backend_id(node);
|
||||||
if (node->view_src != NULL && *cur_backend_id == -1) {
|
if (node->view_src != NULL && *cur_backend_id == -1) {
|
||||||
*cur_backend_id = tensor_backend_id(node->view_src);
|
*cur_backend_id = tensor_backend_id(node->view_src);
|
||||||
SET_CAUSE(node, "3.vsrc");
|
SET_CAUSE(node, "4.vsrc");
|
||||||
}
|
}
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
struct ggml_tensor * src = node->src[j];
|
struct ggml_tensor * src = node->src[j];
|
||||||
@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
if (src->view_src != NULL) {
|
if (src->view_src != NULL) {
|
||||||
// views are always on the same backend as the source
|
// views are always on the same backend as the source
|
||||||
*src_backend_id = tensor_backend_id(src->view_src);
|
*src_backend_id = tensor_backend_id(src->view_src);
|
||||||
SET_CAUSE(src, "3.vsrc");
|
SET_CAUSE(src, "4.vsrc");
|
||||||
} else {
|
} else {
|
||||||
*src_backend_id = *cur_backend_id;
|
*src_backend_id = *cur_backend_id;
|
||||||
SET_CAUSE(src, "3.cur");
|
SET_CAUSE(src, "4.cur");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef DEBUG_PASS3
|
|
||||||
fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// pass 4: split graph, find tensors that need to be copied
|
// pass 4: split graph, find tensors that need to be copied
|
||||||
{
|
{
|
||||||
@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// check if the split has too many inputs
|
// check if the split has too many inputs
|
||||||
|
// FIXME: count the number of inputs instead of only checking when full
|
||||||
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
|
||||||
const size_t id = hash_id(src);
|
const size_t id = hash_id(src);
|
||||||
int src_backend_id = sched->tensor_backend_id[id];
|
int src_backend_id = sched->tensor_backend_id[id];
|
||||||
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
|
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
||||||
|
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
|
||||||
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
|
||||||
need_new_split = true;
|
need_new_split = true;
|
||||||
break;
|
break;
|
||||||
@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src_backend_id != node_backend_id) {
|
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
|
||||||
|
if (src_backend_id != cur_backend_id && !supported) {
|
||||||
// create a copy of the input in the split's backend
|
// create a copy of the input in the split's backend
|
||||||
const size_t id = hash_id(src);
|
const size_t id = hash_id(src);
|
||||||
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
|
||||||
@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
split->i_end = graph->n_nodes;
|
split->i_end = graph->n_nodes;
|
||||||
sched->n_splits = i_split + 1;
|
sched->n_splits = i_split + 1;
|
||||||
}
|
}
|
||||||
#ifdef DEBUG_PASS4
|
|
||||||
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
|
if (sched->debug) {
|
||||||
#endif
|
ggml_backend_sched_print_assignments(sched, graph);
|
||||||
|
}
|
||||||
|
|
||||||
|
// swap node_backend_ids and leaf_backend_ids and prevs
|
||||||
|
{
|
||||||
|
int * tmp = sched->node_backend_ids;
|
||||||
|
sched->node_backend_ids = sched->prev_node_backend_ids;
|
||||||
|
sched->prev_node_backend_ids = tmp;
|
||||||
|
|
||||||
|
tmp = sched->leaf_backend_ids;
|
||||||
|
sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
|
||||||
|
sched->prev_leaf_backend_ids = tmp;
|
||||||
|
}
|
||||||
|
|
||||||
// create copies of the graph for each split
|
// create copies of the graph for each split
|
||||||
// TODO: avoid this copy
|
// TODO: avoid this copy
|
||||||
@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||||
|
bool backend_ids_changed = false;
|
||||||
|
for (int i = 0; i < sched->graph->n_nodes; i++) {
|
||||||
|
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
|
||||||
|
backend_ids_changed = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!backend_ids_changed) {
|
||||||
|
for (int i = 0; i < sched->graph->n_leafs; i++) {
|
||||||
|
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
|
||||||
|
backend_ids_changed = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// allocate graph
|
// allocate graph
|
||||||
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
|
||||||
// the re-allocation may cause the split inputs to be moved to a different address
|
// the re-allocation may cause the split inputs to be moved to a different address
|
||||||
ggml_backend_sched_synchronize(sched);
|
ggml_backend_sched_synchronize(sched);
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||||||
|
|
||||||
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
||||||
|
|
||||||
|
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
|
||||||
|
|
||||||
// initialize hash table
|
// initialize hash table
|
||||||
sched->hash_set = ggml_hash_set_new(graph_size);
|
sched->hash_set = ggml_hash_set_new(graph_size);
|
||||||
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
||||||
@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||||||
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||||
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
||||||
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
||||||
|
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
|
||||||
|
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
|
||||||
|
|
||||||
sched->n_backends = n_backends;
|
sched->n_backends = n_backends;
|
||||||
|
|
||||||
@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|||||||
for (int b = 0; b < n_backends; b++) {
|
for (int b = 0; b < n_backends; b++) {
|
||||||
sched->backends[b] = backends[b];
|
sched->backends[b] = backends[b];
|
||||||
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
|
||||||
GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
|
GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
|
||||||
if (sched->n_copies > 1) {
|
if (sched->n_copies > 1) {
|
||||||
for (int c = 0; c < sched->n_copies; c++) {
|
for (int c = 0; c < sched->n_copies; c++) {
|
||||||
sched->events[b][c] = ggml_backend_event_new(backends[b]);
|
sched->events[b][c] = ggml_backend_event_new(backends[b]);
|
||||||
@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|||||||
free(sched->tensor_copies);
|
free(sched->tensor_copies);
|
||||||
free(sched->node_backend_ids);
|
free(sched->node_backend_ids);
|
||||||
free(sched->leaf_backend_ids);
|
free(sched->leaf_backend_ids);
|
||||||
|
free(sched->prev_node_backend_ids);
|
||||||
|
free(sched->prev_leaf_backend_ids);
|
||||||
free(sched);
|
free(sched);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
|
|||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||||
tensor_backend_id(node) = backend_index;
|
tensor_backend_id(node) = backend_index;
|
||||||
|
SET_CAUSE(node, "usr");
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
||||||
|
@ -23,7 +23,6 @@ extern "C" {
|
|||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
|
||||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
// buffer
|
// buffer
|
||||||
@ -74,6 +73,7 @@ extern "C" {
|
|||||||
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
|
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||||
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
|
|
||||||
// tensor copy between different backends
|
// tensor copy between different backends
|
||||||
@ -90,7 +90,7 @@ extern "C" {
|
|||||||
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
|
||||||
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
|
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
|
||||||
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
|
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
|
||||||
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
|
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
|
||||||
|
|
||||||
//
|
//
|
||||||
// CPU backend
|
// CPU backend
|
||||||
@ -119,7 +119,7 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API size_t ggml_backend_reg_get_count(void);
|
GGML_API size_t ggml_backend_reg_get_count(void);
|
||||||
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
|
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
|
||||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
|
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
|
||||||
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
||||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
|
||||||
|
44
ggml-cuda.cu
44
ggml-cuda.cu
@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu
|
|||||||
return ctx->name.c_str();
|
return ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
|
||||||
|
return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||||
|
|
||||||
@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
||||||
if (!ggml_backend_is_cuda(backend)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
|
||||||
|
|
||||||
return buft_ctx->device == cuda_ctx->device;
|
|
||||||
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||||
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -863,6 +855,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
|
||||||
|
return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||||
// instead, we allocate them for each tensor separately in init_tensor
|
// instead, we allocate them for each tensor separately in init_tensor
|
||||||
@ -906,12 +902,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_
|
|||||||
return total_size;
|
return total_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
|
||||||
return ggml_backend_is_cuda(backend);
|
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -924,7 +914,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
|
|||||||
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1024,7 +1013,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ nullptr,
|
/* .context = */ nullptr,
|
||||||
@ -2879,6 +2867,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||||
|
if (ggml_backend_buft_is_cuda_split(buft)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ggml_backend_buft_is_cuda(buft)) {
|
||||||
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||||
|
return buft_ctx->device == cuda_ctx->device;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||||
const int min_batch_size = 32;
|
const int min_batch_size = 32;
|
||||||
|
|
||||||
@ -2951,9 +2953,11 @@ static ggml_backend_i ggml_backend_cuda_interface = {
|
|||||||
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
||||||
/* .graph_plan_create = */ NULL,
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
/* .supports_op = */ ggml_backend_cuda_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_cuda_supports_buft,
|
||||||
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
/* .offload_op = */ ggml_backend_cuda_offload_op,
|
||||||
/* .event_new = */ ggml_backend_cuda_event_new,
|
/* .event_new = */ ggml_backend_cuda_event_new,
|
||||||
/* .event_free = */ ggml_backend_cuda_event_free,
|
/* .event_free = */ ggml_backend_cuda_event_free,
|
||||||
|
@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_
|
|||||||
return ctx->max_alloc;
|
return ctx->max_alloc;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Buffer-type side compatibility check: a Kompute buffer type is usable by
// `backend` exactly when `backend` is a Kompute backend. The buffer type
// itself carries no information that is consulted here.
static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    const bool backend_is_kompute = ggml_backend_is_kompute(backend);

    GGML_UNUSED(buft);
    return backend_is_kompute;
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
|
||||||
/* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
|
/* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
|
|||||||
return ggml_vk_supports_op(op);
|
return ggml_vk_supports_op(op);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether the Kompute backend can use buffers of type `buft`.
// A buffer type is recognised as a Kompute one by comparing its `get_name`
// callback with the Kompute buffer type's `get_name` function; the backend
// instance is not consulted.
static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    const bool is_kompute_buft =
        buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;

    GGML_UNUSED(backend);
    return is_kompute_buft;
}
|
||||||
|
|
||||||
static struct ggml_backend_i kompute_backend_i = {
|
static struct ggml_backend_i kompute_backend_i = {
|
||||||
/* .get_name = */ ggml_backend_kompute_name,
|
/* .get_name = */ ggml_backend_kompute_name,
|
||||||
/* .free = */ ggml_backend_kompute_free,
|
/* .free = */ ggml_backend_kompute_free,
|
||||||
@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = {
|
|||||||
/* .synchronize = */ NULL,
|
/* .synchronize = */ NULL,
|
||||||
/* .graph_plan_create = */ NULL,
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
|
||||||
/* .offload_op = */ NULL,
|
/* .offload_op = */ NULL,
|
||||||
/* .event_new = */ NULL,
|
/* .event_new = */ NULL,
|
||||||
/* .event_free = */ NULL,
|
/* .event_free = */ NULL,
|
||||||
|
15
ggml-metal.m
15
ggml-metal.m
@ -3044,12 +3044,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend
|
|||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Buffer-type side compatibility check: Metal buffers are accepted by both
// the Metal backend and the CPU backend. `buft` is not consulted.
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    UNUSED(buft);

    if (ggml_backend_is_metal(backend)) {
        return true;
    }

    return ggml_backend_is_cpu(backend);
}
|
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
@ -3064,7 +3058,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|||||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
|
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
@ -3179,6 +3172,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con
|
|||||||
return ggml_metal_supports_op(metal_ctx, op);
|
return ggml_metal_supports_op(metal_ctx, op);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether the Metal backend can use buffers of type `buft`.
// A buffer type is recognised as a Metal one by comparing its `get_name`
// callback with the Metal buffer type's `get_name` function; the backend
// instance is not consulted.
GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    UNUSED(backend);

    const bool is_metal_buft =
        buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;

    return is_metal_buft;
}
|
||||||
|
|
||||||
static struct ggml_backend_i ggml_backend_metal_i = {
|
static struct ggml_backend_i ggml_backend_metal_i = {
|
||||||
/* .get_name = */ ggml_backend_metal_name,
|
/* .get_name = */ ggml_backend_metal_name,
|
||||||
/* .free = */ ggml_backend_metal_free,
|
/* .free = */ ggml_backend_metal_free,
|
||||||
@ -3189,9 +3188,11 @@ static struct ggml_backend_i ggml_backend_metal_i = {
|
|||||||
/* .synchronize = */ NULL,
|
/* .synchronize = */ NULL,
|
||||||
/* .graph_plan_create = */ NULL,
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_metal_supports_op,
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_metal_supports_buft,
|
||||||
/* .offload_op = */ NULL,
|
/* .offload_op = */ NULL,
|
||||||
/* .event_new = */ NULL,
|
/* .event_new = */ NULL,
|
||||||
/* .event_free = */ NULL,
|
/* .event_free = */ NULL,
|
||||||
|
21
ggml-rpc.cpp
21
ggml-rpc.cpp
@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend
|
|||||||
return ggml_nbytes(tensor);
|
return ggml_nbytes(tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Buffer-type side compatibility check: an RPC buffer type is usable by
// `backend` only when `backend` is an RPC backend whose endpoint equals the
// endpoint recorded in the buffer type's context.
GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    if (ggml_backend_is_rpc(backend)) {
        // backend->context may only be read as an RPC context after the check above
        const ggml_backend_rpc_buffer_type_context * buffer_type_ctx =
            (const ggml_backend_rpc_buffer_type_context *) buft->context;
        const ggml_backend_rpc_context * backend_ctx =
            (const ggml_backend_rpc_context *) backend->context;

        return buffer_type_ctx->endpoint == backend_ctx->endpoint;
    }

    return false;
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
||||||
/* .get_name = */ ggml_backend_rpc_buffer_type_name,
|
/* .get_name = */ ggml_backend_rpc_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ ggml_backend_rpc_get_max_size,
|
/* .get_max_size = */ ggml_backend_rpc_get_max_size,
|
||||||
/* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether this RPC backend can use buffers of type `buft`.
//
// Returns true only when `buft` is an RPC buffer type (identified by its
// `get_name` callback) and the endpoint stored in its context equals the
// endpoint of `backend`. Every other buffer type is rejected.
//
// Fix: the guard previously used `==`, which rejected every RPC buffer type
// and then reinterpreted the context of non-RPC buffer types as an RPC
// buffer type context. The comparison is now `!=`, matching the SYCL and
// Vulkan implementations of the same hook.
GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    // must bail out before touching buft->context: only RPC buffer types
    // store a ggml_backend_rpc_buffer_type_context there
    if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
        return false;
    }

    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;

    // same server endpoint => the buffer lives where this backend computes
    return buft_ctx->endpoint == rpc_ctx->endpoint;
}
|
||||||
|
|
||||||
static ggml_backend_i ggml_backend_rpc_interface = {
|
static ggml_backend_i ggml_backend_rpc_interface = {
|
||||||
/* .get_name = */ ggml_backend_rpc_name,
|
/* .get_name = */ ggml_backend_rpc_name,
|
||||||
/* .free = */ ggml_backend_rpc_free,
|
/* .free = */ ggml_backend_rpc_free,
|
||||||
@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|||||||
/* .synchronize = */ ggml_backend_rpc_synchronize,
|
/* .synchronize = */ ggml_backend_rpc_synchronize,
|
||||||
/* .graph_plan_create = */ NULL,
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_rpc_supports_op,
|
/* .supports_op = */ ggml_backend_rpc_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_rpc_supports_buft,
|
||||||
/* .offload_op = */ NULL,
|
/* .offload_op = */ NULL,
|
||||||
/* .event_new = */ NULL,
|
/* .event_new = */ NULL,
|
||||||
/* .event_free = */ NULL,
|
/* .event_free = */ NULL,
|
||||||
|
@ -16575,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen
|
|||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Buffer-type side compatibility check: a SYCL buffer type is usable by
// `backend` only when `backend` is a SYCL backend whose device equals the
// device recorded in the buffer type's context.
GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    if (ggml_backend_is_sycl(backend)) {
        // backend->context may only be read as a SYCL context after the check above
        const ggml_backend_sycl_buffer_type_context * buffer_type_ctx =
            (const ggml_backend_sycl_buffer_type_context *) buft->context;
        const ggml_backend_sycl_context * backend_ctx =
            (const ggml_backend_sycl_context *) backend->context;

        return buffer_type_ctx->device == backend_ctx->device;
    }

    return false;
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
||||||
/* .get_name = */ ggml_backend_sycl_buffer_type_name,
|
/* .get_name = */ ggml_backend_sycl_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
|
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
|
||||||
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ nullptr,
|
/* .is_host = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -16942,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
|
|||||||
return total_size;
|
return total_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Buffer-type side compatibility check for SYCL split buffers: any SYCL
// backend is accepted. `buft` is not consulted.
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    UNUSED(buft);

    const bool backend_is_sycl = ggml_backend_is_sycl(backend);
    return backend_is_sycl;
}
|
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -16960,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
|
|||||||
/* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
|
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -17046,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||||
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ nullptr,
|
/* .context = */ nullptr,
|
||||||
@ -17311,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const
|
|||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether this SYCL backend can use buffers of type `buft`.
// Only buffer types whose `get_name` callback is the SYCL buffer type's are
// considered; among those, the buffer type's device must equal the device
// stored in the backend context.
GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    const bool is_sycl_buft =
        buft->iface.get_name == ggml_backend_sycl_buffer_type_name;

    if (is_sycl_buft) {
        // buft->context may only be read as a SYCL buffer type context here
        const ggml_backend_sycl_buffer_type_context * buffer_type_ctx =
            (const ggml_backend_sycl_buffer_type_context *) buft->context;
        const ggml_backend_sycl_context * backend_ctx =
            (const ggml_backend_sycl_context *) backend->context;

        return buffer_type_ctx->device == backend_ctx->device;
    }

    return false;
}
|
||||||
|
|
||||||
static ggml_backend_i ggml_backend_sycl_interface = {
|
static ggml_backend_i ggml_backend_sycl_interface = {
|
||||||
/* .get_name = */ ggml_backend_sycl_name,
|
/* .get_name = */ ggml_backend_sycl_name,
|
||||||
@ -17322,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|||||||
/* .synchronize = */ ggml_backend_sycl_synchronize,
|
/* .synchronize = */ ggml_backend_sycl_synchronize,
|
||||||
/* .graph_plan_create = */ NULL,
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_sycl_supports_buft,
|
||||||
/* .offload_op = */ ggml_backend_sycl_offload_op,
|
/* .offload_op = */ ggml_backend_sycl_offload_op,
|
||||||
/* .event_new = */ NULL,
|
/* .event_new = */ NULL,
|
||||||
/* .event_free = */ NULL,
|
/* .event_free = */ NULL,
|
||||||
|
@ -6142,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_
|
|||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Buffer-type side compatibility check: a Vulkan buffer type is usable by
// `backend` only when `backend` is a Vulkan backend and the `idx` of the
// context referenced by the buffer type equals the backend context's `idx`.
GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    if (ggml_backend_is_vk(backend)) {
        // backend->context may only be read as a Vulkan context after the check above
        const ggml_backend_vk_buffer_type_context * buffer_type_ctx =
            (const ggml_backend_vk_buffer_type_context *) buft->context;
        const ggml_backend_vk_context * backend_ctx =
            (const ggml_backend_vk_context *) backend->context;

        return buffer_type_ctx->ctx->idx == backend_ctx->idx;
    }

    return false;
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
||||||
/* .get_name = */ ggml_backend_vk_buffer_type_name,
|
/* .get_name = */ ggml_backend_vk_buffer_type_name,
|
||||||
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
|
||||||
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
|
|
||||||
/* .is_host = */ NULL,
|
/* .is_host = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -6235,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|||||||
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
|
||||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ nullptr,
|
/* .context = */ nullptr,
|
||||||
@ -6551,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g
|
|||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reports whether this Vulkan backend can use buffers of type `buft`.
// Only buffer types whose `get_name` callback is the Vulkan buffer type's
// are considered; among those, the `idx` of the context referenced by the
// buffer type must equal the backend context's `idx`.
GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    const bool is_vk_buft =
        buft->iface.get_name == ggml_backend_vk_buffer_type_name;

    if (is_vk_buft) {
        // buft->context may only be read as a Vulkan buffer type context here
        const ggml_backend_vk_buffer_type_context * buffer_type_ctx =
            (const ggml_backend_vk_buffer_type_context *) buft->context;
        const ggml_backend_vk_context * backend_ctx =
            (const ggml_backend_vk_context *) backend->context;

        return buffer_type_ctx->ctx->idx == backend_ctx->idx;
    }

    return false;
}
|
||||||
|
|
||||||
// TODO: enable async and synchronize
|
// TODO: enable async and synchronize
|
||||||
static ggml_backend_i ggml_backend_vk_interface = {
|
static ggml_backend_i ggml_backend_vk_interface = {
|
||||||
/* .get_name = */ ggml_backend_vk_name,
|
/* .get_name = */ ggml_backend_vk_name,
|
||||||
@ -6562,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|||||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||||
/* .graph_plan_create = */ NULL,
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_vk_supports_op,
|
/* .supports_op = */ ggml_backend_vk_supports_op,
|
||||||
|
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
||||||
/* .offload_op = */ ggml_backend_vk_offload_op,
|
/* .offload_op = */ ggml_backend_vk_offload_op,
|
||||||
/* .event_new = */ NULL,
|
/* .event_new = */ NULL,
|
||||||
/* .event_free = */ NULL,
|
/* .event_free = */ NULL,
|
||||||
|
197
ggml.c
197
ggml.c
@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE)
|
#if defined(GGML_USE_ACCELERATE)
|
||||||
#include <Accelerate/Accelerate.h>
|
#include <Accelerate/Accelerate.h>
|
||||||
#elif defined(GGML_USE_OPENBLAS)
|
|
||||||
#if defined(GGML_BLAS_USE_MKL)
|
|
||||||
#include <mkl.h>
|
|
||||||
#else
|
|
||||||
#include <cblas.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// floating point type used to accumulate sums
|
// floating point type used to accumulate sums
|
||||||
@ -12179,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
|
|||||||
|
|
||||||
// ggml_compute_forward_mul_mat
|
// ggml_compute_forward_mul_mat
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
||||||
// helper function to determine if it is better to use BLAS or not
|
|
||||||
// for large matrices, BLAS is faster
|
|
||||||
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
|
||||||
const struct ggml_tensor * src0 = dst->src[0];
|
|
||||||
const struct ggml_tensor * src1 = dst->src[1];
|
|
||||||
|
|
||||||
//const int64_t ne00 = src0->ne[0];
|
|
||||||
//const int64_t ne01 = src0->ne[1];
|
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
|
||||||
|
|
||||||
const int64_t ne0 = dst->ne[0];
|
|
||||||
const int64_t ne1 = dst->ne[1];
|
|
||||||
|
|
||||||
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
|
|
||||||
// all the experts for each batch element and the processing would become incredibly slow
|
|
||||||
// TODO: find the optimal values for these
|
|
||||||
if (dst->op != GGML_OP_MUL_MAT_ID &&
|
|
||||||
ggml_is_contiguous(src0) &&
|
|
||||||
ggml_is_contiguous(src1) &&
|
|
||||||
//src0->type == GGML_TYPE_F32 &&
|
|
||||||
src1->type == GGML_TYPE_F32 &&
|
|
||||||
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
|
||||||
|
|
||||||
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void ggml_compute_forward_mul_mat_one_chunk(
|
static void ggml_compute_forward_mul_mat_one_chunk(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
struct ggml_tensor * dst,
|
struct ggml_tensor * dst,
|
||||||
@ -12349,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
|
|||||||
// nb01 >= nb00 - src0 is not transposed
|
// nb01 >= nb00 - src0 is not transposed
|
||||||
// compute by src0 rows
|
// compute by src0 rows
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
|
||||||
const int64_t ne_plane = ne01*ne00;
|
|
||||||
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
|
||||||
UNUSED(desired_wsize);
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
||||||
if (type != GGML_TYPE_F32) {
|
|
||||||
assert(params->wsize >= desired_wsize);
|
|
||||||
// parallelize by src0 rows
|
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
|
||||||
// broadcast src0 into src1 across 2nd,3rd dimension
|
|
||||||
const int64_t i03 = i13/r3;
|
|
||||||
const int64_t i02 = i12/r2;
|
|
||||||
|
|
||||||
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
|
||||||
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
|
||||||
ggml_to_float_t const to_float = type_traits[type].to_float;
|
|
||||||
|
|
||||||
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
|
||||||
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// perform sgemm, parallelization controlled by blas lib
|
|
||||||
if (ith != 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
//const int64_t tgemm0 = ggml_perf_time_us();
|
|
||||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
|
||||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
|
||||||
const int64_t i03 = i13/r3;
|
|
||||||
const int64_t i02 = i12/r2;
|
|
||||||
|
|
||||||
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
|
||||||
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
|
|
||||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
|
||||||
|
|
||||||
if (type != GGML_TYPE_F32) {
|
|
||||||
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
|
||||||
}
|
|
||||||
|
|
||||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
|
||||||
ne1, ne01, ne10,
|
|
||||||
1.0f, y, ne10,
|
|
||||||
x, ne00,
|
|
||||||
0.0f, d, ne01);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
|
||||||
|
|
||||||
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if GGML_USE_LLAMAFILE
|
#if GGML_USE_LLAMAFILE
|
||||||
const bool src1_cont = ggml_is_contiguous(src1);
|
const bool src1_cont = ggml_is_contiguous(src1);
|
||||||
|
|
||||||
@ -12796,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
|
|||||||
// nb01 >= nb00 - src0 is not transposed
|
// nb01 >= nb00 - src0 is not transposed
|
||||||
// compute by src0 rows
|
// compute by src0 rows
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
||||||
bool use_blas = ggml_is_matrix(src0) &&
|
|
||||||
ggml_is_matrix(src1) &&
|
|
||||||
ggml_is_contiguous(src0) &&
|
|
||||||
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
|
||||||
if (use_blas) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if (ith != 0) {
|
if (ith != 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -12820,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
||||||
if (use_blas) {
|
|
||||||
if (params->ith != 0) { // All threads other than the first do no work.
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
|
||||||
// src0: (k,n)
|
|
||||||
// src1: (k,m)
|
|
||||||
// dst: (m,n)
|
|
||||||
//
|
|
||||||
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
|
||||||
// Also expressed as (major,minor)
|
|
||||||
// a: (m,k): so src1 transposed
|
|
||||||
// b: (k,n): so src0
|
|
||||||
// c: (m,n)
|
|
||||||
//
|
|
||||||
// However, if ggml_is_transposed(src1) is true, then
|
|
||||||
// src1->data already contains a transposed version, so sgemm mustn't
|
|
||||||
// transpose it further.
|
|
||||||
|
|
||||||
int n = src0->ne[0];
|
|
||||||
int k = src0->ne[1];
|
|
||||||
int m = src1->ne[0];
|
|
||||||
|
|
||||||
int transposeA, lda;
|
|
||||||
|
|
||||||
if (!ggml_is_transposed(src1)) {
|
|
||||||
transposeA = CblasTrans;
|
|
||||||
lda = m;
|
|
||||||
} else {
|
|
||||||
transposeA = CblasNoTrans;
|
|
||||||
lda = k;
|
|
||||||
}
|
|
||||||
|
|
||||||
float * a = (float *) ((char *) src1->data);
|
|
||||||
float * b = (float *) ((char *) src0->data);
|
|
||||||
float * c = (float *) ((char *) dst->data);
|
|
||||||
|
|
||||||
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// dst[:,:,:,:] = 0
|
// dst[:,:,:,:] = 0
|
||||||
// for i2,i3:
|
// for i2,i3:
|
||||||
// for i1:
|
// for i1:
|
||||||
@ -12993,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|||||||
// nb01 >= nb00 - src0 is not transposed
|
// nb01 >= nb00 - src0 is not transposed
|
||||||
// compute by src0 rows
|
// compute by src0 rows
|
||||||
|
|
||||||
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||||
if (ith != 0) {
|
if (ith != 0) {
|
||||||
return;
|
return;
|
||||||
@ -13391,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
|
|||||||
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
||||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||||
|
|
||||||
|
assert(i01 >= 0 && i01 < ne01);
|
||||||
|
|
||||||
dequantize_row_q(
|
dequantize_row_q(
|
||||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||||
@ -13434,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
|
|||||||
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
||||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||||
|
|
||||||
|
assert(i01 >= 0 && i01 < ne01);
|
||||||
|
|
||||||
ggml_fp16_to_fp32_row(
|
ggml_fp16_to_fp32_row(
|
||||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||||
@ -13477,6 +13317,8 @@ static void ggml_compute_forward_get_rows_bf16(
|
|||||||
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
||||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||||
|
|
||||||
|
assert(i01 >= 0 && i01 < ne01);
|
||||||
|
|
||||||
ggml_bf16_to_fp32_row(
|
ggml_bf16_to_fp32_row(
|
||||||
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
||||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
||||||
@ -13520,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
|
|||||||
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
||||||
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
||||||
|
|
||||||
|
assert(i01 >= 0 && i01 < ne01);
|
||||||
|
|
||||||
ggml_vec_cpy_f32(nc,
|
ggml_vec_cpy_f32(nc,
|
||||||
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
||||||
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
||||||
@ -18893,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|||||||
switch (node->op) {
|
switch (node->op) {
|
||||||
case GGML_OP_CPY:
|
case GGML_OP_CPY:
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
|
case GGML_OP_CONT:
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
case GGML_OP_ADD1:
|
case GGML_OP_ADD1:
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
@ -18977,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|||||||
} break;
|
} break;
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
case GGML_OP_SET:
|
case GGML_OP_SET:
|
||||||
case GGML_OP_CONT:
|
|
||||||
case GGML_OP_RESHAPE:
|
case GGML_OP_RESHAPE:
|
||||||
case GGML_OP_VIEW:
|
case GGML_OP_VIEW:
|
||||||
case GGML_OP_PERMUTE:
|
case GGML_OP_PERMUTE:
|
||||||
@ -19138,7 +18982,10 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
|
|||||||
}
|
}
|
||||||
|
|
||||||
*node_n = atomic_load(&state->shared->node_n);
|
*node_n = atomic_load(&state->shared->node_n);
|
||||||
if (* node_n != last_node_n) break;
|
if (*node_n != last_node_n) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(__SSE3__)
|
#if defined(__SSE3__)
|
||||||
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
||||||
_mm_pause();
|
_mm_pause();
|
||||||
@ -19156,7 +19003,10 @@ static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_co
|
|||||||
}
|
}
|
||||||
|
|
||||||
*task_phase = atomic_load(&state->shared->node_task);
|
*task_phase = atomic_load(&state->shared->node_task);
|
||||||
if (* task_phase != last_task_phase) break;
|
if (*task_phase != last_task_phase) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(__SSE3__)
|
#if defined(__SSE3__)
|
||||||
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
||||||
_mm_pause();
|
_mm_pause();
|
||||||
@ -19356,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|||||||
{
|
{
|
||||||
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
||||||
|
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
||||||
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
|
||||||
if (node->src[0]->type != GGML_TYPE_F32) {
|
|
||||||
// here we need memory for fully dequantized matrix from src0
|
|
||||||
// take into account that src0 can be broadcasted into src1[2,3]
|
|
||||||
cur = ggml_type_size(GGML_TYPE_F32)
|
|
||||||
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
|
||||||
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
|
||||||
}
|
|
||||||
} else
|
|
||||||
#endif
|
|
||||||
if (node->src[1]->type != vec_dot_type) {
|
if (node->src[1]->type != vec_dot_type) {
|
||||||
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
||||||
}
|
}
|
||||||
@ -22664,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_blas(void) {
|
int ggml_cpu_has_blas(void) {
|
||||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
||||||
return 1;
|
return 1;
|
||||||
#else
|
#else
|
||||||
return 0;
|
return 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user