ggml : try to improve threading

This commit is contained in:
Georgi Gerganov 2022-12-26 18:51:35 +02:00
parent 4e0b2069e7
commit 4e6d2e98ab
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
2 changed files with 340 additions and 167 deletions

495
ggml.c
View File

@ -303,6 +303,91 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif
//
// thread data
//
// synchronization is done via busy loops
// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
//
#ifdef __APPLE__
//#include <os/lock.h>
//typedef os_unfair_lock ggml_lock_t;
//
//#define ggml_lock_init(x) UNUSED(x)
//#define ggml_lock_destroy(x) UNUSED(x)
//#define ggml_lock_lock os_unfair_lock_lock
//#define ggml_lock_unlock os_unfair_lock_unlock
//
//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
//typedef int ggml_lock_t;
//
//#define ggml_lock_init(x) UNUSED(x)
//#define ggml_lock_destroy(x) UNUSED(x)
//#define ggml_lock_lock(x) UNUSED(x)
//#define ggml_lock_unlock(x) UNUSED(x)
//
//#define GGML_LOCK_INITIALIZER 0
typedef pthread_mutex_t ggml_lock_t;
typedef pthread_cond_t ggml_cond_t;
#define ggml_lock_init(x) pthread_mutex_init(x, NULL)
#define ggml_lock_destroy(x) pthread_mutex_destroy(x)
#define ggml_lock_lock pthread_mutex_lock
#define ggml_lock_unlock pthread_mutex_unlock
#define ggml_cond_init(x) pthread_cond_init(x, NULL)
#define ggml_cond_destroy(x) pthread_cond_destroy(x)
#define ggml_cond_wait pthread_cond_wait
#define ggml_cond_broadcast pthread_cond_broadcast
#define GGML_LOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define GGML_COND_INITIALIZER PTHREAD_COND_INITIALIZER
#else
//typedef pthread_spinlock_t ggml_lock_t;
//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
//#define ggml_lock_destroy pthread_spin_destroy
//#define ggml_lock_lock pthread_spin_lock
//#define ggml_lock_unlock pthread_spin_unlock
//typedef int ggml_lock_t;
//
//#define ggml_lock_init(x) UNUSED(x)
//#define ggml_lock_destroy(x) UNUSED(x)
//#define ggml_lock_lock(x) UNUSED(x)
//#define ggml_lock_unlock(x) UNUSED(x)
//
//#define GGML_LOCK_INITIALIZER 0
typedef pthread_mutex_t ggml_lock_t;
typedef pthread_cond_t ggml_cond_t;
#define ggml_lock_init(x) pthread_mutex_init(x, NULL)
#define ggml_lock_destroy(x) pthread_mutex_destroy(x)
#define ggml_lock_lock pthread_mutex_lock
#define ggml_lock_unlock pthread_mutex_unlock
#define ggml_cond_init(x) pthread_cond_init(x, NULL)
#define ggml_cond_destroy(x) pthread_cond_destroy(x)
#define ggml_cond_wait pthread_cond_wait
#define ggml_cond_broadcast pthread_cond_broadcast
#define GGML_LOCK_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define GGML_COND_INITIALIZER PTHREAD_COND_INITIALIZER
#endif
typedef pthread_t ggml_thread_t;
#define ggml_thread_create pthread_create
#define ggml_thread_join pthread_join
//
// cache line
//
@ -1205,12 +1290,38 @@ struct ggml_compute_params {
void * wdata;
};
// forward declaration: the shared state is defined further down in the file
struct ggml_compute_state_shared;

// per-worker-thread compute state; one instance per worker in a thread pool
struct ggml_compute_state {
ggml_thread_t thrd; // worker thread handle
struct ggml_compute_params params; // task parameters for this worker (type, ith, nth, work buffer)
struct ggml_tensor * node; // tensor currently assigned to this worker; NULL when idle
struct ggml_compute_state_shared * shared; // back-pointer to state shared by all workers of one computation
};
// reusable pool of worker threads; pools live in the global g_state and are
// acquired/released by graph computation under the global critical section
struct ggml_thread_pool {
bool is_used; // true while a graph computation owns this pool
int n_threads; // number of worker threads created so far in this pool
int n_ready; // workers currently blocked on `cond` waiting for work (guarded by `lock`)
atomic_bool stop; // when set, workers exit their loop instead of waiting for more work
ggml_lock_t lock; // mutex protecting n_ready and pairing with `cond`
ggml_cond_t cond; // condition variable the workers wait on for new work
struct ggml_compute_state states[GGML_MAX_THREADS]; // per-worker states; NOTE(review): only the first (graph n_threads - 1) entries appear to be used — confirm
};
//
// ggml state
//

// process-wide global state: the fixed pools of contexts and worker thread pools
struct ggml_state {
struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; // slots for up to GGML_MAX_CONTEXTS contexts
struct ggml_thread_pool th_pools[GGML_MAX_THREAD_POOLS]; // slots for up to GGML_MAX_THREAD_POOLS thread pools
};
// global state
@ -1393,12 +1504,25 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
g_state = (struct ggml_state) {
/*.contexts =*/ { 0 },
/*.th_pools =*/ { 0 },
};
for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
g_state.contexts[i].used = false;
}
for (int i = 0; i < GGML_MAX_THREAD_POOLS; ++i) {
g_state.th_pools[i] = (struct ggml_thread_pool) {
/*.is_used =*/ false,
/*.n_threads =*/ 0,
/*.n_ready =*/ 0,
/*.stop =*/ false,
/*.lock =*/ GGML_LOCK_INITIALIZER,
/*.cond =*/ GGML_COND_INITIALIZER,
/*.states =*/ { 0 },
};
}
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
@ -1450,7 +1574,8 @@ void ggml_free(struct ggml_context * ctx) {
// make this function thread safe
ggml_critical_section_start();
bool found = false;
bool found = false;
bool is_last = true;
for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
if (&g_state.contexts[i].context == ctx) {
@ -1464,12 +1589,16 @@ void ggml_free(struct ggml_context * ctx) {
}
found = true;
break;
} else if (g_state.contexts[i].used) {
is_last = false;
}
}
if (!found) {
GGML_PRINT_DEBUG("%s: context not found\n", __func__);
} else if (is_last) {
// stop all threads
fprintf(stderr, "%s: stopping all threads XXXXXXXXX\n", __func__);
}
ggml_critical_section_end();
@ -6858,126 +6987,91 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg
}
//
// thread data
// compute
//
// synchronization is done via busy loops
// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
//
#ifdef __APPLE__
//#include <os/lock.h>
//typedef os_unfair_lock ggml_lock_t;
//
//#define ggml_lock_init(x) UNUSED(x)
//#define ggml_lock_destroy(x) UNUSED(x)
//#define ggml_lock_lock os_unfair_lock_lock
//#define ggml_lock_unlock os_unfair_lock_unlock
//
//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
typedef int ggml_lock_t;
#define ggml_lock_init(x) UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#define ggml_lock_lock(x) UNUSED(x)
#define ggml_lock_unlock(x) UNUSED(x)
#define GGML_LOCK_INITIALIZER 0
typedef pthread_t ggml_thread_t;
#define ggml_thread_create pthread_create
#define ggml_thread_join pthread_join
#else
//typedef pthread_spinlock_t ggml_lock_t;
//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
//#define ggml_lock_destroy pthread_spin_destroy
//#define ggml_lock_lock pthread_spin_lock
//#define ggml_lock_unlock pthread_spin_unlock
typedef int ggml_lock_t;
#define ggml_lock_init(x) UNUSED(x)
#define ggml_lock_destroy(x) UNUSED(x)
#define ggml_lock_lock(x) UNUSED(x)
#define ggml_lock_unlock(x) UNUSED(x)
#define GGML_LOCK_INITIALIZER 0
typedef pthread_t ggml_thread_t;
#define ggml_thread_create pthread_create
#define ggml_thread_join pthread_join
#endif
struct ggml_compute_state_shared {
ggml_lock_t spin;
int n_threads;
// synchronization primitives
ggml_lock_t spin;
atomic_int n_ready;
atomic_bool has_work;
atomic_bool stop; // stop all threads
struct ggml_thread_pool * th_pool;
};
struct ggml_compute_state {
ggml_thread_t thrd;
struct ggml_compute_params params;
struct ggml_tensor * node;
struct ggml_compute_state_shared * shared;
};
// function used by each compute thread
static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
const int n_threads = state->shared->n_threads;
while (true) {
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
atomic_store(&state->shared->has_work, false);
} else {
while (atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
return 0;
ggml_lock_lock(&state->shared->th_pool->lock);
state->shared->th_pool->n_ready++;
ggml_cond_wait(&state->shared->th_pool->cond, &state->shared->th_pool->lock);
state->shared->th_pool->n_ready--;
ggml_lock_unlock(&state->shared->th_pool->lock);
if (state->shared->th_pool->stop) {
break;
}
const int n_threads = state->shared->n_threads;
if (state->params.ith > n_threads) {
continue;
}
while (true) {
// spin lock
{
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
atomic_store(&state->shared->has_work, false);
} else {
while (atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
break;
}
//ggml_lock_lock (&state->shared->spin);
//ggml_lock_unlock(&state->shared->spin);
}
if (atomic_load(&state->shared->stop)) {
break;
}
}
ggml_lock_lock (&state->shared->spin);
ggml_lock_unlock(&state->shared->spin);
atomic_fetch_sub(&state->shared->n_ready, 1);
}
}
atomic_fetch_sub(&state->shared->n_ready, 1);
// spin lock
{
// wait for work
while (!atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
break;
}
//ggml_lock_lock (&state->shared->spin);
//ggml_lock_unlock(&state->shared->spin);
}
// wait for work
while (!atomic_load(&state->shared->has_work)) {
if (atomic_load(&state->shared->stop)) {
return 0;
// check if we should stop
if (atomic_load(&state->shared->stop)) {
break;
}
}
ggml_lock_lock (&state->shared->spin);
ggml_lock_unlock(&state->shared->spin);
}
// check if we should stop
if (atomic_load(&state->shared->stop)) {
break;
}
if (state->node) {
ggml_compute_forward(&state->params, state->node);
state->node = NULL;
} else {
break;
if (state->node) {
ggml_compute_forward(&state->params, state->node);
state->node = NULL;
} else {
break;
}
}
}
printf("thread %d exiting\n", state->params.ith);
return 0;
}
@ -6986,26 +7080,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
cgraph->n_threads = 8;
}
if (cgraph->n_threads > GGML_MAX_THREADS) {
cgraph->n_threads = GGML_MAX_THREADS;
}
const int n_threads = cgraph->n_threads;
struct ggml_compute_state_shared state_shared = {
/*.spin =*/ GGML_LOCK_INITIALIZER,
/*.n_threads =*/ n_threads,
/*.spin =*/ GGML_LOCK_INITIALIZER,
/*.n_ready =*/ 0,
/*.has_work =*/ false,
/*.stop =*/ false,
/*.th_pool =*/ NULL,
};
struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
// create thread pool
struct ggml_compute_state * workers = NULL;
// find thread pool that is not currently in use
if (n_threads > 1) {
ggml_lock_init(&state_shared.spin);
ggml_critical_section_start();
atomic_store(&state_shared.has_work, true);
for (int i = 0; i < GGML_MAX_THREAD_POOLS; ++i) {
if (g_state.th_pools[i].is_used) {
continue;
}
state_shared.th_pool = &g_state.th_pools[i];
state_shared.th_pool->is_used = true;
break;
}
ggml_critical_section_end();
if (!state_shared.th_pool) {
fprintf(stderr, "%s: no thread pool available for graph computation\n", __func__);
GGML_ASSERT(false); // TODO: maybe dynamically allocate threads in the future
return;
}
workers = state_shared.th_pool->states;
for (int j = 0; j < n_threads - 1; j++) {
const ggml_thread_t th_save = workers[j].thrd;
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.thrd = th_save,
.params = {
.type = GGML_TASK_COMPUTE,
.ith = j + 1,
@ -7016,10 +7137,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
.node = NULL,
.shared = &state_shared,
};
int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
assert(rc == 0);
UNUSED(rc);
if (workers[j].thrd == 0) {
int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
assert(rc == 0);
UNUSED(rc);
state_shared.th_pool->n_threads++;
}
}
ggml_lock_lock(&state_shared.th_pool->lock);
while (state_shared.th_pool->n_ready < state_shared.th_pool->n_threads) {
ggml_lock_unlock(&state_shared.th_pool->lock);
ggml_lock_lock (&state_shared.th_pool->lock);
// busy loop
}
atomic_store(&state_shared.has_work, true);
ggml_cond_broadcast(&state_shared.th_pool->cond);
ggml_lock_unlock (&state_shared.th_pool->lock);
}
// initialize tasks + work buffer
@ -7235,13 +7373,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
// COMPUTE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
// spin lock
{
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
while (atomic_load(&state_shared.has_work)) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
}
// launch thread pool
@ -7256,14 +7397,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
workers[j].node = node;
}
atomic_fetch_sub(&state_shared.n_ready, 1);
// spin lock
{
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
while (atomic_load(&state_shared.n_ready) > 0) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
atomic_store(&state_shared.has_work, true);
}
atomic_store(&state_shared.has_work, true);
}
params.type = GGML_TASK_COMPUTE;
@ -7271,32 +7415,38 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
// spin lock
{
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
while (atomic_load(&state_shared.has_work)) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
atomic_fetch_sub(&state_shared.n_ready, 1);
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
while (atomic_load(&state_shared.n_ready) != 0) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
}
}
// FINALIZE
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
// spin lock
{
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
while (atomic_load(&state_shared.has_work)) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
}
// launch thread pool
@ -7311,14 +7461,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
workers[j].node = node;
}
atomic_fetch_sub(&state_shared.n_ready, 1);
// spin lock
{
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) > 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
while (atomic_load(&state_shared.n_ready) > 0) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
atomic_store(&state_shared.has_work, true);
}
atomic_store(&state_shared.has_work, true);
}
params.type = GGML_TASK_FINALIZE;
@ -7326,20 +7479,23 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
// wait for thread pool
if (node->n_tasks > 1) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
// spin lock
{
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
while (atomic_load(&state_shared.has_work)) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
}
while (atomic_load(&state_shared.has_work)) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
atomic_fetch_sub(&state_shared.n_ready, 1);
atomic_fetch_sub(&state_shared.n_ready, 1);
while (atomic_load(&state_shared.n_ready) != 0) {
ggml_lock_lock (&state_shared.spin);
ggml_lock_unlock(&state_shared.spin);
while (atomic_load(&state_shared.n_ready) != 0) {
//ggml_lock_lock (&state_shared.spin);
//ggml_lock_unlock(&state_shared.spin);
}
}
}
@ -7356,16 +7512,31 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
// join thread pool
if (n_threads > 1) {
atomic_store(&state_shared.stop, true);
atomic_store(&state_shared.has_work, true);
for (int j = 0; j < n_threads - 1; j++) {
int rc = ggml_thread_join(workers[j].thrd, NULL);
assert(rc == 0);
UNUSED(rc);
// spin lock
{
atomic_store(&state_shared.stop, true);
atomic_store(&state_shared.has_work, true);
}
ggml_lock_destroy(&state_shared.spin);
//for (int j = 0; j < n_threads - 1; j++) {
// int rc = ggml_thread_join(workers[j].thrd, NULL);
// assert(rc == 0);
// UNUSED(rc);
//}
ggml_lock_lock(&state_shared.th_pool->lock);
while (state_shared.th_pool->n_ready < state_shared.th_pool->n_threads) {
ggml_lock_unlock(&state_shared.th_pool->lock);
ggml_lock_lock (&state_shared.th_pool->lock);
// busy loop
}
ggml_lock_unlock(&state_shared.th_pool->lock);
ggml_critical_section_start();
state_shared.th_pool->is_used = false;
ggml_critical_section_end();
}
// performance stats (graph)

12
ggml.h
View File

@ -177,11 +177,13 @@ extern "C" {
#include <stddef.h>
#include <stdbool.h>
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_MAX_THREADS 64
#define GGML_MAX_THREAD_POOLS 16
#ifdef __ARM_NEON
// we use the built-in 16-bit float type