Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-01-26 07:48:48 +01:00)
ggml : add numa options (llama/5377)
* Added numa options to allow finer-grained control as well as plumbing for a new mirror mode that will require numa.h
* Reverted Makefile
* Fixed include
* Removed sched.h from ggml.h, moved ggml_get_numa_affinity into ggml.c, removed trailing whitespace and fixed up a few inconsistent variables
* removed trailing whitespace
* Added numa options to allow finer-grained control as well as plumbing for a new mirror mode that will require numa.h
* Reverting Makefile
* Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode not being implemented yet
* Removing MIRROR_MODE code for this PR
* Removing last bit of MIRROR_MODE code for this PR
* Removing unneeded branch in server.cpp example and moving get_numa_affinity and making it static
* Fixed lingering init_llama_backend() bool calls in tests and examples
* Remove enum llama_numa_strategies
* Revert bad merge with dynatemp flags
* add missing enum ggml_numa_strategies declaration and revert sync problem with master
* add missing enum ggml_numa_strategies declaration
* fixed ggml_init_numa variable
* Update ggml.h (Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>)
* Update READMEs with info about numa flags, change INTERLEAVE strategy name to DISTRIBUTE everywhere, implement the improved distribution strategy from @rankaiyx, fix a spelling mistake and un-merge some bad merges
* split numa init out from llama_backend_init and created llama_numa_init. Updated all code paths and samples
* Fix up some boolean vs enum comparisons
* Added #ifdefs for non-Linux OSes that don't have the cpu_set_t datatype
* Update ggml.h: Align enum values (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)
* Update ggml.c: Remove whitespace (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)
* Update ggml.c: align parameters (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)
* Update examples/server/server.cpp: remove whitespace and align brace (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)
* Update common/common.cpp: remove whitespace and align brace (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)
* unified ggml_numa_strategy enum and fixed text alignment in server.cpp example
* Update ggml.c: simplified return for platforms without NUMA support (Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>)
* removed redundant else from CLI argument processing of --numa
* whitespace

---------

Co-authored-by: root <root@nenya.lothlorien.ca>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
parent 9f675e021c
commit a4ed8a0821
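Below is a minimal usage sketch of the new API described in the commit message above. It is not code from this commit: parse_numa_flag and the flag strings are hypothetical stand-ins for the --numa handling in common.cpp; only ggml_numa_init() and the GGML_NUMA_STRATEGY_* values come from the diff that follows.

// Minimal usage sketch (assumes ggml.h from after this commit is on the include path).
#include "ggml.h"
#include <string.h>

static enum ggml_numa_strategy parse_numa_flag(const char * arg) {
    // hypothetical helper mapping CLI text to the new enum
    if (strcmp(arg, "distribute") == 0) { return GGML_NUMA_STRATEGY_DISTRIBUTE; } // spread threads over all nodes
    if (strcmp(arg, "isolate")    == 0) { return GGML_NUMA_STRATEGY_ISOLATE;    } // stay on the starting node
    if (strcmp(arg, "numactl")    == 0) { return GGML_NUMA_STRATEGY_NUMACTL;    } // keep the cpuset numactl gave us
    return GGML_NUMA_STRATEGY_DISABLED;
}

int main(int argc, char ** argv) {
    enum ggml_numa_strategy numa = (argc > 1) ? parse_numa_flag(argv[1]) : GGML_NUMA_STRATEGY_DISABLED;
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
        ggml_numa_init(numa); // call once, before any graph computation spawns threads
    }
    // ... build contexts and compute graphs as usual ...
    return 0;
}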
ggml.c (80 lines changed)
@@ -1954,9 +1954,16 @@ struct ggml_numa_node {
 };
 
 struct ggml_numa_nodes {
+    enum ggml_numa_strategy numa_strategy;
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
+    uint32_t current_node; // node on which main process is executing
+#ifdef __linux__
+    cpu_set_t cpuset; // cpuset from numactl
+#else
+    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
+#endif
 };
 
 //
@@ -1990,7 +1997,22 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void) {
+#ifdef __linux__
+static cpu_set_t ggml_get_numa_affinity(void) {
+    cpu_set_t cpuset;
+    pthread_t thread;
+    thread = pthread_self();
+    CPU_ZERO(&cpuset);
+    pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
+    return cpuset;
+}
+#else
+static uint32_t ggml_get_numa_affinity(void) {
+    return 0; // no NUMA support
+}
+#endif
+
+void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
     if (g_state.numa.n_nodes > 0) {
         fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
 
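For context, a standalone sketch of the affinity query that the new ggml_get_numa_affinity() helper performs. This is not part of the commit; it assumes Linux/glibc and compilation with -pthread, and simply prints the CPUs the calling thread may run on.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    // same call the helper above uses to snapshot the inherited cpuset
    int rv = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
    if (rv != 0) {
        fprintf(stderr, "pthread_getaffinity_np failed: %s\n", strerror(rv));
        return 1;
    }
    printf("%d CPUs allowed:", CPU_COUNT(&cpuset));
    for (int i = 0; i < CPU_SETSIZE; ++i) {
        if (CPU_ISSET(i, &cpuset)) {
            printf(" %d", i);
        }
    }
    printf("\n");
    return 0;
}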
@@ -2002,6 +2024,13 @@ void ggml_numa_init(void) {
     char path[256];
     int rv;
 
+    // set numa scheme
+    g_state.numa.numa_strategy = numa_flag;
+
+    GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
+
+    g_state.numa.cpuset = ggml_get_numa_affinity();
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
@@ -2020,11 +2049,17 @@ void ggml_numa_init(void) {
 
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
 
-    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
+    // figure out which node we're on
+    uint current_cpu;
+    int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+
+    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
+    GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
 
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
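For context, a standalone sketch of the getcpu() probe used above to record the current NUMA node. This is not part of the commit; the getcpu() wrapper is exposed by glibc 2.29 and later, while older systems would go through syscall(SYS_getcpu, ...).

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void) {
    unsigned int cpu  = 0;
    unsigned int node = 0;
    // same probe ggml_numa_init() uses to fill g_state.numa.current_node
    if (getcpu(&cpu, &node) != 0) {
        perror("getcpu");
        return 1;
    }
    printf("running on CPU %u, NUMA node %u\n", cpu, node);
    return 0;
}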
@@ -16638,26 +16673,46 @@ typedef pthread_t ggml_thread_t;
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__linux__) && !defined(__BIONIC__)
-static void set_numa_thread_affinity(int thread_n, int n_threads) {
+static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
     }
 
-    // run thread on node_num thread_n / (threads per node)
-    const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
-    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+    int node_num;
+    int rv;
     size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
 
+    switch(g_state.numa.numa_strategy) {
+        case GGML_NUMA_STRATEGY_DISTRIBUTE:
+            // run thread on node_num thread_n / (threads per node)
+            node_num = thread_n % g_state.numa.n_nodes;
+            break;
+        case GGML_NUMA_STRATEGY_ISOLATE:
+            // run thread on current_node
+            node_num = g_state.numa.current_node;
+            break;
+        case GGML_NUMA_STRATEGY_NUMACTL:
+            // use the cpuset that numactl gave us
+            rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
+            if (rv) {
+                fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv));
+            }
+            return;
+        default:
+            return;
+    }
+
+    struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
+
     cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
     CPU_ZERO_S(setsize, cpus);
     for (size_t i = 0; i < node->n_cpus; ++i) {
         CPU_SET_S(node->cpus[i], setsize, cpus);
     }
 
-    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
+    rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
     if (rv) {
-            fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
-                    strerror(rv));
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
     }
 
     CPU_FREE(cpus);
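For context, a standalone sketch of the dynamically sized cpu_set_t pinning pattern used by the DISTRIBUTE and ISOLATE branches above. This is not part of the commit: sysconf(_SC_NPROCESSORS_ONLN) stands in for g_state.numa.total_cpus, and pinning to CPU 0 stands in for the loop over node->cpus[]. Linux/glibc only; compile with -pthread.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void) {
    int total_cpus = (int) sysconf(_SC_NPROCESSORS_ONLN); // stand-in for g_state.numa.total_cpus
    if (total_cpus < 1) {
        total_cpus = 1;
    }
    size_t setsize = CPU_ALLOC_SIZE(total_cpus);
    cpu_set_t * cpus = CPU_ALLOC(total_cpus);
    if (cpus == NULL) {
        return 1;
    }
    CPU_ZERO_S(setsize, cpus);
    CPU_SET_S(0, setsize, cpus); // ggml sets every CPU of the chosen node instead
    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
    if (rv) {
        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
    }
    CPU_FREE(cpus);
    return rv;
}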
@@ -16678,8 +16733,7 @@ static void clear_numa_thread_affinity(void) {
 
     int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
     if (rv) {
-        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
-            strerror(rv));
+        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv));
     }
 
     CPU_FREE(cpus);
@@ -16687,7 +16741,7 @@ static void clear_numa_thread_affinity(void) {
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
+static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
@@ -16987,7 +17041,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     const int n_threads = state->shared->n_threads;
 
-    set_numa_thread_affinity(state->ith, n_threads);
+    set_numa_thread_affinity(state->ith);
 
     int node_n = -1;
     int task_phase = GGML_TASK_FINALIZE;
ggml.h (12 lines changed)
@@ -658,6 +658,16 @@ extern "C" {
         void * wdata;
     };
 
+    // numa strategies
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT
+    };
+
     // misc
 
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
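A hypothetical helper, not part of this commit, that maps the new enum to a printable name, e.g. for debug logging around a ggml_numa_init() call:

#include "ggml.h"

static const char * ggml_numa_strategy_name(enum ggml_numa_strategy s) {
    // hypothetical convenience function; the names mirror the enum values above
    switch (s) {
        case GGML_NUMA_STRATEGY_DISABLED:   return "disabled";
        case GGML_NUMA_STRATEGY_DISTRIBUTE: return "distribute";
        case GGML_NUMA_STRATEGY_ISOLATE:    return "isolate";
        case GGML_NUMA_STRATEGY_NUMACTL:    return "numactl";
        case GGML_NUMA_STRATEGY_MIRROR:     return "mirror";
        default:                            return "unknown";
    }
}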
@@ -668,7 +678,7 @@ extern "C" {
 
     GGML_API void ggml_print_backtrace(void);
 
-    GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
     GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
 
     GGML_API void ggml_print_object (const struct ggml_object * obj);