From a4ed8a08213ce0d5708f2431f436f9e72d7b7044 Mon Sep 17 00:00:00 2001 From: bmwl Date: Fri, 16 Feb 2024 01:31:07 -0800 Subject: [PATCH] ggml : add numa options (llama/5377) * Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h * Reverted Makefile * Fixed include * Removed sched.h from ggml.h, moved ggml_get_numa_affinity into ggml.c, removed trailing whitespace and fixed up a few inconsistent variables * removed trailing whitespace * Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h * Reverting Makefile * Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode note being implemented yet * Removing MIRROR_MODE code for this PR * Removing last bit of MIRROR_MODE code for this PR * Removing unneeded branch in server.cpp example and moving get_numa_affinity and making it static * Fixed lingering init_llama_backend() bool calls in tests and examples * Remote enum llama_numa_strategies * Revert bad merge with dynatemp flags * add missing enum ggml_numa_strategies declaration and revert sync problem with master * add missing enum ggml_numa_strategies declaration * fixed ggml_init_numa variable * Update ggml.h Co-authored-by: Jared Van Bortel * Update READMEs with info about numa flags, change INTERLEAVE strategy name to DISTRIBUTE everywhere, implement the improved distribution strategy from @rankaiyx, fix a spelling mistake and un-merge some bad merges * split numa init out from llama_backend_init and created llama_numa_init. Updated all code paths and samples * Fix up some boolean vs enum comparisons * Added #ifdefs for non-Linux OS that don't have cpu_set_t datatype * Update ggml.h Align enum values Co-authored-by: Georgi Gerganov * Update ggml.c Remove whitespace Co-authored-by: Georgi Gerganov * Update ggml.c align paremeters Co-authored-by: Georgi Gerganov * Update examples/server/server.cpp remove whitespace and align brace Co-authored-by: Georgi Gerganov * Update common/common.cpp Remove whitespace and align brace Co-authored-by: Georgi Gerganov * unified ggml_numa_strategy enum and fixed text alignment in server.cpp example * Update ggml.c simplified return for platforms without NUMA support Co-authored-by: Jared Van Bortel * removed redundant else from cli argument processing of --numa * whitespace --------- Co-authored-by: root Co-authored-by: Jared Van Bortel Co-authored-by: Georgi Gerganov Co-authored-by: Jared Van Bortel --- ggml.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++---------- ggml.h | 12 ++++++++- 2 files changed, 78 insertions(+), 14 deletions(-) diff --git a/ggml.c b/ggml.c index d921d82f..4e302fb7 100644 --- a/ggml.c +++ b/ggml.c @@ -1954,9 +1954,16 @@ struct ggml_numa_node { }; struct ggml_numa_nodes { + enum ggml_numa_strategy numa_strategy; struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system + uint32_t current_node; // node on which main process is execting +#ifdef __linux__ + cpu_set_t cpuset; // cpuset from numactl +#else + uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype +#endif }; // @@ -1990,7 +1997,22 @@ inline static void ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } -void ggml_numa_init(void) { +#ifdef __linux__ +static cpu_set_t ggml_get_numa_affinity(void) { + cpu_set_t cpuset; + pthread_t thread; + thread = pthread_self(); + CPU_ZERO(&cpuset); + pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + return cpuset; +} +#else +static uint32_t ggml_get_numa_affinity(void) { + return 0; // no NUMA support +} +#endif + +void ggml_numa_init(enum ggml_numa_strategy numa_flag) { if (g_state.numa.n_nodes > 0) { fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); @@ -2002,6 +2024,13 @@ void ggml_numa_init(void) { char path[256]; int rv; + // set numa scheme + g_state.numa.numa_strategy = numa_flag; + + GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy); + + g_state.numa.cpuset = ggml_get_numa_affinity(); + // enumerate nodes while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); @@ -2020,11 +2049,17 @@ void ggml_numa_init(void) { GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); - if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { + // figure out which node we're on + uint current_cpu; + int getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); + + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { g_state.numa.n_nodes = 0; return; } + GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { struct ggml_numa_node * node = &g_state.numa.nodes[n]; GGML_PRINT_DEBUG("CPUs on node %u:", n); @@ -16638,26 +16673,46 @@ typedef pthread_t ggml_thread_t; // Android's libc implementation "bionic" does not support setting affinity #if defined(__linux__) && !defined(__BIONIC__) -static void set_numa_thread_affinity(int thread_n, int n_threads) { +static void set_numa_thread_affinity(int thread_n) { if (!ggml_is_numa()) { return; } - // run thread on node_num thread_n / (threads per node) - const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); - struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + int node_num; + int rv; size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + switch(g_state.numa.numa_strategy) { + case GGML_NUMA_STRATEGY_DISTRIBUTE: + // run thread on node_num thread_n / (threads per node) + node_num = thread_n % g_state.numa.n_nodes; + break; + case GGML_NUMA_STRATEGY_ISOLATE: + // run thread on current_node + node_num = g_state.numa.current_node; + break; + case GGML_NUMA_STRATEGY_NUMACTL: + // use the cpuset that numactl gave us + rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",strerror(rv)); + } + return; + default: + return; + } + + struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (size_t i = 0; i < node->n_cpus; ++i) { CPU_SET_S(node->cpus[i], setsize, cpus); } - int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", - strerror(rv)); + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } CPU_FREE(cpus); @@ -16678,8 +16733,7 @@ static void clear_numa_thread_affinity(void) { int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { - fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", - strerror(rv)); + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } CPU_FREE(cpus); @@ -16687,7 +16741,7 @@ static void clear_numa_thread_affinity(void) { #else // TODO: Windows etc. // (the linux implementation may also work on BSD, someone should test) -static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } static void clear_numa_thread_affinity(void) {} #endif @@ -16987,7 +17041,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const int n_threads = state->shared->n_threads; - set_numa_thread_affinity(state->ith, n_threads); + set_numa_thread_affinity(state->ith); int node_n = -1; int task_phase = GGML_TASK_FINALIZE; diff --git a/ggml.h b/ggml.h index 01cecc1e..27001818 100644 --- a/ggml.h +++ b/ggml.h @@ -658,6 +658,16 @@ extern "C" { void * wdata; }; + // numa strategies + enum ggml_numa_strategy { + GGML_NUMA_STRATEGY_DISABLED = 0, + GGML_NUMA_STRATEGY_DISTRIBUTE = 1, + GGML_NUMA_STRATEGY_ISOLATE = 2, + GGML_NUMA_STRATEGY_NUMACTL = 3, + GGML_NUMA_STRATEGY_MIRROR = 4, + GGML_NUMA_STRATEGY_COUNT + }; + // misc GGML_API void ggml_time_init(void); // call this once at the beginning of the program @@ -668,7 +678,7 @@ extern "C" { GGML_API void ggml_print_backtrace(void); - GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems + GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node GGML_API void ggml_print_object (const struct ggml_object * obj);