llama : add thread safety test (llama/14035)

* llama : add thread safety test

* llamafile : remove global state

* llama : better LLAMA_SPLIT_MODE_NONE logic

when main_gpu < 0, GPU devices are not used

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Diego Devesa, 2025-06-16 08:11:43 -07:00, committed by Georgi Gerganov
parent ad6cd94a3a
commit 44871c8a3e
3 changed files with 13 additions and 6 deletions
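
The split-mode bullet above describes load-time device selection: with LLAMA_SPLIT_MODE_NONE, a negative main_gpu now means GPU devices are skipped entirely. A minimal sketch of what that looks like through the public llama.h API (not part of this diff; the model path is a placeholder):

    // Sketch only: CPU-only load via LLAMA_SPLIT_MODE_NONE + main_gpu < 0,
    // per the behavior described in the commit message above.
    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            return 1; // usage: ./demo <model.gguf>
        }

        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.split_mode = LLAMA_SPLIT_MODE_NONE; // no splitting across devices
        mparams.main_gpu   = -1;                    // < 0: do not use GPU devices at all

        llama_model * model = llama_model_load_from_file(argv[1], mparams);
        if (model == nullptr) {
            llama_backend_free();
            return 1;
        }

        llama_model_free(model);
        llama_backend_free();
        return 0;
    }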


@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif


@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
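
The two new helpers are thin wrappers around a relaxed atomic store and fetch-add on the threadpool's current_chunk counter. A standalone sketch of the work-handout pattern they support, using std::atomic and std::thread rather than ggml internals (the threadpool struct is not public):

    // Each worker starts at its own index (ith); once that is done it grabs the
    // next unprocessed chunk from a shared counter, mirroring what
    // ggml_threadpool_chunk_set()/ggml_threadpool_chunk_add() enable in ggml.
    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    int main() {
        const int nth      = 4;   // number of worker threads
        const int n_chunks = 16;  // total chunks of work

        // first unprocessed chunk after every thread has taken its initial one
        std::atomic<int> current_chunk{nth};

        auto worker = [&](int ith) {
            for (int job = ith; job < n_chunks; ) {
                std::printf("thread %d processes chunk %d\n", ith, job);
                // relaxed is fine for a pure counter; in ggml the surrounding
                // ggml_barrier() calls provide the actual synchronization
                job = current_chunk.fetch_add(1, std::memory_order_relaxed);
            }
        };

        std::vector<std::thread> threads;
        for (int ith = 0; ith < nth; ++ith) {
            threads.emplace_back(worker, ith);
        }
        for (auto & t : threads) {
            t.join();
        }
        return 0;
    }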


@@ -53,7 +53,6 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
-#include <atomic>
 #include <array>
 #include <type_traits>

@@ -394,8 +393,6 @@ class tinyBLAS {
     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;

@@ -410,7 +407,7 @@
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
         ggml_barrier(params->threadpool);

@@ -439,8 +436,7 @@
                 GGML_ASSERT(jj == jj2);
             }
-            // next step.
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
         ggml_barrier(params->threadpool);
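
Why the llamafile change matters for thread safety: the counter used to be a function-local static std::atomic inside gemm(), i.e. a single object shared by every caller in the process, so two threadpools multiplying matrices at the same time would hand out chunks from the same counter. Moving it into the threadpool keeps each pool's bookkeeping private. A small sketch of the difference (illustrative only, not code from the commit):

    #include <atomic>
    #include <cstdio>

    struct pool {
        std::atomic<int> current_chunk{0}; // per-pool state, as in the patched code
    };

    // old style: a function-local static is a single, process-wide counter
    static int next_chunk_global() {
        static std::atomic<int> current_chunk{0};
        return current_chunk.fetch_add(1, std::memory_order_relaxed);
    }

    // new style: the counter is owned by the pool doing the work
    static int next_chunk(pool & p) {
        return p.current_chunk.fetch_add(1, std::memory_order_relaxed);
    }

    int main() {
        pool a, b;

        int g0 = next_chunk_global();
        int g1 = next_chunk_global();
        std::printf("shared counter: %d then %d\n", g0, g1); // 0 then 1: all callers interleave

        std::printf("pool a: %d, pool b: %d\n", next_chunk(a), next_chunk(b)); // 0 and 0: independent
        return 0;
    }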