diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 9662e4d7..ae68cd00 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) { // TODO: move to ggml-threading void ggml_barrier(struct ggml_threadpool * tp); +void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value); +int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index ff28bf98..2c12e493 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) { #endif } +void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) { + atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed); +} + +int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) { + return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed); +} + #if defined(__gnu_linux__) static cpu_set_t ggml_get_numa_affinity(void) { cpu_set_t cpuset; diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 1d46158f..1c545f80 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -53,7 +53,6 @@ #include "ggml-cpu-impl.h" #include "ggml-quants.h" -#include #include #include @@ -394,8 +393,6 @@ class tinyBLAS { template NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) { - static std::atomic current_chunk; - GGML_ASSERT(m % (RM * BM) == 0); const int64_t ytiles = m / (RM * BM); const int64_t xtiles = (n + RN -1) / RN; @@ -410,7 +407,7 @@ class tinyBLAS { if (params->ith == 0) { GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles); // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. - std::atomic_store_explicit(¤t_chunk, (int64_t)params->nth, std::memory_order_relaxed); + ggml_threadpool_chunk_set(params->threadpool, params->nth); } ggml_barrier(params->threadpool); @@ -439,8 +436,7 @@ class tinyBLAS { GGML_ASSERT(jj == jj2); } - // next step. - job = std::atomic_fetch_add_explicit(¤t_chunk, (int64_t)1, std::memory_order_relaxed); + job = ggml_threadpool_chunk_add(params->threadpool, 1); } ggml_barrier(params->threadpool);