Mirror of https://github.com/ggerganov/whisper.cpp.git
llama : add thread safety test (llama/14035)
* llama : add thread safety test
* llamafile : remove global state
* llama : better LLAMA_SPLIT_MODE_NONE logic (when main_gpu < 0, GPU devices are not used)

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent ad6cd94a3a
commit 44871c8a3e
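This sync replaces the function-local static std::atomic<int64_t> counter in the llamafile tinyBLAS gemm kernel with a counter owned by the ggml threadpool, accessed through the new ggml_threadpool_chunk_set() and ggml_threadpool_chunk_add() helpers. As orientation before the diff, here is a minimal, self-contained sketch of the chunk-claiming pattern those helpers back; it is not ggml code, and the worker lambda, thread count, and chunk count are invented for illustration:

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int nth      = 4;   // number of worker threads (params->nth in ggml)
    const int n_chunks = 32;  // total tiles of work to hand out

    std::atomic<int> current_chunk{0};
    std::vector<int> done(n_chunks, 0);

    // Seed the counter with nth, mirroring ggml_threadpool_chunk_set(tp, nth):
    // every thread starts on its own index ith, so the first unclaimed chunk is nth.
    current_chunk.store(nth, std::memory_order_relaxed);

    auto worker = [&](int ith) {
        int job = ith;
        while (job < n_chunks) {
            done[job] = 1;  // "process" chunk `job`
            // Claim the next chunk, mirroring ggml_threadpool_chunk_add(tp, 1).
            job = current_chunk.fetch_add(1, std::memory_order_relaxed);
        }
    };

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back(worker, ith);
    }
    for (auto & t : threads) {
        t.join();
    }

    int processed = 0;
    for (int d : done) {
        processed += d;
    }
    std::printf("processed %d/%d chunks\n", processed, n_chunks);
}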
@@ -503,6 +503,9 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
+
 #ifdef __cplusplus
 }
 #endif
@@ -559,6 +559,14 @@ void ggml_barrier(struct ggml_threadpool * tp) {
 #endif
 }
 
+void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value) {
+    atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
+int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value) {
+    return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+}
+
 #if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
     cpu_set_t cpuset;
@@ -53,7 +53,6 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
-#include <atomic>
 #include <array>
 #include <type_traits>
 
@@ -394,8 +393,6 @@ class tinyBLAS {
 
     template <int RM, int RN, int BM>
     NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
-        static std::atomic<int64_t> current_chunk;
-
         GGML_ASSERT(m % (RM * BM) == 0);
         const int64_t ytiles = m / (RM * BM);
         const int64_t xtiles = (n + RN -1) / RN;
@@ -410,7 +407,7 @@ class tinyBLAS {
         if (params->ith == 0) {
             GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
             // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
-            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
+            ggml_threadpool_chunk_set(params->threadpool, params->nth);
         }
 
         ggml_barrier(params->threadpool);
@@ -439,8 +436,7 @@ class tinyBLAS {
                 GGML_ASSERT(jj == jj2);
             }
 
-            // next step.
-            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
+            job = ggml_threadpool_chunk_add(params->threadpool, 1);
         }
 
         ggml_barrier(params->threadpool);
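The thread-safety motivation can be illustrated with a standalone check as well (a sketch only, not the test added by this commit; pool_state, run_pool, and the chunk counts are invented for illustration): once the counter is per-pool state rather than a single static inside gemm(), two concurrent dispatches each account for exactly their own chunks.

#include <atomic>
#include <cassert>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for the per-threadpool counter this commit introduces.
struct pool_state {
    std::atomic<int> current_chunk{0};
};

// Run one chunked job on nth threads using the pool's own counter and
// return how many chunks were processed in total.
static int run_pool(pool_state & pool, int nth, int n_chunks) {
    std::atomic<int> processed{0};
    pool.current_chunk.store(nth, std::memory_order_relaxed);

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back([&, ith] {
            int job = ith;
            while (job < n_chunks) {
                processed.fetch_add(1, std::memory_order_relaxed);
                job = pool.current_chunk.fetch_add(1, std::memory_order_relaxed);
            }
        });
    }
    for (auto & t : threads) {
        t.join();
    }
    return processed.load();
}

int main() {
    pool_state pool_a, pool_b;
    int done_a = 0, done_b = 0;

    // Two independent "contexts" dispatching their chunks at the same time.
    std::thread ta([&] { done_a = run_pool(pool_a, 4, 64); });
    std::thread tb([&] { done_b = run_pool(pool_b, 2, 48); });
    ta.join();
    tb.join();

    // With per-pool counters, neither run can steal or skip the other's chunks.
    assert(done_a == 64 && done_b == 48);
    std::printf("pool A: %d chunks, pool B: %d chunks\n", done_a, done_b);
}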