From b7b38f7d68d0eb3a0668fc779aa055c7f1980489 Mon Sep 17 00:00:00 2001
From: amritahs-ibm
Date: Sat, 9 Nov 2024 12:47:50 +0530
Subject: [PATCH] ggml : optimize llamafile cpu matrix multiplication for ppc64le (llama/10156)

This change upstreams llamafile's cpu matrix
multiplication kernels for ppc64le using MMA
builtins for FP32 datatype.

This change results in a consistent 90%
improvement in input processing time, and 20%
to 80% improvement in output processing time,
across various batch sizes.

The patch is tested with Meta-Llama-3-8B,
Mistral-7B, Llama-2-7B-chat-hf models on an
IBM POWER10 machine.

Signed-off-by: Amrita H S
---
 ggml/src/CMakeLists.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 6c5b816d..a05f8c50 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1265,8 +1265,13 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
-    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1"
+                    OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index) # quoted: grep output is empty when no POWER10 line exists
+    if(${substring_index} GREATER_EQUAL 0)
+        list(APPEND ARCH_FLAGS -mcpu=power10)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
     else()
         list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
         #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)