Compare commits


1 commit

Author SHA1 Message Date
aaa3b5e5f6 ggml : try to fix the abort mechanism 2023-11-05 20:02:24 +02:00
74 changed files with 1786 additions and 1571 deletions

.github/workflows/build.yml

@@ -88,7 +88,7 @@ jobs:
  -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
  apt update
  apt install -y build-essential cmake libsdl2-dev
- cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+ cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
  make
  ctest -L gh --output-on-failure'

@@ -115,7 +115,7 @@ jobs:
  -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
  apt update
  apt install -y build-essential cmake libsdl2-dev
- cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
+ cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
  make
  ctest -L gh --output-on-failure'

@@ -182,7 +182,7 @@ jobs:
  run: >
  cmake -S . -B ./build -A ${{ matrix.arch }}
  -DCMAKE_BUILD_TYPE=${{ matrix.build }}
- -DWHISPER_SDL2=${{ matrix.sdl2 }}
+ -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
  - name: Build
  run: |

@@ -217,10 +217,10 @@ jobs:
  sdl2: [ON]
  include:
  - arch: Win32
- obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x86.zip
+ obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip
  s2arc: x86
  - arch: x64
- obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x64.zip
+ obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip
  s2arc: x64
  - sdl2: ON
  s2ver: 2.26.0

@@ -239,7 +239,7 @@ jobs:
  7z x blas.zip -oblas -y
  copy blas/include/cblas.h .
  copy blas/include/openblas_config.h .
- echo "OPENBLAS_PATH=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
+ echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
  - name: Fetch SDL2 and set SDL2_DIR
  if: matrix.sdl2 == 'ON'

@@ -252,9 +252,9 @@ jobs:
  run: >
  cmake -S . -B ./build -A ${{ matrix.arch }}
  -DCMAKE_BUILD_TYPE=${{ matrix.build }}
- -DWHISPER_OPENBLAS=${{ matrix.blas }}
- -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
- -DWHISPER_SDL2=${{ matrix.sdl2 }}
+ -DWHISPER_SUPPORT_OPENBLAS=${{ matrix.blas }}
+ -DCMAKE_LIBRARY_PATH="$env:blasdir/lib"
+ -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}
  - name: Build
  run: |

@@ -263,7 +263,7 @@ jobs:
  - name: Copy libopenblas.dll
  if: matrix.blas == 'ON'
- run: copy "$env:OPENBLAS_PATH/bin/libopenblas.dll" build/bin/${{ matrix.build }}
+ run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }}
  - name: Copy SDL2.dll
  if: matrix.sdl2 == 'ON'

.gitignore

@@ -18,11 +18,6 @@ build-no-accel/
  build-sanitize-addr/
  build-sanitize-thread/
- # SPM
- .build/
- .swiftpm
- *.metallib
  /main
  /stream
  /command

CMakeLists.txt

@@ -1,6 +1,6 @@
  cmake_minimum_required (VERSION 3.5)
- project(whisper.cpp VERSION 1.4.3)
+ project(whisper.cpp VERSION 1.4.2)
  # Add path to modules
  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

Makefile

@@ -307,7 +307,7 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
  ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
  $(CC) $(CFLAGS) -c $< -o $@
- WHISPER_OBJ += ggml.o ggml-alloc.o ggml-backend.o ggml-quants.o
+ WHISPER_OBJ += ggml-alloc.o ggml-backend.o ggml-quants.o
  whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

@@ -331,11 +331,11 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
  WHISPER_OBJ += ggml-metal.o
  endif
- libwhisper.a: $(WHISPER_OBJ)
+ libwhisper.a: ggml.o $(WHISPER_OBJ)
- $(AR) rcs libwhisper.a $(WHISPER_OBJ)
+ $(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
- libwhisper.so: $(WHISPER_OBJ)
+ libwhisper.so: ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
  clean:
  rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so

@@ -349,30 +349,30 @@ CC_SDL=`sdl2-config --cflags --libs`
  SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
  SRC_COMMON_SDL = examples/common-sdl.cpp
- main: examples/main/main.cpp $(SRC_COMMON) $(WHISPER_OBJ)
+ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o main $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
  ./main -h
- bench: examples/bench/bench.cpp $(WHISPER_OBJ)
+ bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) examples/bench/bench.cpp $(WHISPER_OBJ) -o bench $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
- quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
+ quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
- $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
- stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+ stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
- command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+ command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
- lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+ lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o lsp $(CC_SDL) $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o lsp $(CC_SDL) $(LDFLAGS)
- talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+ talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
- talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
+ talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
- $(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk-llama $(CC_SDL) $(LDFLAGS)
  #
  # Audio samples

@@ -417,10 +417,9 @@ samples:
  .PHONY: medium.en
  .PHONY: medium
  .PHONY: large-v1
- .PHONY: large-v2
  .PHONY: large
- tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
+ tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
  bash ./models/download-ggml-model.sh $@
  @echo ""
  @echo "==============================================="

Package.swift

@@ -1,77 +0,0 @@
- // swift-tools-version:5.5
- import PackageDescription
- #if arch(arm) || arch(arm64)
- let platforms: [SupportedPlatform]? = [
- .macOS(.v12),
- .iOS(.v14),
- .watchOS(.v4),
- .tvOS(.v14)
- ]
- let exclude: [String] = []
- let resources: [Resource] = [
- .process("ggml-metal.metal")
- ]
- let additionalSources: [String] = ["ggml-metal.m"]
- let additionalSettings: [CSetting] = [
- .unsafeFlags(["-fno-objc-arc"]),
- .define("GGML_USE_METAL")
- ]
- #else
- let platforms: [SupportedPlatform]? = nil
- let exclude: [String] = ["ggml-metal.metal"]
- let resources: [Resource] = []
- let additionalSources: [String] = []
- let additionalSettings: [CSetting] = []
- #endif
- let package = Package(
- name: "whisper",
- platforms: platforms,
- products: [
- .library(name: "whisper", targets: ["whisper"]),
- ],
- targets: [
- .target(
- name: "whisper",
- path: ".",
- exclude: exclude + [
- "bindings",
- "cmake",
- "coreml",
- "examples",
- "extra",
- "models",
- "samples",
- "tests",
- "CMakeLists.txt",
- "ggml-cuda.cu",
- "ggml-cuda.h",
- "Makefile"
- ],
- sources: [
- "ggml.c",
- "whisper.cpp",
- "ggml-alloc.c",
- "ggml-backend.c",
- "ggml-quants.c"
- ] + additionalSources,
- resources: resources,
- publicHeadersPath: "spm-headers",
- cSettings: [
- .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
- .define("GGML_USE_ACCELERATE")
- // NOTE: NEW_LAPACK will required iOS version 16.4+
- // We should consider add this in the future when we drop support for iOS 14
- // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
- // .define("ACCELERATE_NEW_LAPACK"),
- // .define("ACCELERATE_LAPACK_ILP64")
- ] + additionalSettings,
- linkerSettings: [
- .linkedFramework("Accelerate")
- ]
- )
- ],
- cxxLanguageStandard: .cxx11
- )

README.md

@@ -6,7 +6,7 @@
  [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
  [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
- Beta: [v1.4.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.3) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+ Beta: [v1.4.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.2) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
  High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@@ -234,7 +234,6 @@ make small
  make medium.en
  make medium
  make large-v1
- make large-v2
  make large
  ```

@@ -246,7 +245,7 @@ make large
  | base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
  | small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
  | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
- | large | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
+ | large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
  ## Quantization

bindings/go/examples/go-model-download/main.go

@@ -24,7 +24,7 @@ const (
  var (
  // The models which will be downloaded, if no model is specified as an argument
- modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
+ modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
  )
  var (

bindings/go/whisper.go

@@ -83,6 +83,7 @@ const (
  SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
  SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
  NumFFT = C.WHISPER_N_FFT
+ NumMEL = C.WHISPER_N_MEL
  HopLength = C.WHISPER_HOP_LENGTH
  ChunkSize = C.WHISPER_CHUNK_SIZE
  )

@@ -102,7 +103,7 @@ var (
  func Whisper_init(path string) *Context {
  cPath := C.CString(path)
  defer C.free(unsafe.Pointer(cPath))
- if ctx := C.whisper_init_from_file_with_params(cPath, C.whisper_context_default_params()); ctx != nil {
+ if ctx := C.whisper_init_from_file(cPath); ctx != nil {
  return (*Context)(ctx)
  } else {
  return nil

bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperContext.java

@@ -4,7 +4,6 @@ import com.sun.jna.Structure;
  import com.sun.jna.ptr.PointerByReference;
  import io.github.ggerganov.whispercpp.ggml.GgmlType;
  import io.github.ggerganov.whispercpp.WhisperModel;
- import io.github.ggerganov.whispercpp.params.WhisperContextParams;
  import java.util.List;

@@ -24,9 +23,8 @@ public class WhisperContext extends Structure {
  public PointerByReference vocab;
  public PointerByReference state;
- /** populated by whisper_init_from_file_with_params() */
+ /** populated by whisper_init_from_file() */
  String path_model;
- WhisperContextParams params;
  // public static class ByReference extends WhisperContext implements Structure.ByReference {
  // }

bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java

@@ -2,7 +2,6 @@ package io.github.ggerganov.whispercpp;
  import com.sun.jna.Native;
  import com.sun.jna.Pointer;
- import io.github.ggerganov.whispercpp.params.WhisperContextParams;
  import io.github.ggerganov.whispercpp.params.WhisperFullParams;
  import io.github.ggerganov.whispercpp.params.WhisperSamplingStrategy;

@@ -16,9 +15,8 @@ import java.io.IOException;
  public class WhisperCpp implements AutoCloseable {
  private WhisperCppJnaLibrary lib = WhisperCppJnaLibrary.instance;
  private Pointer ctx = null;
- private Pointer paramsPointer = null;
- private Pointer greedyParamsPointer = null;
- private Pointer beamParamsPointer = null;
+ private Pointer greedyPointer = null;
+ private Pointer beamPointer = null;
  public File modelDir() {
  String modelDirPath = System.getenv("XDG_CACHE_HOME");

@@ -33,18 +31,6 @@ public class WhisperCpp implements AutoCloseable {
  * @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
  */
  public void initContext(String modelPath) throws FileNotFoundException {
- initContextImpl(modelPath, getContextDefaultParams());
- }
- /**
- * @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
- * @param params - params to use when initialising the context
- */
- public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
- initContextImpl(modelPath, params);
- }
- private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
  if (ctx != null) {
  lib.whisper_free(ctx);
  }

@@ -57,26 +43,13 @@ public class WhisperCpp implements AutoCloseable {
  modelPath = new File(modelDir(), modelPath).getAbsolutePath();
  }
- ctx = lib.whisper_init_from_file_with_params(modelPath, params);
+ ctx = lib.whisper_init_from_file(modelPath);
  if (ctx == null) {
  throw new FileNotFoundException(modelPath);
  }
  }
- /**
- * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
- * Because this function allocates memory for the params, the caller must call either:
- * - call `whisper_free_context_params()`
- * - `Native.free(Pointer.nativeValue(pointer));`
- */
- public WhisperContextParams getContextDefaultParams() {
- paramsPointer = lib.whisper_context_default_params_by_ref();
- WhisperContextParams params = new WhisperContextParams(paramsPointer);
- params.read();
- return params;
- }
  /**
  * Provides default params which can be used with `whisper_full()` etc.
  * Because this function allocates memory for the params, the caller must call either:

@@ -90,15 +63,15 @@ public class WhisperCpp implements AutoCloseable {
  // whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
  if (strategy == WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY) {
- if (greedyParamsPointer == null) {
+ if (greedyPointer == null) {
- greedyParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
+ greedyPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
  }
- pointer = greedyParamsPointer;
+ pointer = greedyPointer;
  } else {
- if (beamParamsPointer == null) {
+ if (beamPointer == null) {
- beamParamsPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
+ beamPointer = lib.whisper_full_default_params_by_ref(strategy.ordinal());
  }
- pointer = beamParamsPointer;
+ pointer = beamPointer;
  }
  WhisperFullParams params = new WhisperFullParams(pointer);

@@ -120,17 +93,13 @@ public class WhisperCpp implements AutoCloseable {
  }
  private void freeParams() {
- if (paramsPointer != null) {
+ if (greedyPointer != null) {
- Native.free(Pointer.nativeValue(paramsPointer));
+ Native.free(Pointer.nativeValue(greedyPointer));
- paramsPointer = null;
+ greedyPointer = null;
  }
- if (greedyParamsPointer != null) {
+ if (beamPointer != null) {
- Native.free(Pointer.nativeValue(greedyParamsPointer));
+ Native.free(Pointer.nativeValue(beamPointer));
- greedyParamsPointer = null;
+ beamPointer = null;
- }
- if (beamParamsPointer != null) {
- Native.free(Pointer.nativeValue(beamParamsPointer));
- beamParamsPointer = null;
  }
  }

bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java

@@ -5,7 +5,6 @@ import com.sun.jna.Native;
  import com.sun.jna.Pointer;
  import io.github.ggerganov.whispercpp.model.WhisperModelLoader;
  import io.github.ggerganov.whispercpp.model.WhisperTokenData;
- import io.github.ggerganov.whispercpp.params.WhisperContextParams;
  import io.github.ggerganov.whispercpp.params.WhisperFullParams;
  public interface WhisperCppJnaLibrary extends Library {

@@ -14,32 +13,13 @@ public interface WhisperCppJnaLibrary extends Library {
  String whisper_print_system_info();
  /**
- * DEPRECATED. Allocate (almost) all memory needed for the model by loading from a file.
+ * Allocate (almost) all memory needed for the model by loading from a file.
  *
  * @param path_model Path to the model file
  * @return Whisper context on success, null on failure
  */
  Pointer whisper_init_from_file(String path_model);
- /**
- * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
- * Because this function allocates memory for the params, the caller must call either:
- * - call `whisper_free_context_params()`
- * - `Native.free(Pointer.nativeValue(pointer));`
- */
- Pointer whisper_context_default_params_by_ref();
- void whisper_free_context_params(Pointer params);
- /**
- * Allocate (almost) all memory needed for the model by loading from a file.
- *
- * @param path_model Path to the model file
- * @param params Pointer to whisper_context_params
- * @return Whisper context on success, null on failure
- */
- Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
  /**
  * Allocate (almost) all memory needed for the model by loading from a buffer.
  *
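
The API difference driving most of this comparison is spelled out in the hunk above: one side of the diff creates a context with whisper_init_from_file_with_params(), the other with the plain (and, on that side, deprecated) whisper_init_from_file(). A minimal C sketch of the two call styles, built only from names that appear in these hunks (the model path is illustrative):

```c
#include <stdio.h>
#include "whisper.h"

int main(void) {
    // Style on the "-" side of this diff: explicit context params.
    // whisper_context_default_params() and the use_gpu field both appear
    // in the hunks of this comparison.
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = true;
    struct whisper_context * ctx =
        whisper_init_from_file_with_params("models/ggml-base.en.bin", cparams);

    // Style on the "+" side (kept as DEPRECATED on the "-" side):
    // struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");

    if (ctx == NULL) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
        return 1;
    }
    whisper_free(ctx);
    return 0;
}
```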

bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperContextParams.java

@@ -1,31 +0,0 @@
- package io.github.ggerganov.whispercpp.params;
- import com.sun.jna.*;
- import java.util.Arrays;
- import java.util.List;
- /**
- * Parameters for the whisper_init_from_file_with_params() function.
- * If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
- * whisper_context_default_params()
- */
- public class WhisperContextParams extends Structure {
- public WhisperContextParams(Pointer p) {
- super(p);
- }
- /** Use GPU for inference Number (default = true) */
- public CBool use_gpu;
- /** Use GPU for inference Number (default = true) */
- public void useGpu(boolean enable) {
- use_gpu = enable ? CBool.TRUE : CBool.FALSE;
- }
- @Override
- protected List<String> getFieldOrder() {
- return Arrays.asList("use_gpu");
- }
- }

bindings/javascript/emscripten.cpp

@@ -20,7 +20,7 @@ struct whisper_context * g_context;
  EMSCRIPTEN_BINDINGS(whisper) {
  emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
  if (g_context == nullptr) {
- g_context = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
+ g_context = whisper_init_from_file(path_model.c_str());
  if (g_context != nullptr) {
  return true;
  } else {

bindings/javascript/package.json

@@ -1,6 +1,6 @@
  {
  "name": "whisper.cpp",
- "version": "1.4.3",
+ "version": "1.4.2",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {

bindings/ruby/ext/ruby_whisper.cpp

@@ -87,7 +87,7 @@ static VALUE ruby_whisper_initialize(int argc, VALUE *argv, VALUE self) {
  if (!rb_respond_to(whisper_model_file_path, rb_intern("to_s"))) {
  rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
  }
- rw->context = whisper_init_from_file_with_params(StringValueCStr(whisper_model_file_path), whisper_context_default_params());
+ rw->context = whisper_init_from_file(StringValueCStr(whisper_model_file_path));
  if (rw->context == nullptr) {
  rb_raise(rb_eRuntimeError, "error: failed to initialize whisper context");
  }

coreml/whisper-encoder-impl.h

@@ -123,7 +123,7 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
  /**
  Make a prediction using the convenience interface
- @param logmel_data as 1 × n_mel × 3000 3-dimensional array of floats:
+ @param logmel_data as 1 × 80 × 3000 3-dimensional array of floats:
  @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
  @return the prediction as whisper_encoder_implOutput
  */

coreml/whisper-encoder.h

@@ -3,8 +3,6 @@
  // Code is derived from the work of Github user @wangchou
  // ref: https://github.com/wangchou/callCoreMLFromCpp
- #include <stdint.h>
  #if __cplusplus
  extern "C" {
  #endif

@@ -16,8 +14,6 @@ void whisper_coreml_free(struct whisper_coreml_context * ctx);
  void whisper_coreml_encode(
  const whisper_coreml_context * ctx,
- int64_t n_ctx,
- int64_t n_mel,
  float * mel,
  float * out);

coreml/whisper-encoder.mm

@@ -48,15 +48,13 @@ void whisper_coreml_free(struct whisper_coreml_context * ctx) {
  void whisper_coreml_encode(
  const whisper_coreml_context * ctx,
- int64_t n_ctx,
- int64_t n_mel,
  float * mel,
  float * out) {
  MLMultiArray * inMultiArray = [
  [MLMultiArray alloc] initWithDataPointer: mel
- shape: @[@1, @(n_mel), @(n_ctx)]
+ shape: @[@1, @80, @3000]
  dataType: MLMultiArrayDataTypeFloat32
- strides: @[@(n_ctx*n_mel), @(n_ctx), @1]
+ strides: @[@(240000), @(3000), @1]
  deallocator: nil
  error: nil
  ];
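
The hard-coded values on the "+" side are just the row-major strides of a 1 × 80 × 3000 float array: 80 × 3000 = 240000 elements per batch, 3000 per mel row, 1 per sample. The "-" side computes the same layout from n_mel and n_ctx; a small check of the arithmetic:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Row-major strides of a [1, n_mel, n_ctx] array. With n_mel = 80 and
    // n_ctx = 3000 this reproduces the hard-coded @[@(240000), @(3000), @1].
    const int64_t n_mel = 80, n_ctx = 3000;
    printf("%lld %lld %lld\n", (long long)(n_mel * n_ctx), (long long) n_ctx, 1LL);
    return 0;
}
```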

examples/addon.node/__test__/whisper.spec.js

@@ -11,7 +11,6 @@ const whisperParamsMock = {
  language: "en",
  model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
- use_gpu: true,
  };
  describe("Run whisper.node", () => {

examples/addon.node/addon.cpp

@@ -36,7 +36,6 @@ struct whisper_params {
  bool print_colors = false;
  bool print_progress = false;
  bool no_timestamps = false;
- bool use_gpu = true;
  std::string language = "en";
  std::string prompt;

@@ -154,9 +153,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
  // whisper init
- struct whisper_context_params cparams;
- cparams.use_gpu = params.use_gpu;
- struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+ struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
  if (ctx == nullptr) {
  fprintf(stderr, "error: failed to initialize whisper context\n");

@@ -318,12 +315,10 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
  std::string language = whisper_params.Get("language").As<Napi::String>();
  std::string model = whisper_params.Get("model").As<Napi::String>();
  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
- bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
  params.language = language;
  params.model = model;
  params.fname_inp.emplace_back(input);
- params.use_gpu = use_gpu;
  Napi::Function callback = info[1].As<Napi::Function>();
  Worker* worker = new Worker(callback, params);

examples/addon.node/index.js

@@ -11,7 +11,6 @@ const whisperParams = {
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: "../../samples/jfk.wav",
- use_gpu: true,
  };
  const arguments = process.argv.slice(2);

examples/bench.wasm/emscripten.cpp

@@ -23,9 +23,7 @@ void bench_main(size_t index) {
  fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
- const int n_mels = whisper_model_n_mels(ctx);
- if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
+ if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
  fprintf(stderr, "error: failed to set mel: %d\n", ret);
  return;
  }

@@ -59,7 +57,7 @@ EMSCRIPTEN_BINDINGS(bench) {
  emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
  for (size_t i = 0; i < g_contexts.size(); ++i) {
  if (g_contexts[i] == nullptr) {
- g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
+ g_contexts[i] = whisper_init_from_file(path_model.c_str());
  if (g_contexts[i] != nullptr) {
  if (g_worker.joinable()) {
  g_worker.join();

examples/bench/bench.cpp

@@ -11,8 +11,6 @@ struct whisper_params {
  int32_t what = 0; // what to benchmark: 0 - whisper ecoder, 1 - memcpy, 2 - ggml_mul_mat
  std::string model = "models/ggml-base.en.bin";
- bool use_gpu = true;
  };
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

@@ -25,10 +23,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
  whisper_print_usage(argc, argv, params);
  exit(0);
  }
  else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
  else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
- else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
  else {
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
  whisper_print_usage(argc, argv, params);

@@ -48,7 +45,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
  fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
  fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
- fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
  fprintf(stderr, " %-7s 0 - whisper\n", "");
  fprintf(stderr, " %-7s 1 - memcpy\n", "");
  fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");

@@ -58,10 +54,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
  int whisper_bench_full(const whisper_params & params) {
  // whisper init
- struct whisper_context_params cparams;
- cparams.use_gpu = params.use_gpu;
- struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+ struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
  {
  fprintf(stderr, "\n");

@@ -73,9 +66,7 @@ int whisper_bench_full(const whisper_params & params) {
  return 2;
  }
- const int n_mels = whisper_model_n_mels(ctx);
- if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
+ if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
  fprintf(stderr, "error: failed to set mel: %d\n", ret);
  return 3;
  }

examples/command.wasm/emscripten.cpp

@@ -243,7 +243,7 @@ EMSCRIPTEN_BINDINGS(command) {
  emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
  for (size_t i = 0; i < g_contexts.size(); ++i) {
  if (g_contexts[i] == nullptr) {
- g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
+ g_contexts[i] = whisper_init_from_file(path_model.c_str());
  if (g_contexts[i] != nullptr) {
  g_running = true;
  if (g_worker.joinable()) {

examples/command/command.cpp

@@ -38,7 +38,6 @@ struct whisper_params {
  bool print_special = false;
  bool print_energy = false;
  bool no_timestamps = true;
- bool use_gpu = true;
  std::string language = "en";
  std::string model = "models/ggml-base.en.bin";

@@ -69,7 +68,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
- else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
  else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }

@@ -103,7 +101,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
- fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
  fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());

@@ -613,10 +610,7 @@ int main(int argc, char ** argv) {
  // whisper init
- struct whisper_context_params cparams;
- cparams.use_gpu = params.use_gpu;
- struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+ struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
  // print some info about the processing
  {

examples/common.h

@@ -181,7 +181,7 @@ private:
  // It is assumed that PCM data is normalized to a range from -1 to 1
  bool write_audio(const float * data, size_t length) {
  for (size_t i = 0; i < length; ++i) {
- const int16_t intSample = data[i] * 32767;
+ const auto intSample = static_cast<const int16_t>(data[i] * 32767);
  file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
  dataSize += sizeof(int16_t);
  }
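
Both sides of the wav_writer hunk perform the same float-to-int16 PCM conversion; the static_cast form only makes the narrowing conversion explicit. A standalone sketch of that conversion (the helper name is illustrative):

```c
#include <stdint.h>
#include <stdio.h>

// Scale normalized float PCM in [-1, 1] to 16-bit PCM, as in write_audio() above.
static int16_t to_i16(float s) {
    return (int16_t)(s * 32767.0f);
}

int main(void) {
    printf("%d %d %d\n", to_i16(1.0f), to_i16(0.0f), to_i16(-1.0f)); // 32767 0 -32767
    return 0;
}
```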

examples/livestream.sh

@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
  fi
  # Whisper models
- models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+ models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
  # list available models
  function list_models {

examples/lsp/lsp.cpp

@@ -30,7 +30,6 @@ struct whisper_params {
  bool translate = false;
  bool print_special = false;
  bool print_energy = false;
- bool use_gpu = true;
  std::string language = "en";
  std::string model = "models/ggml-base.en.bin";

@@ -73,7 +72,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
  else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
- else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
  else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
  else {

@@ -104,7 +102,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
  fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
- fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
  fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
  fprintf(stderr, "\n");

@@ -435,9 +432,7 @@ int main(int argc, char ** argv) {
  }
  // whisper init
- struct whisper_context_params cparams;
- cparams.use_gpu = params.use_gpu;
- struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+ struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
  // init audio
  audio_async audio(30*1000);

examples/main/main.cpp

@@ -90,7 +90,6 @@ struct whisper_params {
  bool print_progress = false;
  bool no_timestamps = false;
  bool log_score = false;
- bool use_gpu = true;
  std::string language = "en";
  std::string prompt;

@@ -166,7 +165,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
  else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
  else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
  else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
- else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
  else {
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
  whisper_print_usage(argc, argv, params);

@@ -223,7 +221,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
  fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
  fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
- fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
  fprintf(stderr, "\n");
  }

@@ -880,10 +877,7 @@ int main(int argc, char ** argv) {
  // whisper init
- struct whisper_context_params cparams;
- cparams.use_gpu = params.use_gpu;
- struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+ struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
  if (ctx == nullptr) {
  fprintf(stderr, "error: failed to initialize whisper context\n");

examples/stream.wasm/emscripten.cpp

@@ -132,7 +132,7 @@ EMSCRIPTEN_BINDINGS(stream) {
  emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
  for (size_t i = 0; i < g_contexts.size(); ++i) {
  if (g_contexts[i] == nullptr) {
- g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
+ g_contexts[i] = whisper_init_from_file(path_model.c_str());
  if (g_contexts[i] != nullptr) {
  g_running = true;
  if (g_worker.joinable()) {

examples/stream/stream.cpp

@@ -48,12 +48,11 @@ struct whisper_params {
  bool no_context = true;
  bool no_timestamps = false;
  bool tinydiarize = false;
- bool save_audio = false; // save audio to wav file
- bool use_gpu = true;
  std::string language = "en";
  std::string model = "models/ggml-base.en.bin";
  std::string fname_out;
+ bool save_audio = false; // save audio to wav file
  };

@@ -66,26 +65,25 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
  whisper_print_usage(argc, argv, params);
  exit(0);
  }
  else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
  else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
  else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
  else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
  else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
  else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
  else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
  else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
  else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
  else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
  else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
  else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
  else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
  else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
  else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
- else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
  else {
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());

@@ -120,9 +118,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
  fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
  fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
  fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
- fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
  fprintf(stderr, "\n");
  }

@@ -166,10 +163,7 @@ int main(int argc, char ** argv) {
  exit(0);
  }
- struct whisper_context_params cparams;
- cparams.use_gpu = params.use_gpu;
- struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+ struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
  std::vector<float> pcmf32 (n_samples_30s, 0.0f);
  std::vector<float> pcmf32_old;

examples/talk-llama/CMakeLists.txt

@@ -14,8 +14,6 @@ if (WHISPER_SDL2)
  ../common-sdl.cpp
  ../../ggml.c
  ../../ggml-alloc.c
- ../../ggml-backend.c
- ../../ggml-quants.c
  ../../whisper.cpp)
  target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)

View File

@ -63,7 +63,6 @@ struct whisper_params {
bool print_energy = false; bool print_energy = false;
bool no_timestamps = true; bool no_timestamps = true;
bool verbose_prompt = false; bool verbose_prompt = false;
bool use_gpu = true;
std::string person = "Georgi"; std::string person = "Georgi";
std::string language = "en"; std::string language = "en";
@ -85,26 +84,25 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
exit(0); exit(0);
} }
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); } else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); } else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); } else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-vp" || arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; } else if (arg == "--session") { params.path_session = argv[++i];}
else if (arg == "--session") { params.path_session = argv[++i];} else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; } else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } else if (arg == "--prompt-file") {
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]); std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt)); std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') { if (params.prompt.back() == '\n') {
@ -112,7 +110,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
} }
} }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -128,28 +125,27 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "options:\n"); fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n"); fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads); fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms); fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id); fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens); fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -vp, --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false"); fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str()); fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str()); fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str()); fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", ""); fprintf(stderr, " --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n"); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -248,7 +244,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) { if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str()); fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
exit(0); exit(0);
@ -256,19 +252,13 @@ int main(int argc, char ** argv) {
// whisper init // whisper init
struct whisper_context_params cparams; struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
// llama init // llama init
llama_backend_init(true); llama_backend_init(true);
auto lmparams = llama_model_default_params(); auto lmparams = llama_model_default_params();
if (!params.use_gpu) {
lmparams.n_gpu_layers = 0;
}
struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams); struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);
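The hunk above swaps the one-argument whisper_init_from_file for the params-struct entry point and threads use_gpu through both whisper and llama. A minimal sketch of the new call shape, assuming the whisper.h/llama.h in this tree (init_whisper and init_llama are hypothetical helper names):

#include "whisper.h"
#include "llama.h"

// Sketch: initialize both models while honoring a --no-gpu style flag.
static struct whisper_context * init_whisper(const char * path, bool use_gpu) {
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = use_gpu;                    // false -> CPU-only inference
    return whisper_init_from_file_with_params(path, cparams);
}

static struct llama_model * init_llama(const char * path, bool use_gpu) {
    struct llama_model_params lmparams = llama_model_default_params();
    if (!use_gpu) {
        lmparams.n_gpu_layers = 0;                // keep every layer on the CPU
    }
    return llama_load_model_from_file(path, lmparams);
}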

View File

@ -271,7 +271,7 @@ EMSCRIPTEN_BINDINGS(talk) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) { for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) { if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); g_contexts[i] = whisper_init_from_file(path_model.c_str());
if (g_contexts[i] != nullptr) { if (g_contexts[i] != nullptr) {
g_running = true; g_running = true;
if (g_worker.joinable()) { if (g_worker.joinable()) {

View File

@ -121,13 +121,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
return false; return false;
} }
char word[129]; std::string word;
for (int i = 0; i < n_vocab; i++) { for (int i = 0; i < n_vocab; i++) {
uint32_t len; uint32_t len;
fin.read((char *) &len, sizeof(len)); fin.read((char *) &len, sizeof(len));
word[len] = '\0';
fin.read((char *) word, len); word.resize(len);
fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word; vocab.id_to_token[i] = word;
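The std::string form in the right-hand column is the bounds-safe way to read these length-prefixed vocab entries; the fixed char word[129] on the left overflows whenever a token is longer than 128 bytes. A self-contained sketch of the safe read (read_token is a hypothetical helper name):

#include <cstdint>
#include <fstream>
#include <string>

// Read one length-prefixed string: a u32 byte count, then the raw bytes.
static bool read_token(std::ifstream & fin, std::string & word) {
    uint32_t len = 0;
    fin.read((char *) &len, sizeof(len));
    if (!fin) {
        return false;
    }
    word.resize(len);                 // no fixed-size buffer to overflow
    fin.read(&word[0], len);
    return static_cast<bool>(fin);
}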

View File

@ -31,7 +31,6 @@ struct whisper_params {
bool print_special = false; bool print_special = false;
bool print_energy = false; bool print_energy = false;
bool no_timestamps = true; bool no_timestamps = true;
bool use_gpu = true;
std::string person = "Santa"; std::string person = "Santa";
std::string language = "en"; std::string language = "en";
@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; } else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str()); fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
@ -184,10 +181,8 @@ int main(int argc, char ** argv) {
} }
// whisper init // whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams); struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
// gpt init // gpt init

View File

@ -21,7 +21,7 @@ help()
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]" echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
echo "options:" echo "options:"
echo "-s Step in seconds (default is $step)." echo "-s Step in seconds (default is $step)."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')." echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
echo "-t Number of threads to use." echo "-t Number of threads to use."
echo "-h Print this help page." echo "-h Print this help page."
echo echo

View File

@ -18,7 +18,9 @@ android {
vectorDrawables { vectorDrawables {
useSupportLibrary true useSupportLibrary true
} }
ndk {
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
}
} }
buildTypes { buildTypes {
@ -41,10 +43,20 @@ android {
composeOptions { composeOptions {
kotlinCompilerExtensionVersion '1.5.0' kotlinCompilerExtensionVersion '1.5.0'
} }
ndkVersion "25.2.9519653"
externalNativeBuild {
cmake {
path = file("src/main/jni/whisper/CMakeLists.txt")
}
}
packagingOptions {
resources {
excludes += '/META-INF/{AL2.0,LGPL2.1}'
}
}
} }
dependencies { dependencies {
implementation project(':lib')
implementation 'androidx.activity:activity-compose:1.7.2' implementation 'androidx.activity:activity-compose:1.7.2'
implementation 'androidx.compose.material:material-icons-core:1.5.0' implementation 'androidx.compose.material:material-icons-core:1.5.0'
implementation 'androidx.compose.material3:material3:1.1.1' implementation 'androidx.compose.material3:material3:1.1.1'

View File

@ -15,7 +15,7 @@ import androidx.lifecycle.viewmodel.initializer
import androidx.lifecycle.viewmodel.viewModelFactory import androidx.lifecycle.viewmodel.viewModelFactory
import com.whispercppdemo.media.decodeWaveFile import com.whispercppdemo.media.decodeWaveFile
import com.whispercppdemo.recorder.Recorder import com.whispercppdemo.recorder.Recorder
import com.whispercpp.whisper.WhisperContext import com.whispercppdemo.whisper.WhisperContext
import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking import kotlinx.coroutines.runBlocking
@ -35,7 +35,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
private val modelsPath = File(application.filesDir, "models") private val modelsPath = File(application.filesDir, "models")
private val samplesPath = File(application.filesDir, "samples") private val samplesPath = File(application.filesDir, "samples")
private var recorder: Recorder = Recorder() private var recorder: Recorder = Recorder()
private var whisperContext: com.whispercpp.whisper.WhisperContext? = null private var whisperContext: WhisperContext? = null
private var mediaPlayer: MediaPlayer? = null private var mediaPlayer: MediaPlayer? = null
private var recordedFile: File? = null private var recordedFile: File? = null
@ -47,7 +47,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
} }
private suspend fun printSystemInfo() { private suspend fun printSystemInfo() {
printMessage(String.format("System Info: %s\n", com.whispercpp.whisper.WhisperContext.getSystemInfo())) printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()))
} }
private suspend fun loadData() { private suspend fun loadData() {
@ -78,7 +78,7 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
printMessage("Loading model...\n") printMessage("Loading model...\n")
val models = application.assets.list("models/") val models = application.assets.list("models/")
if (models != null) { if (models != null) {
whisperContext = com.whispercpp.whisper.WhisperContext.createContextFromAsset(application.assets, "models/" + models[0]) whisperContext = WhisperContext.createContextFromAsset(application.assets, "models/" + models[0])
printMessage("Loaded model ${models[0]}.\n") printMessage("Loaded model ${models[0]}.\n")
} }

View File

@ -1,4 +1,4 @@
package com.whispercpp.whisper package com.whispercppdemo.whisper
import android.content.res.AssetManager import android.content.res.AssetManager
import android.os.Build import android.os.Build

View File

@ -1,4 +1,4 @@
package com.whispercpp.whisper package com.whispercppdemo.whisper
import android.util.Log import android.util.Log
import java.io.BufferedReader import java.io.BufferedReader

View File

@ -127,11 +127,11 @@ static struct whisper_context *whisper_init_from_asset(
.close = &asset_close .close = &asset_close
}; };
return whisper_init_with_params(&loader, whisper_context_default_params()); return whisper_init(&loader);
} }
JNIEXPORT jlong JNICALL JNIEXPORT jlong JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_initContextFromAsset( Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromAsset(
JNIEnv *env, jobject thiz, jobject assetManager, jstring asset_path_str) { JNIEnv *env, jobject thiz, jobject assetManager, jstring asset_path_str) {
UNUSED(thiz); UNUSED(thiz);
struct whisper_context *context = NULL; struct whisper_context *context = NULL;
@ -142,18 +142,18 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_initContextFromAsset(
} }
JNIEXPORT jlong JNICALL JNIEXPORT jlong JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_initContext( Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext(
JNIEnv *env, jobject thiz, jstring model_path_str) { JNIEnv *env, jobject thiz, jstring model_path_str) {
UNUSED(thiz); UNUSED(thiz);
struct whisper_context *context = NULL; struct whisper_context *context = NULL;
const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL); const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL);
context = whisper_init_from_file_with_params(model_path_chars, whisper_context_default_params()); context = whisper_init_from_file(model_path_chars);
(*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars); (*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars);
return (jlong) context; return (jlong) context;
} }
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_freeContext( Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(
JNIEnv *env, jobject thiz, jlong context_ptr) { JNIEnv *env, jobject thiz, jlong context_ptr) {
UNUSED(env); UNUSED(env);
UNUSED(thiz); UNUSED(thiz);
@ -162,7 +162,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_freeContext(
} }
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_fullTranscribe( Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
JNIEnv *env, jobject thiz, jlong context_ptr, jint num_threads, jfloatArray audio_data) { JNIEnv *env, jobject thiz, jlong context_ptr, jint num_threads, jfloatArray audio_data) {
UNUSED(thiz); UNUSED(thiz);
struct whisper_context *context = (struct whisper_context *) context_ptr; struct whisper_context *context = (struct whisper_context *) context_ptr;
@ -194,7 +194,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_fullTranscribe(
} }
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegmentCount( Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegmentCount(
JNIEnv *env, jobject thiz, jlong context_ptr) { JNIEnv *env, jobject thiz, jlong context_ptr) {
UNUSED(env); UNUSED(env);
UNUSED(thiz); UNUSED(thiz);
@ -203,7 +203,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegmentCount(
} }
JNIEXPORT jstring JNICALL JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegment( Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
JNIEnv *env, jobject thiz, jlong context_ptr, jint index) { JNIEnv *env, jobject thiz, jlong context_ptr, jint index) {
UNUSED(thiz); UNUSED(thiz);
struct whisper_context *context = (struct whisper_context *) context_ptr; struct whisper_context *context = (struct whisper_context *) context_ptr;
@ -213,7 +213,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_getTextSegment(
} }
JNIEXPORT jstring JNICALL JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_getSystemInfo( Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
JNIEnv *env, jobject thiz JNIEnv *env, jobject thiz
) { ) {
UNUSED(thiz); UNUSED(thiz);
@ -223,7 +223,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_getSystemInfo(
} }
JNIEXPORT jstring JNICALL JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz, Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
jint n_threads) { jint n_threads) {
UNUSED(thiz); UNUSED(thiz);
const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads); const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
@ -231,7 +231,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
} }
JNIEXPORT jstring JNICALL JNIEXPORT jstring JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz, Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
jint n_threads) { jint n_threads) {
UNUSED(thiz); UNUSED(thiz);
const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads); const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
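Every rename in this file tracks the Kotlin package move (com.whispercppdemo.whisper to com.whispercpp.whisper): JNI symbol names encode the package, the class, and Kotlin's Companion inner class, with the '$' separator escaped as _00024 ('$' is U+0024). A minimal sketch of the mapping, with ping as a hypothetical method:

#include <jni.h>

// Kotlin side (for reference):
//   package com.whispercpp.whisper
//   class WhisperLib { companion object { external fun ping(): Int } }
//
// Matching native symbol: Java_<package>_<class>_00024Companion_<method>
JNIEXPORT jint JNICALL
Java_com_whispercpp_whisper_WhisperLib_00024Companion_ping(JNIEnv * env, jobject thiz) {
    (void) env;
    (void) thiz;
    return 42; // placeholder result
}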

View File

@ -1 +0,0 @@
/build

View File

@ -1,51 +0,0 @@
plugins {
id 'com.android.library'
id 'org.jetbrains.kotlin.android'
}
android {
namespace 'com.whispercpp'
compileSdk 34
defaultConfig {
minSdk 26
targetSdk 34
versionCode 1
versionName "1.0"
ndk {
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
}
}
buildTypes {
release {
minifyEnabled false
}
}
compileOptions {
sourceCompatibility JavaVersion.VERSION_1_8
targetCompatibility JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = '1.8'
}
ndkVersion "25.2.9519653"
externalNativeBuild {
cmake {
path = file("src/main/jni/whisper/CMakeLists.txt")
}
}
packagingOptions {
resources {
excludes += '/META-INF/{AL2.0,LGPL2.1}'
}
}
}
dependencies {
implementation 'androidx.core:core-ktx:1.9.0'
implementation 'androidx.appcompat:appcompat:1.6.1'
implementation 'com.google.android.material:material:1.8.0'
}

View File

@ -1,4 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View File

@ -14,4 +14,3 @@ dependencyResolutionManagement {
} }
rootProject.name = "WhisperCppDemo" rootProject.name = "WhisperCppDemo"
include ':app' include ':app'
include ':lib'

View File

@ -17,8 +17,8 @@
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; }; 18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; }; 18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; }; 18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL"; }; }; 18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML"; }; };
18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL"; }; }; 18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE"; }; };
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; }; 18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; }; 18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; }; 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };

View File

@ -61,13 +61,7 @@ void AudioInputCallback(void * inUserData,
NSLog(@"Loading model from %@", modelPath); NSLog(@"Loading model from %@", modelPath);
// create ggml context // create ggml context
stateInp.ctx = whisper_init_from_file([modelPath UTF8String]);
struct whisper_context_params cparams = whisper_context_default_params();
#if TARGET_OS_SIMULATOR
cparams.use_gpu = false;
NSLog(@"Running on simulator, using CPU");
#endif
stateInp.ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);
// check if the model was loaded successfully // check if the model was loaded successfully
if (stateInp.ctx == NULL) { if (stateInp.ctx == NULL) {

View File

@ -1,5 +1,4 @@
import Foundation import Foundation
import whisper
enum WhisperError: Error { enum WhisperError: Error {
case couldNotInitializeContext case couldNotInitializeContext
@ -56,12 +55,7 @@ actor WhisperContext {
} }
static func createContext(path: String) throws -> WhisperContext { static func createContext(path: String) throws -> WhisperContext {
var params = whisper_context_default_params() let context = whisper_init_from_file(path)
#if targetEnvironment(simulator)
params.use_gpu = false
print("Running on the simulator, using CPU")
#endif
let context = whisper_init_from_file_with_params(path, params)
if let context { if let context {
return WhisperContext(context: context) return WhisperContext(context: context)
} else { } else {

View File

@ -0,0 +1,4 @@
//
// Use this file to import your target's public headers that you would like to expose to Swift.
//
#import "whisper.h"

View File

@ -15,9 +15,14 @@
0AAC5D9B29539CCF003032C3 /* WhisperCppDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9A29539CCF003032C3 /* WhisperCppDemoApp.swift */; }; 0AAC5D9B29539CCF003032C3 /* WhisperCppDemoApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9A29539CCF003032C3 /* WhisperCppDemoApp.swift */; };
0AAC5D9D29539CCF003032C3 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9C29539CCF003032C3 /* ContentView.swift */; }; 0AAC5D9D29539CCF003032C3 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9C29539CCF003032C3 /* ContentView.swift */; };
0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; }; 0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; };
0AAC5DA329539CD0003032C3 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5DA229539CD0003032C3 /* Preview Assets.xcassets */; };
0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC729539EB0003032C3 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-Wno-shorten-64-to-32"; }; };
0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DC929539EB0003032C3 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -Wno-shorten-64-to-32"; }; };
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; }; 0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; };
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; }; 0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; };
E3F92DC52AFA8E3800A6A9D4 /* whisper in Frameworks */ = {isa = PBXBuildFile; productRef = E3F92DC42AFA8E3800A6A9D4 /* whisper */; }; 18ABE1522AF555FA0044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE14C2AF555FA0044A204 /* ggml-backend.c */; };
18ABE1532AF555FA0044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1512AF555FA0044A204 /* ggml-quants.c */; };
18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 18AED47F2AB21F2B009D854F /* ggml-alloc.c */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
@ -31,9 +36,22 @@
0AAC5D9C29539CCF003032C3 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; }; 0AAC5D9C29539CCF003032C3 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
0AAC5D9E29539CD0003032C3 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; }; 0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; };
0AAC5DA229539CD0003032C3 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
0AAC5DC629539EAF003032C3 /* WhisperCppDemo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "WhisperCppDemo-Bridging-Header.h"; sourceTree = "<group>"; };
0AAC5DC729539EB0003032C3 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whisper.cpp; sourceTree = "<group>"; };
0AAC5DC829539EB0003032C3 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whisper.h; sourceTree = "<group>"; };
0AAC5DC929539EB0003032C3 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = ggml.c; sourceTree = "<group>"; };
0AAC5DCA29539EB0003032C3 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ggml.h; sourceTree = "<group>"; };
0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; }; 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; };
0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; }; 0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; };
E3F92DC22AFA8DD800A6A9D4 /* whisper.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = whisper.cpp; path = ../..; sourceTree = "<group>"; }; 18ABE14C2AF555FA0044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "ggml-backend.c"; sourceTree = "<group>"; };
18ABE14D2AF555FA0044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-backend.h"; sourceTree = "<group>"; };
18ABE14E2AF555FA0044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-backend-impl.h"; sourceTree = "<group>"; };
18ABE14F2AF555FA0044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-quants.h"; sourceTree = "<group>"; };
18ABE1502AF555FA0044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-impl.h"; sourceTree = "<group>"; };
18ABE1512AF555FA0044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "ggml-quants.c"; sourceTree = "<group>"; };
18AED47F2AB21F2B009D854F /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "ggml-alloc.c"; sourceTree = "<group>"; };
18AED4802AB21F2B009D854F /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "ggml-alloc.h"; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */ /* Begin PBXFrameworksBuildPhase section */
@ -41,7 +59,6 @@
isa = PBXFrameworksBuildPhase; isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
E3F92DC52AFA8E3800A6A9D4 /* whisper in Frameworks */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;
}; };
@ -77,12 +94,11 @@
0AAC5D8E29539CCF003032C3 = { 0AAC5D8E29539CCF003032C3 = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
E3F92DC22AFA8DD800A6A9D4 /* whisper.cpp */,
0A8E48FF2954B3F100704C1B /* README.md */, 0A8E48FF2954B3F100704C1B /* README.md */,
0AAC5DC529539E89003032C3 /* whisper.cpp */,
0AAC5DCF2953A36C003032C3 /* whisper.cpp.swift */, 0AAC5DCF2953A36C003032C3 /* whisper.cpp.swift */,
0AAC5D9929539CCF003032C3 /* whisper.swiftui.demo */, 0AAC5D9929539CCF003032C3 /* whisper.swiftui.demo */,
0AAC5D9829539CCF003032C3 /* Products */, 0AAC5D9829539CCF003032C3 /* Products */,
E3F92DC32AFA8E3800A6A9D4 /* Frameworks */,
); );
sourceTree = "<group>"; sourceTree = "<group>";
}; };
@ -107,9 +123,39 @@
path = whisper.swiftui.demo; path = whisper.swiftui.demo;
sourceTree = "<group>"; sourceTree = "<group>";
}; };
0AAC5DA129539CD0003032C3 /* Preview Content */ = {
isa = PBXGroup;
children = (
0AAC5DA229539CD0003032C3 /* Preview Assets.xcassets */,
);
name = "Preview Content";
path = "../Preview Content";
sourceTree = "<group>";
};
0AAC5DC529539E89003032C3 /* whisper.cpp */ = {
isa = PBXGroup;
children = (
18ABE14E2AF555FA0044A204 /* ggml-backend-impl.h */,
18ABE14C2AF555FA0044A204 /* ggml-backend.c */,
18ABE14D2AF555FA0044A204 /* ggml-backend.h */,
18ABE1502AF555FA0044A204 /* ggml-impl.h */,
18ABE1512AF555FA0044A204 /* ggml-quants.c */,
18ABE14F2AF555FA0044A204 /* ggml-quants.h */,
18AED47F2AB21F2B009D854F /* ggml-alloc.c */,
18AED4802AB21F2B009D854F /* ggml-alloc.h */,
0AAC5DC929539EB0003032C3 /* ggml.c */,
0AAC5DCA29539EB0003032C3 /* ggml.h */,
0AAC5DC729539EB0003032C3 /* whisper.cpp */,
0AAC5DC829539EB0003032C3 /* whisper.h */,
);
name = whisper.cpp;
path = ../..;
sourceTree = "<group>";
};
0AAC5DCF2953A36C003032C3 /* whisper.cpp.swift */ = { 0AAC5DCF2953A36C003032C3 /* whisper.cpp.swift */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
0AAC5DC629539EAF003032C3 /* WhisperCppDemo-Bridging-Header.h */,
0AAC5DD02953A394003032C3 /* LibWhisper.swift */, 0AAC5DD02953A394003032C3 /* LibWhisper.swift */,
); );
path = whisper.cpp.swift; path = whisper.cpp.swift;
@ -128,17 +174,11 @@
children = ( children = (
0AAC5D9E29539CD0003032C3 /* Assets.xcassets */, 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */,
0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */, 0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */,
0AAC5DA129539CD0003032C3 /* Preview Content */,
); );
path = "Supporting files"; path = "Supporting files";
sourceTree = "<group>"; sourceTree = "<group>";
}; };
E3F92DC32AFA8E3800A6A9D4 /* Frameworks */ = {
isa = PBXGroup;
children = (
);
name = Frameworks;
sourceTree = "<group>";
};
/* End PBXGroup section */ /* End PBXGroup section */
/* Begin PBXNativeTarget section */ /* Begin PBXNativeTarget section */
@ -155,9 +195,6 @@
dependencies = ( dependencies = (
); );
name = whisper.swiftui; name = whisper.swiftui;
packageProductDependencies = (
E3F92DC42AFA8E3800A6A9D4 /* whisper */,
);
productName = WhisperCppDemo; productName = WhisperCppDemo;
productReference = 0AAC5D9729539CCF003032C3 /* whisper.swiftui.app */; productReference = 0AAC5D9729539CCF003032C3 /* whisper.swiftui.app */;
productType = "com.apple.product-type.application"; productType = "com.apple.product-type.application";
@ -202,6 +239,7 @@
buildActionMask = 2147483647; buildActionMask = 2147483647;
files = ( files = (
0AA751482953AC2E001EE061 /* samples in Resources */, 0AA751482953AC2E001EE061 /* samples in Resources */,
0AAC5DA329539CD0003032C3 /* Preview Assets.xcassets in Resources */,
0A8E49002954B3F100704C1B /* README.md in Resources */, 0A8E49002954B3F100704C1B /* README.md in Resources */,
0AA751492953AC2E001EE061 /* models in Resources */, 0AA751492953AC2E001EE061 /* models in Resources */,
0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */, 0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */,
@ -217,10 +255,15 @@
files = ( files = (
0AAC5D9D29539CCF003032C3 /* ContentView.swift in Sources */, 0AAC5D9D29539CCF003032C3 /* ContentView.swift in Sources */,
0AAC5D9B29539CCF003032C3 /* WhisperCppDemoApp.swift in Sources */, 0AAC5D9B29539CCF003032C3 /* WhisperCppDemoApp.swift in Sources */,
0AAC5DCC29539EB1003032C3 /* ggml.c in Sources */,
18ABE1532AF555FA0044A204 /* ggml-quants.c in Sources */,
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */, 0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */,
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */, 0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */,
0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */, 0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */,
0AAC5DCB29539EB1003032C3 /* whisper.cpp in Sources */,
0AA7514E2953D958001EE061 /* Recorder.swift in Sources */, 0AA7514E2953D958001EE061 /* Recorder.swift in Sources */,
18AED4812AB21F2B009D854F /* ggml-alloc.c in Sources */,
18ABE1522AF555FA0044A204 /* ggml-backend.c in Sources */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;
}; };
@ -348,7 +391,7 @@
CODE_SIGN_STYLE = Automatic; CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1; CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"whisper.swiftui.demo/Supporting files/Preview Content\""; DEVELOPMENT_ASSET_PATHS = "\"whisper.swiftui.demo/Supporting files/Preview Content\"";
DEVELOPMENT_TEAM = ""; DEVELOPMENT_TEAM = P8JZH34X63;
ENABLE_HARDENED_RUNTIME = YES; ENABLE_HARDENED_RUNTIME = YES;
ENABLE_PREVIEWS = YES; ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES; GENERATE_INFOPLIST_FILE = YES;
@ -372,6 +415,7 @@
SDKROOT = auto; SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "whisper.cpp.swift/WhisperCppDemo-Bridging-Header.h";
SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0; SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2"; TARGETED_DEVICE_FAMILY = "1,2";
@ -388,7 +432,7 @@
CODE_SIGN_STYLE = Automatic; CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1; CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_ASSET_PATHS = "\"whisper.swiftui.demo/Supporting files/Preview Content\""; DEVELOPMENT_ASSET_PATHS = "\"whisper.swiftui.demo/Supporting files/Preview Content\"";
DEVELOPMENT_TEAM = ""; DEVELOPMENT_TEAM = P8JZH34X63;
ENABLE_HARDENED_RUNTIME = YES; ENABLE_HARDENED_RUNTIME = YES;
ENABLE_PREVIEWS = YES; ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES; GENERATE_INFOPLIST_FILE = YES;
@ -417,6 +461,7 @@
SDKROOT = auto; SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OBJC_BRIDGING_HEADER = "whisper.cpp.swift/WhisperCppDemo-Bridging-Header.h";
SWIFT_VERSION = 5.0; SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2"; TARGETED_DEVICE_FAMILY = "1,2";
}; };
@ -444,13 +489,6 @@
defaultConfigurationName = Release; defaultConfigurationName = Release;
}; };
/* End XCConfigurationList section */ /* End XCConfigurationList section */
/* Begin XCSwiftPackageProductDependency section */
E3F92DC42AFA8E3800A6A9D4 /* whisper */ = {
isa = XCSwiftPackageProductDependency;
productName = whisper;
};
/* End XCSwiftPackageProductDependency section */
}; };
rootObject = 0AAC5D8F29539CCF003032C3 /* Project object */; rootObject = 0AAC5D8F29539CCF003032C3 /* Project object */;
} }

View File

@ -24,7 +24,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
for (size_t i = 0; i < g_contexts.size(); ++i) { for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) { if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params()); g_contexts[i] = whisper_init_from_file(path_model.c_str());
if (g_contexts[i] != nullptr) { if (g_contexts[i] != nullptr) {
return i + 1; return i + 1;
} else { } else {

View File

@ -18,11 +18,11 @@ else
fi fi
models=( \ models=( \
"tiny" "tiny-q4_0" "tiny-q4_1" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \ "tiny" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
"base" "base-q4_0" "base-q4_1" "base-q5_0" "base-q5_1" "base-q8_0" \ "base" "base-q5_0" "base-q5_1" "base-q8_0" \
"small" "small-q4_0" "small-q4_1" "small-q5_0" "small-q5_1" "small-q8_0" \ "small" "small-q5_0" "small-q5_1" "small-q8_0" \
"medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" \ "medium" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
"large" "large-q4_0" "large-q4_1" "large-q5_0" "large-q5_1" "large-q8_0" \ "large" "large-q5_0" "large-q5_1" "large-q8_0" \
) )
if [ "$encoder_only" -eq 0 ]; then if [ "$encoder_only" -eq 0 ]; then
@ -83,10 +83,6 @@ for model in "${models[@]}"; do
config="$config COREML" config="$config COREML"
fi fi
if [[ $system_info == *"CUDA = 1"* ]]; then
config="$config CUDA"
fi
if [[ $system_info == *"METAL = 1"* ]]; then if [[ $system_info == *"METAL = 1"* ]]; then
config="$config METAL" config="$config METAL"
fi fi

View File

@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" ) models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
for model in "${models[@]}"; do for model in "${models[@]}"; do
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/ python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/

View File

@ -4476,13 +4476,6 @@ static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
*dsti = __float2half(*xi); *dsti = __float2half(*xi);
} }
static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
const half * xi = (const half *) cxi;
half * dsti = (half *) cdsti;
*dsti = *xi;
}
template <cpy_kernel_t cpy_1> template <cpy_kernel_t cpy_1>
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@ -4736,25 +4729,6 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
} }
static __global__ void im2col_f32_f16(
const float * x, half * dst,
int ofs0, int ofs1, int IW, int IH, int CHW,
int s0, int s1, int p0, int p1, int d0, int d1) {
const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
const int offset_dst =
(threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
(blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
const int offset_src = threadIdx.x * ofs0 + blockIdx.x * ofs1;
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
} else {
dst[offset_dst] = __float2half(0.0f);
}
}
template<int qk, int qr, dequantize_kernel_t dq> template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
@ -5644,16 +5618,6 @@ static void ggml_cpy_f32_f16_cuda(
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
} }
static void ggml_cpy_f16_f16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
}
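The (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE expression in the launcher above is integer ceiling division: it sizes the grid so that every one of the ne elements is covered by at least one thread. In isolation:

// Smallest number of fixed-size blocks that covers n items (ceiling division).
static inline int ceil_div(int n, int block) {
    return (n + block - 1) / block;
}
// e.g. ceil_div(1000, 256) == 4 -> 1024 threads; kernels typically bounds-check
// the per-thread index so the trailing 24 threads do no work.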
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k); scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
@ -5737,15 +5701,6 @@ static void soft_max_f32_cuda(const float * x, float * dst, const int ncols_x, c
soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x); soft_max_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x);
} }
static void im2col_f32_f16_cuda(const float * x, half * dst,
int OH, int IW, int IH, int OW, int IC,
int KH, int KW, int N, int ofs0, int ofs1,
int s0, int s1, int p0, int p1, int d0, int d1, cudaStream_t stream) {
dim3 block_nums(IC, OH, OW);
dim3 block_dims(N, KH, KW);
im2col_f32_f16<<<block_nums, block_dims, 0, stream>>>(x, dst, ofs0, ofs1, IW, IH, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
}
// buffer pool for cuda // buffer pool for cuda
#define MAX_CUDA_BUFFERS 256 #define MAX_CUDA_BUFFERS 256
@ -6528,7 +6483,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream); src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
} }
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16; const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
size_t dst_f16_as = 0; size_t dst_f16_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream); half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);
@ -6704,45 +6659,6 @@ inline void ggml_cuda_op_alibi(
(void) src1_dd; (void) src1_dd;
} }
inline void ggml_cuda_op_im2col(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
const int64_t N = src1->ne[is_2D ? 3 : 2];
const int64_t IC = src1->ne[is_2D ? 2 : 1];
const int64_t IH = is_2D ? src1->ne[1] : 1;
const int64_t IW = src1->ne[0];
const int64_t KH = is_2D ? src0->ne[1] : 1;
const int64_t KW = src0->ne[0];
const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];
const size_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
const size_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
im2col_f32_f16_cuda(src1_dd, (half*) dst_dd,
OH, IW, IH, OW, IC, KH, KW, N,
ofs0, ofs1, s0, s1, p0, p1, d0, d1, main_stream);
(void) src0;
(void) src0_dd;
}
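The removed ggml_cuda_op_im2col takes OH/OW directly from dst, which the graph builder has already sized with the standard convolution arithmetic; a sketch of that relation (conv_out_size is not a function in this tree):

#include <cstdint>

// Output extent of a convolution over extent I with kernel extent K,
// stride s, padding p, dilation d.
static inline int64_t conv_out_size(int64_t I, int64_t K, int s, int p, int d) {
    return (I + 2*p - d*(K - 1) - 1) / s + 1;
}
// OH = conv_out_size(IH, KH, s1, p1, d1); OW = conv_out_size(IW, KW, s0, p0, d0)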
inline void ggml_cuda_op_diag_mask_inf( inline void ggml_cuda_op_diag_mask_inf(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@ -7633,9 +7549,6 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, main_stream); ne10, ne11, nb10, nb11, nb12, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, main_stream);
} else { } else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(src0->type), ggml_type_name(src1->type));
@ -7667,10 +7580,6 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
} }
void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
}
static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
(void) src0; (void) src0;
(void) src1; (void) src1;
@ -8034,9 +7943,6 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
case GGML_OP_ALIBI: case GGML_OP_ALIBI:
func = ggml_cuda_alibi; func = ggml_cuda_alibi;
break; break;
case GGML_OP_IM2COL:
func = ggml_cuda_im2col;
break;
default: default:
return false; return false;
} }

View File

@ -86,7 +86,6 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(rms_norm);
GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(norm);
GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
@ -115,7 +114,6 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(rope_f32); GGML_METAL_DECL_KERNEL(rope_f32);
GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(rope_f16);
GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(alibi_f32);
GGML_METAL_DECL_KERNEL(im2col_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f32);
GGML_METAL_DECL_KERNEL(cpy_f16_f16); GGML_METAL_DECL_KERNEL(cpy_f16_f16);
@ -289,7 +287,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(rms_norm);
GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(norm);
GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
@ -320,7 +317,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(rope_f32); GGML_METAL_ADD_KERNEL(rope_f32);
GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(rope_f16);
GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(alibi_f32);
GGML_METAL_ADD_KERNEL(im2col_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f32);
GGML_METAL_ADD_KERNEL(cpy_f16_f16); GGML_METAL_ADD_KERNEL(cpy_f16_f16);
@ -390,7 +386,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(rms_norm);
GGML_METAL_DEL_KERNEL(norm); GGML_METAL_DEL_KERNEL(norm);
GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
@ -421,7 +416,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(rope_f32); GGML_METAL_DEL_KERNEL(rope_f32);
GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(rope_f16);
GGML_METAL_DEL_KERNEL(alibi_f32); GGML_METAL_DEL_KERNEL(alibi_f32);
GGML_METAL_DEL_KERNEL(im2col_f16);
GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f16);
GGML_METAL_DEL_KERNEL(cpy_f32_f32); GGML_METAL_DEL_KERNEL(cpy_f32_f32);
GGML_METAL_DEL_KERNEL(cpy_f16_f16); GGML_METAL_DEL_KERNEL(cpy_f16_f16);
@ -1036,7 +1030,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0]; [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
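The GGML_PAD(..., 16) wrapper on the left rounds the threadgroup scratch size up before handing it to the encoder, since Metal expects setThreadgroupMemoryLength to be a multiple of 16 bytes and the unpadded nth/32*sizeof(float) can be as small as 4. A sketch of the rounding, assuming the usual GGML_PAD definition from ggml.h:

#include <cstddef>

// Mirrors: #define GGML_PAD(x, n) (((x) + (n) - 1) / (n) * (n))
static inline size_t pad_to(size_t x, size_t n) {
    return (x + n - 1) / n * n;
}
// e.g. nth == 32: 32/32 * sizeof(float) == 4 bytes -> pad_to(4, 16) == 16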
@ -1145,7 +1139,6 @@ void ggml_metal_graph_compute(
switch (src0t) { switch (src0t) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
GGML_ASSERT(src1t == GGML_TYPE_F32);
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
nrows = 4; nrows = 4;
} break; } break;
@ -1153,18 +1146,13 @@ void ggml_metal_graph_compute(
{ {
nth0 = 32; nth0 = 32;
nth1 = 1; nth1 = 1;
if (src1t == GGML_TYPE_F32) { if (ne11 * ne12 < 4) {
if (ne11 * ne12 < 4) { [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; nrows = ne11;
nrows = ne11;
} else {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
nrows = 4;
}
} else { } else {
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f16]; [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
nrows = 4; nrows = 4;
} }
} break; } break;
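
Because the two columns interleave above, the control flow is hard to read; below is a condensed, self-contained C restatement of the left column's pipeline selection (the enum names are stand-ins for the ctx->pipeline_* objects, not the real API):

// Sketch of the left column: gate the F16 kernels on src1's type.
enum pipeline { MV_F16_F32_1ROW, MV_F16_F32_L4, MV_F16_F32, MV_F16_F16 };

static enum pipeline pick_f16_pipeline(int src1_is_f32, long ne00, long ne01,
                                       long ne11, long ne12, int * nrows) {
    if (src1_is_f32) {
        if (ne11 * ne12 < 4) {
            *nrows = 1;              // nrows keeps its default in the real code
            return MV_F16_F32_1ROW;  // few src1 rows: 1-row kernel
        }
        if (ne00 >= 128 && ne01 >= 8 && ne00 % 4 == 0) {
            *nrows = (int) ne11;     // wide, 4-aligned rows: l4 kernel
            return MV_F16_F32_L4;
        }
        *nrows = 4;
        return MV_F16_F32;
    }
    *nrows = 4;
    return MV_F16_F16;               // the F16 x F16 path removed on the right
}
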
@ -1354,7 +1342,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4]; [encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:GGML_PAD(nth/32*sizeof(float), 16) atIndex:0]; [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
@ -1373,7 +1361,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4]; [encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0);
@ -1476,58 +1464,6 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_IM2COL:
{
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16);
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
const int32_t N = src1->ne[is_2D ? 3 : 2];
const int32_t IC = src1->ne[is_2D ? 2 : 1];
const int32_t IH = is_2D ? src1->ne[1] : 1;
const int32_t IW = src1->ne[0];
const int32_t KH = is_2D ? src0->ne[1] : 1;
const int32_t KW = src0->ne[0];
const int32_t OH = is_2D ? dst->ne[2] : 1;
const int32_t OW = dst->ne[1];
const int32_t CHW = IC * KH * KW;
const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
switch (src0->type) {
case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_im2col_f16]; break;
default: GGML_ASSERT(false);
};
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ofs0 length:sizeof( int32_t) atIndex:2];
[encoder setBytes:&ofs1 length:sizeof( int32_t) atIndex:3];
[encoder setBytes:&IW length:sizeof( int32_t) atIndex:4];
[encoder setBytes:&IH length:sizeof( int32_t) atIndex:5];
[encoder setBytes:&CHW length:sizeof( int32_t) atIndex:6];
[encoder setBytes:&s0 length:sizeof( int32_t) atIndex:7];
[encoder setBytes:&s1 length:sizeof( int32_t) atIndex:8];
[encoder setBytes:&p0 length:sizeof( int32_t) atIndex:9];
[encoder setBytes:&p1 length:sizeof( int32_t) atIndex:10];
[encoder setBytes:&d0 length:sizeof( int32_t) atIndex:11];
[encoder setBytes:&d1 length:sizeof( int32_t) atIndex:12];
[encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
} break;
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
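
Note on the removed GGML_OP_IM2COL case above: OH and OW are read from dst, whose extents ggml derives upstream with the standard convolution output formula. A minimal C sketch of that arithmetic (the helper name is hypothetical):

// Assumed form of the convolution output-size computation.
static int conv_out_size(int ins, int ks, int s, int p, int d) {
    return (ins + 2*p - d*(ks - 1) - 1) / s + 1;
}
// e.g. IW = 10, KW = 3, s0 = 1, p0 = 0, d0 = 1  ->  OW = (10 - 2 - 1)/1 + 1 = 8
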

View File

@ -792,7 +792,7 @@ kernel void kernel_mul_mv_f32_f32(
constant int64_t & ne0, constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) { uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t rb = tgpig.y*N_F32_F32; const int64_t rb = tgpig.y*N_F32_F32;
@ -844,79 +844,6 @@ kernel void kernel_mul_mv_f32_f32(
} }
} }
#define N_F16_F16 4
kernel void kernel_mul_mv_f16_f16(
device const char * src0,
device const char * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]]) {
const int64_t r0 = tgpig.x;
const int64_t rb = tgpig.y*N_F16_F16;
const int64_t im = tgpig.z;
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
if (ne00 < 128) {
for (int row = 0; row < N_F16_F16; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
float sumf = 0;
for (int i = tiisg; i < ne00; i += 32) {
sumf += (half) x[i] * (half) y[i];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
device const half4 * x4 = (device const half4 *)x;
for (int row = 0; row < N_F16_F16; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
device const half * y = (device const half *) (src1 + r1*nb11 + im*nb12);
device const half4 * y4 = (device const half4 *) y;
float sumf = 0;
for (int i = tiisg; i < ne00/4; i += 32) {
for (int k = 0; k < 4; ++k) sumf += (half) x4[i][k] * y4[i][k];
}
float all_sum = simd_sum(sumf);
if (tiisg == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) all_sum += (half) x[i] * y[i];
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}
kernel void kernel_mul_mv_f16_f32_1row( kernel void kernel_mul_mv_f16_f32_1row(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
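
The removed kernel_mul_mv_f16_f16 above follows the same pattern as the surviving F32 variants: a plain scalar loop for short rows (ne00 < 128) and a 4-wide vectorized loop with a scalar tail otherwise. A C sketch of that access pattern, assuming a compiler with _Float16 support:

// Dot product with a 4-wide body and scalar remainder, mirroring the
// half4 loop plus tail in the removed kernel. Names are hypothetical.
static float dot_with_tail(const _Float16 * x, const _Float16 * y, int n) {
    float sum = 0.0f;
    const int n4 = n / 4;
    for (int i = 0; i < 4*n4; i += 4) {          // vectorizable body, 4 at a time
        sum += (float) x[i+0] * y[i+0] + (float) x[i+1] * y[i+1]
             + (float) x[i+2] * y[i+2] + (float) x[i+3] * y[i+3];
    }
    for (int i = 4*n4; i < n; ++i) {             // scalar tail for n % 4 leftovers
        sum += (float) x[i] * (float) y[i];
    }
    return sum;
}
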
@ -1302,39 +1229,6 @@ kernel void kernel_rope(
template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>; template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>; template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
kernel void kernel_im2col_f16(
device const float * x,
device half * dst,
constant int32_t & ofs0,
constant int32_t & ofs1,
constant int32_t & IW,
constant int32_t & IH,
constant int32_t & CHW,
constant int32_t & s0,
constant int32_t & s1,
constant int32_t & p0,
constant int32_t & p1,
constant int32_t & d0,
constant int32_t & d1,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tgpg[[threadgroups_per_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int32_t iiw = tgpig[2] * s0 + tpitg[2] * d0 - p0;
const int32_t iih = tgpig[1] * s1 + tpitg[1] * d1 - p1;
const int32_t offset_dst =
(tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
(tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
dst[offset_dst] = x[offset_src + iih * IW + iiw];
} else {
dst[offset_dst] = 0.0f;
}
}
kernel void kernel_cpy_f16_f16( kernel void kernel_cpy_f16_f16(
device const half * src0, device const half * src0,
device half * dst, device half * dst,

1071
ggml.c

File diff suppressed because it is too large

19
ggml.h
View File

@ -403,8 +403,13 @@ extern "C" {
GGML_OP_ROPE_BACK, GGML_OP_ROPE_BACK,
GGML_OP_ALIBI, GGML_OP_ALIBI,
GGML_OP_CLAMP, GGML_OP_CLAMP,
GGML_OP_CONV_1D,
GGML_OP_CONV_1D_STAGE_0, // internal
GGML_OP_CONV_1D_STAGE_1, // internal
GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL, GGML_OP_CONV_2D,
GGML_OP_CONV_2D_STAGE_0, // internal
GGML_OP_CONV_2D_STAGE_1, // internal
GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D, GGML_OP_POOL_1D,
GGML_OP_POOL_2D, GGML_OP_POOL_2D,
@ -1393,18 +1398,6 @@ extern "C" {
float min, float min,
float max); float max);
GGML_API struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1,
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_1d( GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
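
Usage sketch for the removed ggml_im2col declaration above: im2col unrolls input patches into columns so a convolution can be carried out as a matrix multiplication. ctx, kernel, input, and the commented tensor layouts are assumptions for illustration, not taken from this diff:

#include "ggml.h"

// kernel: [KW, KH, IC, OC] in F16, input: [IW, IH, IC, N] in F32 (assumed)
struct ggml_tensor * lower_conv2d(struct ggml_context * ctx,
                                  struct ggml_tensor * kernel,
                                  struct ggml_tensor * input) {
    return ggml_im2col(ctx, kernel, input,
                       /*s0*/ 1, /*s1*/ 1,     // strides
                       /*p0*/ 1, /*p1*/ 1,     // padding
                       /*d0*/ 1, /*d1*/ 1,     // dilation
                       /*is_2D*/ true);
    // a mul_mat of the reshaped kernel against the result yields the conv output
}
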

View File

@ -50,8 +50,7 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` | | medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` | | medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` | | large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large-v2 | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` | | large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| large | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
## Model files for testing purposes ## Model files for testing purposes

View File

@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
# Ported from models/convert-whisper-to-coreml.py # Ported from models/convert-whisper-to-coreml.py
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True) parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True) parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False) parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False) parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False) parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args() args = parser.parse_args()
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]: if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
raise ValueError("Invalid model name") raise ValueError("Invalid model name")
pt_target_path = f"models/hf-{args.model_name}.pt" pt_target_path = f"models/hf-{args.model_name}.pt"

View File

@ -228,7 +228,7 @@ with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
# for backwards compatibility, also check for older hf_transformers format tokenizer files # for backwards compatibility, also check for older hf_transformers format tokenizer files
# old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json # old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
# new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken # new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
multilingual = hparams["n_vocab"] >= 51865 multilingual = hparams["n_vocab"] == 51865
tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken") tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
tokenizer_type = "tiktoken" tokenizer_type = "tiktoken"
if not tokenizer.is_file(): if not tokenizer.is_file():

View File

@ -194,7 +194,7 @@ class TextDecoderANE(TextDecoder):
x = x.permute(0,2,3,1).squeeze(0) x = x.permute(0,2,3,1).squeeze(0)
# ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
if self.token_embedding.weight.shape[0] >= 51865: if self.token_embedding.weight.shape[0] == 51865:
# split in 11 chunks - 4715 each # split in 11 chunks - 4715 each
splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0) splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1) logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
@ -252,7 +252,7 @@ class WhisperANE(Whisper):
def convert_encoder(hparams, model, quantize=False): def convert_encoder(hparams, model, quantize=False):
model.eval() model.eval()
input_shape = (1, hparams.n_mels, 3000) input_shape = (1, 80, 3000)
input_data = torch.randn(input_shape) input_data = torch.randn(input_shape)
traced_model = torch.jit.trace(model, input_data) traced_model = torch.jit.trace(model, input_data)
@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True) parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False) parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False) parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False) parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args() args = parser.parse_args()
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large", "large-v1", "large-v2"]: if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
raise ValueError("Invalid model name") raise ValueError("Invalid model name")
whisper = load_model(args.model).cpu() whisper = load_model(args.model).cpu()

View File

@ -9,7 +9,7 @@ import shutil
def convert_encoder(hparams, encoder, mname): def convert_encoder(hparams, encoder, mname):
encoder.eval() encoder.eval()
mel = torch.zeros((1, hparams.n_mels, 3000)) mel = torch.zeros((1, 80, 3000))
onnx_folder=os.path.join(os.path.dirname(__file__),"onnx_encoder") onnx_folder=os.path.join(os.path.dirname(__file__),"onnx_encoder")
@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True) parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
args = parser.parse_args() args = parser.parse_args()
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]: if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
raise ValueError("Invalid model name") raise ValueError("Invalid model name")
whisper = load_model(args.model).cpu() whisper = load_model(args.model).cpu()

View File

@ -19,7 +19,7 @@ function get_script_path() {
models_path="$(get_script_path)" models_path="$(get_script_path)"
# Whisper models # Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" ) models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models # list available models
function list_models { function list_models {

View File

@ -8,7 +8,7 @@ popd
set argc=0 set argc=0
for %%x in (%*) do set /A argc+=1 for %%x in (%*) do set /A argc+=1
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
if %argc% neq 1 ( if %argc% neq 1 (
echo. echo.

View File

@ -41,7 +41,6 @@ models=(
"medium-q5_0" "medium-q5_0"
"medium.en-q5_0" "medium.en-q5_0"
"large-v1" "large-v1"
"large-v2"
"large" "large"
"large-q5_0" "large-q5_0"
) )

View File

@ -1 +0,0 @@
../ggml.h

View File

@ -1 +0,0 @@
../whisper.h

View File

@ -19,7 +19,7 @@
cd `dirname $0` cd `dirname $0`
# Whisper models # Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" ) models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models # list available models
function list_models { function list_models {

File diff suppressed because it is too large

View File

@ -1,20 +1,10 @@
#ifndef WHISPER_H #ifndef WHISPER_H
#define WHISPER_H #define WHISPER_H
#include "ggml.h"
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#ifdef __GNUC__
# define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
# define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
# define WHISPER_DEPRECATED(func, hint) func
#endif
#ifdef WHISPER_SHARED #ifdef WHISPER_SHARED
# ifdef _WIN32 # ifdef _WIN32
# ifdef WHISPER_BUILD # ifdef WHISPER_BUILD
@ -31,6 +21,7 @@
#define WHISPER_SAMPLE_RATE 16000 #define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT 400 #define WHISPER_N_FFT 400
#define WHISPER_N_MEL 80
#define WHISPER_HOP_LENGTH 160 #define WHISPER_HOP_LENGTH 160
#define WHISPER_CHUNK_SIZE 30 #define WHISPER_CHUNK_SIZE 30
@ -80,10 +71,6 @@ extern "C" {
typedef int whisper_token; typedef int whisper_token;
struct whisper_context_params {
bool use_gpu;
};
typedef struct whisper_token_data { typedef struct whisper_token_data {
whisper_token id; // token id whisper_token id; // token id
whisper_token tid; // forced timestamp token id whisper_token tid; // forced timestamp token id
@ -112,40 +99,15 @@ extern "C" {
// Various functions for loading a ggml whisper model. // Various functions for loading a ggml whisper model.
// Allocate (almost) all memory needed for the model. // Allocate (almost) all memory needed for the model.
// Return NULL on failure // Return NULL on failure
WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params); WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params); WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params); WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
// These are the same as the above, but the internal state of the context is not allocated automatically // These are the same as the above, but the internal state of the context is not allocated automatically
// It is the responsibility of the caller to allocate the state using whisper_init_state() (#523) // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params); WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params); WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params); WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model),
"use whisper_init_from_file_with_params instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size),
"use whisper_init_from_buffer_with_params instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader),
"use whisper_init_with_params instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model),
"use whisper_init_from_file_with_params_no_state instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size),
"use whisper_init_from_buffer_with_params_no_state instead"
);
WHISPER_DEPRECATED(
WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader),
"use whisper_init_with_params_no_state instead"
);
WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx); WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
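
Taken together, the left column replaces the one-argument initializers with *_with_params variants driven by whisper_context_params. A minimal migration sketch of the newer call sequence (the model path is a placeholder):

#include "whisper.h"

int load_ctx(void) {
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = true;  // the only field shown in the struct above

    struct whisper_context * wctx =
        whisper_init_from_file_with_params("models/ggml-base.en.bin", cparams);
    if (wctx == NULL) {
        return 1;            // "Return NULL on failure" per the comment above
    }
    /* ... run transcription ... */
    whisper_free(wctx);
    return 0;
}
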
@ -170,7 +132,6 @@ extern "C" {
WHISPER_API void whisper_free (struct whisper_context * ctx); WHISPER_API void whisper_free (struct whisper_context * ctx);
WHISPER_API void whisper_free_state(struct whisper_state * state); WHISPER_API void whisper_free_state(struct whisper_state * state);
WHISPER_API void whisper_free_params(struct whisper_full_params * params); WHISPER_API void whisper_free_params(struct whisper_full_params * params);
WHISPER_API void whisper_free_context_params(struct whisper_context_params * params);
// Convert RAW PCM audio to log mel spectrogram. // Convert RAW PCM audio to log mel spectrogram.
// The resulting spectrogram is stored inside the default state of the provided whisper context. // The resulting spectrogram is stored inside the default state of the provided whisper context.
@ -481,9 +442,7 @@ extern "C" {
void * logits_filter_callback_user_data; void * logits_filter_callback_user_data;
}; };
// NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params() // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_params()
WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref();
WHISPER_API struct whisper_context_params whisper_context_default_params(void);
WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy); WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy);
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy); WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
@ -572,7 +531,8 @@ extern "C" {
// Control logging output; default behavior is to print to stderr // Control logging output; default behavior is to print to stderr
WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data); typedef void (*whisper_log_callback)(const char * line);
WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
#ifdef __cplusplus #ifdef __cplusplus
} }
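
On the logging change in the final hunk: the right column's single-function callback style is sketched below for reference; the newer whisper_log_set instead takes a ggml_log_callback plus a user_data pointer (see the left column), with the callback signature defined in ggml.h rather than here.

#include <stdio.h>

// Matches the right column's typedef: void (*whisper_log_callback)(const char * line)
static void my_log(const char * line) {
    fputs(line, stderr);  // mirrors the documented default (print to stderr)
}

// Right-column API:  whisper_set_log_callback(my_log);
// Left-column API:   whisper_log_set(callback, user_data);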