Vulkan: Clean up mul_mm shader (#15987 )

* vulkan: move mul_mm dequantization steps into a separate file and functions * improve mul_mm vector load code * fix debug mode issues and warnings
build: fix the build failures of Windows HIP release job (#15984 )
2026-07-02 18:47:43 +02:00 · 2025-09-14 16:56:28 +02:00 · 2025-09-14 07:20:35 -07:00 · 2025-09-14 15:33:22 +03:00 · 2025-09-14 12:28:18 +03:00 · 2025-09-14 02:21:59 -07:00
50 changed files with 10414 additions and 5109 deletions
@@ -88,6 +88,7 @@ jobs:
            -DGGML_METAL_SHADER_DEBUG=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1

      - name: Test
        id: cmake_test
@@ -126,7 +127,8 @@ jobs:
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
+            -DGGML_RPC=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -1050,9 +1052,13 @@ jobs:
        run:  examples/sycl/win-build-sycl.bat

  windows-latest-cmake-hip:
-    if: ${{ github.event.inputs.create_release != 'true' }}
    runs-on: windows-2022

+    env:
+      # The ROCm version must correspond to the version used in the HIP SDK.
+      ROCM_VERSION: "6.4.2"
+      HIPSDK_INSTALLER_VERSION: "25.Q3"
+
    steps:
      - name: Clone
        id: checkout
@@ -1061,16 +1067,14 @@ jobs:
      - name: Clone rocWMMA repository
        id: clone_rocwmma
        run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+          git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1

      - name: Cache ROCm Installation
        id: cache-rocm
        uses: actions/cache@v4
        with:
          path: C:\Program Files\AMD\ROCm
-          key: rocm-6.1-${{ runner.os }}-v1
-          restore-keys: |
-            rocm-6.1-${{ runner.os }}-
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}

      - name: Install ROCm
        if: steps.cache-rocm.outputs.cache-hit != 'true'
@@ -1078,7 +1082,7 @@ jobs:
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $completed = $proc.WaitForExit(600000)
@@ -108,7 +108,8 @@ jobs:
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON
+            -DGGML_RPC=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Determine tag name
@@ -528,11 +529,16 @@ jobs:
  windows-hip:
    runs-on: windows-2022

+    env:
+      # The ROCm version must correspond to the version used in the HIP SDK.
+      ROCM_VERSION: "6.4.2"
+      HIPSDK_INSTALLER_VERSION: "25.Q3"
+
    strategy:
      matrix:
        include:
          - name: "radeon"
-            gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+            gpu_targets: "gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

    steps:
      - name: Clone
@@ -542,21 +548,19 @@ jobs:
      - name: Clone rocWMMA repository
        id: clone_rocwmma
        run: |
-          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
+          git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1

      - name: Cache ROCm Installation
        id: cache-rocm
        uses: actions/cache@v4
        with:
          path: C:\Program Files\AMD\ROCm
-          key: rocm-6.1-${{ runner.os }}-v1
-          restore-keys: |
-            rocm-6.1-${{ runner.os }}-
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
+          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
          evict-old-files: 1d

      - name: Install ROCm
@@ -565,7 +569,7 @@ jobs:
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
          $completed = $proc.WaitForExit(600000)
@@ -610,9 +614,12 @@ jobs:
            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
+          md "build\bin\hipblaslt\library"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"

      - name: Pack artifacts
        id: pack_artifacts
@@ -270,7 +270,9 @@ function gg_run_ctest_with_model_debug {
    local model; model=$(gg_get_model)
    cd build-ci-debug
    set -e
+
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
    set +e
    cd ..
 }
@@ -281,7 +283,15 @@ function gg_run_ctest_with_model_release {
    local model; model=$(gg_get_model)
    cd build-ci-release
    set -e
+
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
+    # test memory leaks
+    #if [[ ! -z ${GG_BUILD_METAL} ]]; then
+    #    # TODO: this hangs for some reason ...
+    #    (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
+    #fi
+
    set +e
    cd ..
 }
@@ -860,10 +870,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 ret=0
-if [ -z ${GG_BUILD_SYCL} ]; then
-    # SYCL build breaks with debug build flags
-    test $ret -eq 0 && gg_run ctest_debug
-fi
+test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -871,9 +878,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        if [ -z ${GG_BUILD_SYCL} ]; then
-            test $ret -eq 0 && gg_run test_scripts_debug
-        fi
+        test $ret -eq 0 && gg_run test_scripts_debug
        test $ret -eq 0 && gg_run test_scripts_release
    fi

@@ -884,9 +889,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
            test $ret -eq 0 && gg_run pythia_2_8b
            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
-        if [ -z ${GG_BUILD_SYCL} ]; then
-            test $ret -eq 0 && gg_run ctest_with_model_debug
-        fi
+        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
 fi
@@ -745,6 +745,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

 #endif // LLAMA_USE_CURL

+//
+// Docker registry functions
+//
+
+static std::string common_docker_get_token(const std::string & repo) {
+    std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+    common_remote_params params;
+    auto                 res = common_remote_get_content(url, params);
+
+    if (res.first != 200) {
+        throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+    }
+
+    std::string            response_str(res.second.begin(), res.second.end());
+    nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+    if (!response.contains("token")) {
+        throw std::runtime_error("Docker registry token response missing 'token' field");
+    }
+
+    return response["token"].get<std::string>();
+}
+
+static std::string common_docker_resolve_model(const std::string & docker) {
+    // Parse ai/smollm2:135M-Q4_K_M
+    size_t      colon_pos = docker.find(':');
+    std::string repo, tag;
+    if (colon_pos != std::string::npos) {
+        repo = docker.substr(0, colon_pos);
+        tag  = docker.substr(colon_pos + 1);
+    } else {
+        repo = docker;
+        tag  = "latest";
+    }
+
+    // ai/ is the default
+    size_t      slash_pos = docker.find('/');
+    if (slash_pos == std::string::npos) {
+        repo.insert(0, "ai/");
+    }
+
+    LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+    try {
+        // --- helper: digest validation ---
+        auto validate_oci_digest = [](const std::string & digest) -> std::string {
+            // Expected: algo:hex ; start with sha256 (64 hex chars)
+            // You can extend this map if supporting other algorithms in future.
+            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+            std::smatch m;
+            if (!std::regex_match(digest, m, re)) {
+                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+            }
+            // normalize hex to lowercase
+            std::string normalized = digest;
+            std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
+                return std::tolower(c);
+            });
+            return normalized;
+        };
+
+        std::string token = common_docker_get_token(repo);  // Get authentication token
+
+        // Get manifest
+        const std::string    url_prefix = "https://registry-1.docker.io/v2/" + repo;
+        std::string          manifest_url = url_prefix + "/manifests/" + tag;
+        common_remote_params manifest_params;
+        manifest_params.headers.push_back("Authorization: Bearer " + token);
+        manifest_params.headers.push_back(
+            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+        if (manifest_res.first != 200) {
+            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+        }
+
+        std::string            manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+        std::string            gguf_digest;  // Find the GGUF layer
+        if (manifest.contains("layers")) {
+            for (const auto & layer : manifest["layers"]) {
+                if (layer.contains("mediaType")) {
+                    std::string media_type = layer["mediaType"].get<std::string>();
+                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+                        media_type.find("gguf") != std::string::npos) {
+                        gguf_digest = layer["digest"].get<std::string>();
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (gguf_digest.empty()) {
+            throw std::runtime_error("No GGUF layer found in Docker manifest");
+        }
+
+        // Validate & normalize digest
+        gguf_digest = validate_oci_digest(gguf_digest);
+        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+        // Prepare local filename
+        std::string model_filename = repo;
+        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+        model_filename += "_" + tag + ".gguf";
+        std::string local_path = fs_get_cache_file(model_filename);
+
+        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+        if (!common_download_file_single(blob_url, local_path, token, false)) {
+            throw std::runtime_error("Failed to download Docker Model");
+        }
+
+        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+        return local_path;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+        throw;
+    }
+}
+
 //
 // utils
 //
@@ -795,7 +913,9 @@ static handle_model_result common_params_handle_model(
    handle_model_result result;
    // handle pre-fill default model path and url based on hf_repo and hf_file
    {
-        if (!model.hf_repo.empty()) {
+        if (!model.docker_repo.empty()) {  // Handle Docker URLs by resolving them to local paths
+            model.path = common_docker_resolve_model(model.docker_repo);
+        } else if (!model.hf_repo.empty()) {
            // short-hand to avoid specifying --hf-file -> default it to --model
            if (model.hf_file.empty()) {
                if (model.path.empty()) {
@@ -1184,7 +1304,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
    } else {
        for (const auto & device : dev_names) {
            auto * dev = ggml_backend_dev_by_name(device.c_str());
-            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
            }
            devices.push_back(dev);
@@ -1194,7 +1314,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
    return devices;
 }

-static void add_rpc_devices(std::string servers) {
+static void add_rpc_devices(const std::string & servers) {
    auto rpc_servers = string_split<std::string>(servers, ',');
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
@@ -2396,24 +2516,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--list-devices"},
        "print list of available devices and exit",
        [](common_params &) {
-            std::vector<ggml_backend_dev_t> rpc_devices;
-            std::vector<ggml_backend_dev_t> all_devices;
+            std::vector<ggml_backend_dev_t> devices;
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                auto * dev = ggml_backend_dev_get(i);
-                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
-                    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
-                    if (ggml_backend_reg_name(reg) == std::string("RPC")) {
-                        rpc_devices.push_back(dev);
-                    } else {
-                        all_devices.push_back(dev);
-                    }
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
                }
            }
-            // insert RPC devices in front
-            all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
            printf("Available devices:\n");
-            for (size_t i = 0; i < all_devices.size(); ++i) {
-                auto * dev = all_devices[i];
+            for (auto * dev : devices) {
                size_t free, total;
                ggml_backend_dev_memory(dev, &free, &total);
                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
@@ -2636,6 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.model.url = value;
        }
    ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(common_arg(
+        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
+        "example: gemma3\n"
+        "(default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.model.docker_repo = value;
+        }
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -193,10 +193,11 @@ struct common_params_sampling {
 };

 struct common_params_model {
-    std::string path    = ""; // model local path                                           // NOLINT
-    std::string url     = ""; // model url to download                                      // NOLINT
-    std::string hf_repo = ""; // HF repo                                                    // NOLINT
-    std::string hf_file = ""; // HF file                                                    // NOLINT
+    std::string path        = ""; // model local path                                       // NOLINT
+    std::string url         = ""; // model url to download                                  // NOLINT
+    std::string hf_repo     = ""; // HF repo                                                // NOLINT
+    std::string hf_file     = ""; // HF file                                                // NOLINT
+    std::string docker_repo = ""; // Docker repo                                            // NOLINT
 };

 struct common_params_speculative {
@@ -452,7 +453,7 @@ struct common_params {

    std::string slot_save_path;

-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;

    // batched-bench params
    bool is_pp_shared = false;
@@ -241,8 +241,8 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 |            | VX/VXE/VXE2 | zDNN | Spyre |
 |------------|-------------|------|-------|
 | FP32       | ✅           | ✅    | ❓     |
-| FP16       | ✅           | ❓    | ❓     |
-| BF16       | 🚫           | ❓    | ❓     |
+| FP16       | ✅           | ✅    | ❓     |
+| BF16       | 🚫           | ✅    | ❓     |
 | Q4_0       | ✅           | ❓    | ❓     |
 | Q4_1       | ✅           | ❓    | ❓     |
 | MXFP4      | 🚫           | ❓    | ❓     |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 -   🚫 - acceleration unavailable, will still run using scalar implementation
 -   ❓ - acceleration unknown, please contribute if you can test it yourself

-Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 6, 2025.
+Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.
@@ -18,6 +18,7 @@ Legend:
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -26,6 +27,7 @@ Legend:
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
 |                          CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                          CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
@@ -49,9 +51,11 @@ Legend:
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
@@ -61,7 +65,9 @@ Legend:
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                     OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                              PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
@@ -98,6 +104,7 @@ Legend:
 |                              SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                         SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
@@ -7,8 +7,6 @@
 extern "C" {
 #endif

-GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
-
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

 #ifdef __cplusplus
@@ -6,6 +6,7 @@ message(STATUS "Metal framework found")

 ggml_add_backend_library(ggml-metal
                         ggml-metal.m
+                         ggml-metal-common.cpp
                        )

 target_link_libraries(ggml-metal PRIVATE
@@ -0,0 +1,445 @@
+#include "ggml-metal-common.h"
+
+#include "ggml-impl.h"
+
+#include <vector>
+
+struct ggml_mem_range {
+    uint64_t pb; // buffer id
+
+    uint64_t p0; // begin
+    uint64_t p1; // end
+
+    ggml_mem_range_type pt;
+};
+
+struct ggml_mem_ranges {
+    std::vector<ggml_mem_range> ranges;
+
+    int debug = 0;
+};
+
+struct ggml_mem_ranges * ggml_mem_ranges_init(int debug) {
+    auto * res = new ggml_mem_ranges;
+
+    res->ranges.reserve(256);
+    res->debug = debug;
+
+    return res;
+}
+
+void ggml_mem_ranges_free(ggml_mem_ranges * mrs) {
+    delete mrs;
+}
+
+void ggml_mem_ranges_reset(ggml_mem_ranges * mrs) {
+    mrs->ranges.clear();
+}
+
+static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mrp) {
+    mrs->ranges.push_back(mrp);
+
+    return true;
+}
+
+static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggml_mem_range_type pt) {
+    // always use the base tensor
+    tensor = tensor->view_src ? tensor->view_src : tensor;
+
+    GGML_ASSERT(!tensor->view_src);
+
+    ggml_mem_range mrp;
+
+    if (tensor->buffer) {
+        // when the tensor is allocated, use the actual memory address range of the buffer
+        mrp = {
+            /*.pb =*/ (uint64_t) tensor->buffer,
+            /*.p0 =*/ (uint64_t) tensor->data,
+            /*.p1 =*/ (uint64_t) tensor->data + ggml_nbytes(tensor),
+            /*.pt =*/ pt,
+        };
+    } else {
+        // otherwise, the tensor ptr is used as an unique id of the memory ranges
+        //   that the tensor will be using when it is allocated
+        mrp = {
+            /*.pb =*/ (uint64_t) tensor,
+            /*.p0 =*/ 0,    //
+            /*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used
+            /*.pt =*/ pt,
+        };
+    };
+
+    return mrp;
+}
+
+static ggml_mem_range ggml_mem_range_from_tensor_src(const ggml_tensor * tensor) {
+    return ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_SRC);
+}
+
+static ggml_mem_range ggml_mem_range_from_tensor_dst(const ggml_tensor * tensor) {
+    return ggml_mem_range_from_tensor(tensor, MEM_RANGE_TYPE_DST);
+}
+
+static bool ggml_mem_ranges_add_src(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+
+    ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+
+    if (mrs->debug > 2) {
+        GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
+    }
+
+    return ggml_mem_ranges_add(mrs, mrp);
+}
+
+static bool ggml_mem_ranges_add_dst(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+
+    ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+
+    if (mrs->debug > 2) {
+        GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
+    }
+
+    return ggml_mem_ranges_add(mrs, mrp);
+}
+
+bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (tensor->src[i]) {
+            ggml_mem_ranges_add_src(mrs, tensor->src[i]);
+        }
+    }
+
+    return ggml_mem_ranges_add_dst(mrs, tensor);
+}
+
+static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mrp) {
+    for (size_t i = 0; i < mrs->ranges.size(); i++) {
+        const auto & cmp = mrs->ranges[i];
+
+        if (mrp.pb != cmp.pb) {
+            continue;
+        }
+
+        if (mrp.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
+            continue;
+        }
+
+        if (mrp.p0 < cmp.p1 && mrp.p1 >= cmp.p0) {
+            if (mrs->debug > 2) {
+                GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n",
+                        __func__,
+                        mrp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+                        mrp.pb, mrp.p0, mrp.p1,
+                        cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+                        cmp.pb, cmp.p0, cmp.p1);
+            }
+
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+
+    ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+
+    const bool res = ggml_mem_ranges_check(mrs, mrp);
+
+    return res;
+}
+
+static bool ggml_mem_ranges_check_dst(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
+    GGML_ASSERT(tensor);
+
+    ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+
+    const bool res = ggml_mem_ranges_check(mrs, mrp);
+
+    return res;
+}
+
+bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (tensor->src[i]) {
+            if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
+                return false;
+            }
+        }
+    }
+
+    return ggml_mem_ranges_check_dst(mrs, tensor);
+}
+
+// TODO: move to ggml.h?
+static bool is_empty(ggml_op op) {
+    switch (op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+            return true;
+        default:
+            return false;
+    }
+}
+
+struct node_info {
+    ggml_tensor * node;
+
+    std::vector<ggml_tensor *> fused;
+
+    ggml_op op() const {
+        return node->op;
+    }
+
+    const ggml_tensor * dst() const {
+        return fused.empty() ? node : fused.back();
+    }
+
+    bool is_empty() const {
+        return ::is_empty(node->op);
+    }
+
+    void add_fused(ggml_tensor * t) {
+        fused.push_back(t);
+    }
+};
+
+static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node_info> & nodes) {
+    // helper to add node src and dst ranges
+    const auto & h_add = [](ggml_mem_ranges * mrs, const node_info & node) {
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            if (node.node->src[i]) {
+                if (!ggml_mem_ranges_add_src(mrs, node.node->src[i])) {
+                    return false;
+                }
+            }
+        }
+
+        for (const auto * fused : node.fused) {
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                if (fused->src[i]) {
+                    if (!ggml_mem_ranges_add_src(mrs, fused->src[i])) {
+                        return false;
+                    }
+                }
+            }
+        }
+
+        return ggml_mem_ranges_add_dst(mrs, node.dst());
+    };
+
+    // helper to check if a node can run concurrently with the existing set of nodes
+    const auto & h_check = [](const ggml_mem_ranges * mrs, const node_info & node) {
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            if (node.node->src[i]) {
+                if (!ggml_mem_ranges_check_src(mrs, node.node->src[i])) {
+                    return false;
+                }
+            }
+        }
+
+        for (const auto * fused : node.fused) {
+            for (int i = 0; i < GGML_MAX_SRC; i++) {
+                if (fused->src[i]) {
+                    if (!ggml_mem_ranges_check_src(mrs, fused->src[i])) {
+                        return false;
+                    }
+                }
+            }
+        }
+
+        return ggml_mem_ranges_check_dst(mrs, node.dst());
+    };
+
+    // perform reorders only across these types of ops
+    // can be expanded when needed
+    // IMPORTANT: do not add ops such as GGML_OP_CPY or GGML_OP_SET_ROWS
+    //            the dependencies from such ops are not always represented in the graph
+    const auto & h_safe = [](ggml_op op) {
+        switch (op) {
+            case GGML_OP_MUL_MAT:
+            case GGML_OP_MUL_MAT_ID:
+            case GGML_OP_ROPE:
+            case GGML_OP_NORM:
+            case GGML_OP_RMS_NORM:
+            case GGML_OP_GROUP_NORM:
+            case GGML_OP_SUM_ROWS:
+            case GGML_OP_MUL:
+            case GGML_OP_ADD:
+            case GGML_OP_DIV:
+            case GGML_OP_GLU:
+            case GGML_OP_SCALE:
+            case GGML_OP_GET_ROWS:
+                return true;
+            default:
+                return is_empty(op);
+        }
+    };
+
+    const int n = nodes.size();
+
+    std::vector<int> res;
+    res.reserve(n);
+
+    std::vector<bool> used(n, false);
+
+    ggml_mem_ranges * mrs0 = ggml_mem_ranges_init(0);
+    ggml_mem_ranges * mrs1 = ggml_mem_ranges_init(0);
+
+    for (int i0 = 0; i0 < n; i0++) {
+        if (used[i0]) {
+            continue;
+        }
+
+        const auto & node0 = nodes[i0];
+
+        // the node is not concurrent with the existing concurrent set, so we have to "put a barrier" (i.e reset mrs0)
+        // but before we do that, look forward for some other nodes that can be added to the concurrent set mrs0
+        //
+        // note: we can always add empty nodes to the concurrent set as they don't read nor write anything
+        if (!node0.is_empty() && !h_check(mrs0, node0)) {
+            // this will hold the set of memory ranges from the nodes that haven't been processed yet
+            // if a node is not concurrent with this set, we cannot reorder it
+            ggml_mem_ranges_reset(mrs1);
+
+            // initialize it with the current node
+            h_add(mrs1, node0);
+
+            // that many nodes forward to search for a concurrent node
+            constexpr int N_FORWARD = 8;
+
+            for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
+                if (used[i1]) {
+                    continue;
+                }
+
+                const auto & node1 = nodes[i1];
+
+                // disallow reordering of certain ops
+                if (!h_safe(node1.op())) {
+                    break;
+                }
+
+                const bool is_empty = node1.is_empty();
+
+                // to add a concurrent node, it has to be:
+                //   + empty or concurrent with all nodes in the existing concurrent set (mrs0)
+                //   + concurrent with all nodes prior to it that haven't been processed yet (mrs1)
+                if ((is_empty || h_check(mrs0, node1)) && h_check(mrs1, node1)) {
+                    // add the node to the existing concurrent set (i.e. reorder it for early execution)
+                    h_add(mrs0, node1);
+                    res.push_back(i1);
+
+                    // mark as used, so we skip re-processing it later
+                    used[i1] = true;
+                } else {
+                    // expand the set of nodes that haven't been processed yet
+                    h_add(mrs1, node1);
+                }
+            }
+
+            // finalize the concurrent set and begin a new one
+            ggml_mem_ranges_reset(mrs0);
+        }
+
+        // expand the concurrent set with the current node
+        {
+            h_add(mrs0, node0);
+            res.push_back(i0);
+        }
+    }
+
+    ggml_mem_ranges_free(mrs0);
+    ggml_mem_ranges_free(mrs1);
+
+    return res;
+}
+
+void ggml_metal_graph_optimize(ggml_cgraph * gf) {
+    constexpr int MAX_FUSE = 16;
+
+    const int n = gf->n_nodes;
+
+    enum ggml_op ops[MAX_FUSE];
+
+    std::vector<node_info> nodes;
+    nodes.reserve(gf->n_nodes);
+
+    // fuse nodes:
+    // we don't want to make reorders that break fusing, so we first pack all fusable tensors
+    //   and perform the reorder over the fused nodes. after the reorder is done, we unfuse
+    for (int i = 0; i < n; i++) {
+        node_info node = {
+            /*.node =*/ gf->nodes[i],
+            /*.fused =*/ {},
+        };
+
+        // fuse only ops that start with these operations
+        // can be expanded when needed
+        if (node.op() == GGML_OP_ADD ||
+            node.op() == GGML_OP_RMS_NORM) {
+            ops[0] = node.op();
+
+            int f = i + 1;
+            while (f < n && f < i + MAX_FUSE) {
+                // conservatively allow fusing only these ops
+                // can be expanded when needed
+                if (gf->nodes[f]->op != GGML_OP_ADD &&
+                    gf->nodes[f]->op != GGML_OP_MUL &&
+                    gf->nodes[f]->op != GGML_OP_RMS_NORM) {
+                    break;
+                }
+                ops[f - i] = gf->nodes[f]->op;
+                f++;
+            }
+
+            f -= i;
+            for (; f > 1; f--) {
+                if (ggml_can_fuse(gf, i, ops, f)) {
+                    break;
+                }
+            }
+
+            // add the fused tensors into the node info so we can unfuse them later
+            for (int k = 1; k < f; k++) {
+                ++i;
+
+                // the .dst() becomes the last fused tensor
+                node.add_fused(gf->nodes[i]);
+            }
+        }
+
+        nodes.push_back(std::move(node));
+    }
+
+    // reorder to improve concurrency
+#if 1
+    const auto order = ggml_metal_graph_optimize_reorder(nodes);
+#else
+    std::vector<int> order(nodes.size());
+    for (size_t i = 0; i < nodes.size(); i++) {
+        order[i] = i;
+    }
+#endif
+
+    // unfuse
+    {
+        int j = 0;
+        for (const auto i : order) {
+            const auto & node = nodes[i];
+
+            gf->nodes[j++] = node.node;
+
+            for (auto * fused : node.fused) {
+                gf->nodes[j++] = fused;
+            }
+        }
+    }
+}
@@ -0,0 +1,52 @@
+// helper functions for ggml-metal that are too difficult to implement in Objective-C
+
+#pragma once
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+enum ggml_mem_range_type {
+    MEM_RANGE_TYPE_SRC = 0,
+    MEM_RANGE_TYPE_DST = 1,
+};
+
+// a helper object that can be used for reordering operations to improve concurrency
+//
+// the fundamental idea is that a set of tasks (either ggml ops, or something else) can run concurrently if they
+//   don't write to a memory that is being read by another task or written to by another task in the set
+//
+// with this structure, we can add tasks to the set, setting memory constraints. we can also check if a new task
+//   can be added to the set without violating the constraints (i.e. if it can be executed concurrently with the
+//   tasks already in the set)
+//
+struct ggml_mem_ranges;
+
+struct ggml_mem_ranges * ggml_mem_ranges_init(int debug);
+void ggml_mem_ranges_free(struct ggml_mem_ranges * mrs);
+
+// remove all ranges from the set
+void ggml_mem_ranges_reset(struct ggml_mem_ranges * mrs);
+
+// add src or dst ranges to track
+bool ggml_mem_ranges_add(struct ggml_mem_ranges * mrs, const struct ggml_tensor * tensor);
+
+// return false if:
+// - new src range overlaps with any existing dst range
+// - new dst range overlaps with any existing range (src or dst)
+bool ggml_mem_ranges_check(const struct ggml_mem_ranges * mrs, const struct ggml_tensor * tensor);
+
+// reorder the nodes in the graph to improve concurrency, while respecting fusion
+//
+// note: this implementation is generic and not specific to metal
+//       if it proves to work well, we can start using it for other backends in the future
+void ggml_metal_graph_optimize(struct ggml_cgraph * gf);
+
+#ifdef __cplusplus
+}
+#endif
@@ -3,6 +3,7 @@
 #import "ggml-impl.h"
 #import "ggml-backend-impl.h"
 #import "ggml-metal-impl.h"
+#import "ggml-metal-common.h"

 #import <Foundation/Foundation.h>

@@ -61,8 +62,11 @@ static struct ggml_backend_metal_device_context {
    bool has_bfloat;
    bool use_bfloat;
    bool use_fusion;
+    bool use_concurrency;
    bool use_shared_buffers;
+    bool use_graph_optimize;

+    int debug_graph;
    int debug_fusion;

    // how many times a given op was fused
@@ -83,7 +87,10 @@ static struct ggml_backend_metal_device_context {
    /*.has_bfloat              =*/ false,
    /*.use_bfloat              =*/ false,
    /*.use_fusion              =*/ true,
+    /*.use_concurrency         =*/ true,
    /*.use_shared_buffers      =*/ true,
+    /*.use_graph_optimize      =*/ true,
+    /*.debug_graph             =*/ 0,
    /*.debug_fusion            =*/ 0,
    /*.fuse_cnt                =*/ { 0 },
    /*.max_size                =*/ 0,
@@ -124,7 +131,14 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
 #else
            ctx->use_bfloat = false;
 #endif
-            ctx->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil;
+
+            ctx->use_fusion      = getenv("GGML_METAL_FUSION_DISABLE") == nil;
+            ctx->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
+
+            {
+                const char * val = getenv("GGML_METAL_GRAPH_DEBUG");
+                ctx->debug_graph = val ? atoi(val) : 0;
+            }

            {
                const char * val = getenv("GGML_METAL_FUSION_DEBUG");
@@ -137,6 +151,12 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
                ctx->use_shared_buffers = false;
            }

+            ctx->use_graph_optimize = true;
+
+            if (getenv("GGML_METAL_GRAPH_OPTIMIZE_DISABLE") != NULL) {
+                ctx->use_graph_optimize = false;
+            }
+
            memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt));

            ctx->max_size = ctx->mtl_device.maxBufferLength;
@@ -212,28 +232,6 @@ struct ggml_metal_kernel {
@end

 enum ggml_metal_kernel_type {
-    GGML_METAL_KERNEL_TYPE_ADD,
-    GGML_METAL_KERNEL_TYPE_ADD_FUSE_2,
-    GGML_METAL_KERNEL_TYPE_ADD_FUSE_3,
-    GGML_METAL_KERNEL_TYPE_ADD_FUSE_4,
-    GGML_METAL_KERNEL_TYPE_ADD_FUSE_5,
-    GGML_METAL_KERNEL_TYPE_ADD_FUSE_6,
-    GGML_METAL_KERNEL_TYPE_ADD_FUSE_7,
-    GGML_METAL_KERNEL_TYPE_ADD_FUSE_8,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7,
-    GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8,
-    GGML_METAL_KERNEL_TYPE_SUB,
-    GGML_METAL_KERNEL_TYPE_SUB_ROW_C4,
-    GGML_METAL_KERNEL_TYPE_MUL,
-    GGML_METAL_KERNEL_TYPE_MUL_ROW_C4,
-    GGML_METAL_KERNEL_TYPE_DIV,
-    GGML_METAL_KERNEL_TYPE_DIV_ROW_C4,
    GGML_METAL_KERNEL_TYPE_ADD_ID,
    GGML_METAL_KERNEL_TYPE_REPEAT_F32,
    GGML_METAL_KERNEL_TYPE_REPEAT_F16,
@@ -299,9 +297,6 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0,
    GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1,
    GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL,
-    GGML_METAL_KERNEL_TYPE_RMS_NORM,
-    GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL,
-    GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD,
    GGML_METAL_KERNEL_TYPE_L2_NORM,
    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
    GGML_METAL_KERNEL_TYPE_NORM,
@@ -628,7 +623,7 @@ static void ggml_metal_heap_free(struct ggml_metal_heap * heap) {
@end

 //
-// ggml_metal_mem_pool
+// ggml_metal_mem_pool [TAG_MEM_POOL_REMOVE]
 //

 struct ggml_metal_mem_pool {
@@ -791,6 +786,9 @@ struct ggml_metal_command_buffer {

    // each command buffer has a memory pool from which it can allocate temporary buffers during the compute
    struct ggml_metal_mem_pool * mem_pool;
+
+    // used to enable concurrent execution of ops in the command buffers
+    struct ggml_mem_ranges * mem_ranges;
 };

 struct ggml_backend_metal_context {
@@ -1091,7 +1089,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
    GGML_LOG_INFO("%s: has bfloat            = %s\n", __func__, ctx_dev->has_bfloat                  ? "true" : "false");
    GGML_LOG_INFO("%s: use bfloat            = %s\n", __func__, ctx_dev->use_bfloat                  ? "true" : "false");
    GGML_LOG_INFO("%s: use fusion            = %s\n", __func__, ctx_dev->use_fusion                  ? "true" : "false");
+    GGML_LOG_INFO("%s: use concurrency       = %s\n", __func__, ctx_dev->use_concurrency             ? "true" : "false");
    GGML_LOG_INFO("%s: use shared buffers    = %s\n", __func__, ctx_dev->use_shared_buffers          ? "true" : "false");
+    GGML_LOG_INFO("%s: use graph optimize    = %s\n", __func__, ctx_dev->use_graph_optimize          ? "true" : "false");
    GGML_LOG_INFO("%s: hasUnifiedMemory      = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");

    ctx->capture_next_compute = false;
@@ -1105,6 +1105,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de

        ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init();
        ctx->cmd_bufs[i].mem_pool->device = device;
+
+        if (ctx_dev->use_concurrency) {
+            ctx->cmd_bufs[i].mem_ranges = ggml_mem_ranges_init(ctx_dev->debug_graph);
+        }
    }

    ctx->cmd_bufs_ext = [[NSMutableArray alloc] init];
@@ -1148,28 +1152,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de

        // simd_sum and simd_max requires MTLGPUFamilyApple7

-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                             add,                             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_2,                      add_fuse_2,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_3,                      add_fuse_3,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_4,                      add_fuse_4,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_5,                      add_fuse_5,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_6,                      add_fuse_6,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_7,                      add_fuse_7,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_FUSE_8,                      add_fuse_8,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4,                      add_row_c4,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2,               add_row_c4_fuse_2,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3,               add_row_c4_fuse_3,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4,               add_row_c4_fuse_4,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5,               add_row_c4_fuse_5,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6,               add_row_c4_fuse_6,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7,               add_row_c4_fuse_7,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8,               add_row_c4_fuse_8,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB,                             sub,                             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW_C4,                      sub_row_c4,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                             mul,                             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW_C4,                      mul_row_c4,                      true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                             div,                             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW_C4,                      div_row_c4,                      true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ID,                          add_id,                          true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32,                      repeat_f32,                      true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16,                      repeat_f16,                      true);
@@ -1235,15 +1217,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0,                   set_rows_q5_0,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1,                   set_rows_q5_1,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL,                 set_rows_iq4_nl,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                        rms_norm,                        has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL,                    rms_norm_mul,                    has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD,                rms_norm_mul_add,                has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM,                         l2_norm,                         has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                      group_norm,                      has_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                            norm,                            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                            norm,                            has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,                    ssm_conv_f32,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                    ssm_scan_f32,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP,              ssm_scan_f32_group,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,                    ssm_scan_f32,                    has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32_GROUP,              ssm_scan_f32_group,              has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32,                   rwkv_wkv6_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32,                   rwkv_wkv7_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,                  mul_mv_f32_f32,                  has_simdgroup_reduction);
@@ -1361,7 +1340,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,                 mul_mm_q5_1_f32,                 has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,                 mul_mm_q8_0_f32,                 has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,                mul_mm_mxfp4_f32,                has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_MXFP4_F32,                mul_mm_mxfp4_f32,                has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,                 mul_mm_q2_K_f32,                 has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,                 mul_mm_q3_K_f32,                 has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,                 mul_mm_q4_K_f32,                 has_simdgroup_mm);
@@ -1465,9 +1443,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SWIGLU_OAI,                      swiglu_oai,                      true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_ERF,                       geglu_erf,                       true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU_QUICK,                     geglu_quick,                     true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN,                            mean,                            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                          argmax,                          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                        sum_rows,                        has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN,                            mean,                            has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX,                          argmax,                          has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,                 pool_2d_avg_f32,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,                 pool_2d_max_f32,                 true);
    }
@@ -1521,6 +1499,9 @@ static id<MTLComputePipelineState> ggml_metal_compile_kernel(ggml_backend_t back
        NSString * key = [NSString stringWithUTF8String:name];
        [ctx->kernels_ext setObject:obj forKey:key];

+        [metal_function release];
+        [obj release];
+
        GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) kernel.pipeline,
                (int) kernel.pipeline.maxTotalThreadsPerThreadgroup,
                (int) kernel.pipeline.threadExecutionWidth);
@@ -1542,8 +1523,6 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext(
    char name[256];

    @autoreleasepool {
-        MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
-
        const int32_t dk = (int32_t) op->src[1]->ne[0];
        const int32_t dv = (int32_t) op->src[2]->ne[0];

@@ -1575,7 +1554,7 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext(
            return res;
        }

-        cv = [[MTLFunctionConstantValues alloc] init];
+        MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];

        [cv setConstantValue:&has_mask  type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT + 0];
        [cv setConstantValue:&has_sinks type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT + 1];
@@ -1586,7 +1565,11 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext(
        [cv setConstantValue:&ns20 type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT + 21];
        [cv setConstantValue:&nsg  type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT + 22];

-        return ggml_metal_compile_kernel(backend, base, name, cv);
+        res = ggml_metal_compile_kernel(backend, base, name, cv);
+
+        [cv release];
+
+        return res;
    }
 }

@@ -1604,8 +1587,6 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext_vec(
    char name[256];

    @autoreleasepool {
-        MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
-
        const int32_t dk = (int32_t) op->src[1]->ne[0];
        const int32_t dv = (int32_t) op->src[2]->ne[0];

@@ -1637,7 +1618,7 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext_vec(
            return res;
        }

-        cv = [[MTLFunctionConstantValues alloc] init];
+        MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];

        [cv setConstantValue:&has_mask  type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT_VEC + 0];
        [cv setConstantValue:&has_sinks type:MTLDataTypeBool atIndex:FC_FLASH_ATTN_EXT_VEC + 1];
@@ -1649,7 +1630,11 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext_vec(
        [cv setConstantValue:&nsg  type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC + 22];
        [cv setConstantValue:&nwg  type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC + 23];

-        return ggml_metal_compile_kernel(backend, base, name, cv);
+        res = ggml_metal_compile_kernel(backend, base, name, cv);
+
+        [cv release];
+
+        return res;
    }
 }

@@ -1663,8 +1648,6 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext_vec_re
    char name[256];

    @autoreleasepool {
-        MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];
-
        snprintf(base, 256, "kernel_flash_attn_ext_vec_reduce");
        snprintf(name, 256, "kernel_flash_attn_ext_vec_reduce_dv=%d_nwg=%d", dv, nwg);

@@ -1674,12 +1657,83 @@ static id<MTLComputePipelineState> ggml_metal_get_pipeline_flash_attn_ext_vec_re
            return res;
        }

-        cv = [[MTLFunctionConstantValues alloc] init];
+        MTLFunctionConstantValues * cv = [[MTLFunctionConstantValues alloc] init];

        [cv setConstantValue:&dv  type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC_REDUCE + 0];
        [cv setConstantValue:&nwg type:MTLDataTypeInt atIndex:FC_FLASH_ATTN_EXT_VEC_REDUCE + 1];

-        return ggml_metal_compile_kernel(backend, base, name, cv);
+        res = ggml_metal_compile_kernel(backend, base, name, cv);
+
+        [cv release];
+
+        return res;
+    }
+
+    GGML_UNUSED(op);
+}
+
+static id<MTLComputePipelineState> ggml_metal_get_pipeline_bin(
+        ggml_backend_t backend, enum ggml_op op,
+        int32_t n_fuse,
+        bool row) {
+    struct ggml_backend_metal_context * ctx = backend->context;
+
+    char base[256];
+    char name[256];
+
+    @autoreleasepool {
+        const char * op_str = "undefined";
+        switch (op) {
+            case GGML_OP_ADD:   op_str = "add";   break;
+            case GGML_OP_SUB:   op_str = "sub";   break;
+            case GGML_OP_MUL:   op_str = "mul";   break;
+            case GGML_OP_DIV:   op_str = "div";   break;
+            default: GGML_ABORT("fatal error");
+        };
+
+        if (row) {
+            snprintf(base, 256, "kernel_%s_row_c4_fuse_%d", op_str, n_fuse);
+        } else {
+            snprintf(base, 256, "kernel_%s_fuse_%d", op_str, n_fuse);
+        }
+
+        snprintf(name, 256, "%s", base);
+
+        id<MTLComputePipelineState> res = ggml_metal_get_kernel(ctx, name);
+        if (res) {
+            // kernel found
+            return res;
+        }
+
+        return ggml_metal_compile_kernel(backend, base, name, nil);
+    }
+}
+
+static id<MTLComputePipelineState> ggml_metal_get_pipeline_rms_norm(
+        ggml_backend_t backend, struct ggml_tensor * op,
+        int32_t n_fuse) {
+    struct ggml_backend_metal_context * ctx = backend->context;
+
+    char base[256];
+    char name[256];
+
+    @autoreleasepool {
+        switch (n_fuse) {
+            case 1: snprintf(base, 256, "kernel_rms_norm");              break;
+            case 2: snprintf(base, 256, "kernel_rms_norm_mul");     break;
+            case 3: snprintf(base, 256, "kernel_rms_norm_mul_add"); break;
+            default: GGML_ABORT("fatal error");
+        }
+
+        snprintf(name, 256, "%s", base);
+
+        id<MTLComputePipelineState> res = ggml_metal_get_kernel(ctx, name);
+        if (res) {
+            // kernel found
+            return res;
+        }
+
+        return ggml_metal_compile_kernel(backend, base, name, nil);
    }

    GGML_UNUSED(op);
@@ -1707,6 +1761,10 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
        }

        ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool);
+
+        if (ctx->cmd_bufs[i].mem_ranges) {
+            ggml_mem_ranges_free(ctx->cmd_bufs[i].mem_ranges);
+        }
    }

    [ctx->cmd_bufs_ext removeAllObjects];
@@ -1924,7 +1982,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_L2_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_ARGMAX:
-            return true;
+            return has_simdgroup_reduction;
        case GGML_OP_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_ROPE:
@@ -1970,6 +2028,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
            return has_simdgroup_mm; // TODO: over-restricted for vec-kernels
        case GGML_OP_SSM_CONV:
        case GGML_OP_SSM_SCAN:
+            return has_simdgroup_reduction;
        case GGML_OP_RWKV_WKV6:
        case GGML_OP_RWKV_WKV7:
            return true;
@@ -2063,12 +2122,51 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
    }
 }

-static int ggml_metal_encode_node(
-                        ggml_backend_t   backend,
-                                   int   idx,
-                                   int   idx_end,
-          id<MTLComputeCommandEncoder>   encoder,
-            struct ggml_metal_mem_pool * mem_pool) {
+struct ggml_metal_encode_context {
+    ggml_backend_t backend;
+
+    id<MTLComputeCommandEncoder> encoder;
+
+    struct ggml_metal_mem_pool * mem_pool;
+
+    struct ggml_mem_ranges * mem_ranges;
+};
+
+static bool ggml_metal_encode_concurrency_reset(struct ggml_metal_encode_context * ctx) {
+    if (!ctx->mem_ranges) {
+        return true;
+    }
+
+    [ctx->encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+
+    ggml_mem_ranges_reset(ctx->mem_ranges);
+
+    return true;
+}
+
+static bool ggml_metal_encode_concurrency_check(struct ggml_metal_encode_context * ctx, const struct ggml_tensor * node) {
+    if (!ctx->mem_ranges) {
+        return false;
+    }
+
+    return ggml_mem_ranges_check(ctx->mem_ranges, node);
+}
+
+static bool ggml_metal_encode_concurrency_add(struct ggml_metal_encode_context * ctx, const struct ggml_tensor * node) {
+    if (!ctx->mem_ranges) {
+        return true;
+    }
+
+    return ggml_mem_ranges_add(ctx->mem_ranges, node);
+}
+
+static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, int idx, int idx_end) {
+    ggml_backend_t backend = ctx_enc->backend;
+
+    id<MTLComputeCommandEncoder> encoder = ctx_enc->encoder;
+
+    struct ggml_metal_mem_pool * mem_pool = ctx_enc->mem_pool;
+
    struct ggml_backend_metal_context        * ctx     = backend->context;
    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;

@@ -2151,38 +2249,71 @@ static int ggml_metal_encode_node(
    const uint64_t nb2  =  dst ?  dst->nb[2] : 0;
    const uint64_t nb3  =  dst ?  dst->nb[3] : 0;

+    size_t offs_src[GGML_MAX_SRC];
+
+    id<MTLBuffer> id_src[GGML_MAX_SRC];
+
+    enum ggml_type srct[GGML_MAX_SRC];
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        offs_src[i] = 0;
+        id_src[i] = node->src[i] ? ggml_metal_get_buffer(node->src[i], &offs_src[i]) : nil;
+        srct[i]   = node->src[i] ? node->src[i]->type : GGML_TYPE_COUNT;
+    }
+
+    // TODO: tmp shorthands - remove
+    size_t offs_src0 = offs_src[0];
+    size_t offs_src1 = offs_src[1];
+    size_t offs_src2 = offs_src[2];
+
+    id<MTLBuffer> id_src0 = id_src[0];
+    id<MTLBuffer> id_src1 = id_src[1];
+    id<MTLBuffer> id_src2 = id_src[2];
+
    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
    const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT;
    const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;

-    size_t offs_src0 = 0;
-    size_t offs_src1 = 0;
-    size_t offs_src2 = 0;
-    size_t offs_dst  = 0;
+    size_t offs_dst = 0;

-    id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
-    id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
-    id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
-    id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
+    id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;

    int n_fuse = 1;

-#if 0
-    GGML_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-    if (src0) {
-        GGML_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03,
-                ggml_is_contiguous(src0), src0->name);
+    // check if the current node can run concurrently with other nodes before it
+    // the condition is that:
+    //  - the current node cannot write to any previous src or dst ranges
+    //  - the current node cannot read from any previous dst ranges
+    //
+    // if the condition is not satisfied, we put a memory barrier and clear all ranges
+    // otherwise, we add the new ranges to the encoding context and process the node concurrently
+    //
+    {
+        const bool is_concurrent = ggml_metal_encode_concurrency_check(ctx_enc, node);
+
+        if (!is_concurrent) {
+            ggml_metal_encode_concurrency_reset(ctx_enc);
+        }
+
+        if (ctx_dev->debug_graph > 0) {
+            GGML_LOG_DEBUG("%s: node[%5d] - %-12s %s\n", __func__, idx, ggml_op_name(dst->op), is_concurrent ? "(concurrent)" : "");
+        }
+        if (ctx_dev->debug_graph > 1) {
+            if (src0) {
+                GGML_LOG_DEBUG("%s: src0 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03,
+                        ggml_is_contiguous(src0), src0->name);
+            }
+            if (src1) {
+                GGML_LOG_DEBUG("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
+                        ggml_is_contiguous(src1), src1->name);
+            }
+            if (dst) {
+                GGML_LOG_DEBUG("%s: dst  - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
+                        dst->name);
+            }
+        }
    }
-    if (src1) {
-        GGML_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
-                ggml_is_contiguous(src1), src1->name);
-    }
-    if (dst) {
-        GGML_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld, %5lld] [%5lld, %5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2, ne3, nb0, nb1, nb2, nb3,
-                dst->name);
-    }
-#endif

    id<MTLDevice> device = ctx_dev->mtl_device;

@@ -2246,8 +2377,6 @@ static int ggml_metal_encode_node(

                bool bcast_row = false;

-                id<MTLComputePipelineState> pipeline = nil;
-
                ggml_metal_kargs_bin args = {
                    /*.ne00 =*/ ne00,
                    /*.ne01 =*/ ne01,
@@ -2328,59 +2457,31 @@ static int ggml_metal_encode_node(
                    }
                }

+                id<MTLComputePipelineState> pipeline = nil;
+
                if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
                    GGML_ASSERT(ggml_is_contiguous(src0));

                    // src1 is a row
                    GGML_ASSERT(ne11 == 1);

-                    switch (dst->op) {
-                        case GGML_OP_ADD:
-                            {
-                                switch (n_fuse) {
-                                    case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4       ].pipeline; break;
-                                    case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_2].pipeline; break;
-                                    case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_3].pipeline; break;
-                                    case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_4].pipeline; break;
-                                    case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_5].pipeline; break;
-                                    case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_6].pipeline; break;
-                                    case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_7].pipeline; break;
-                                    case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW_C4_FUSE_8].pipeline; break;
-                                    default: GGML_ABORT("fatal error");
-                                }
-                            } break;
-                        case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW_C4].pipeline; break;
-                        case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW_C4].pipeline; break;
-                        case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW_C4].pipeline; break;
-                        default: GGML_ABORT("fatal error");
-                    }
+                    pipeline = ggml_metal_get_pipeline_bin(backend, dst->op, n_fuse, true);

                    bcast_row = true;
                } else {
-                    switch (dst->op) {
-                        case GGML_OP_ADD:
-                            {
-                                switch (n_fuse) {
-                                    case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD       ].pipeline; break;
-                                    case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_2].pipeline; break;
-                                    case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_3].pipeline; break;
-                                    case 4: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_4].pipeline; break;
-                                    case 5: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_5].pipeline; break;
-                                    case 6: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_6].pipeline; break;
-                                    case 7: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_7].pipeline; break;
-                                    case 8: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_FUSE_8].pipeline; break;
-                                    default: GGML_ABORT("fatal error");
-                                }
-                            } break;
-                        case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
-                        case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
-                        case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                        default: GGML_ABORT("fatal error");
-                    }
+                    pipeline = ggml_metal_get_pipeline_bin(backend, dst->op, n_fuse, false);
                }

                if (n_fuse > 1) {
                    id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
+
+                    for (int i = 1; i < n_fuse; ++i) {
+                        if (!ggml_metal_encode_concurrency_check(ctx_enc, nodes[i])) {
+                            ggml_metal_encode_concurrency_reset(ctx_enc);
+
+                            break;
+                        }
+                    }
                }

                [encoder setComputePipelineState:pipeline];
@@ -2525,9 +2626,9 @@ static int ggml_metal_encode_node(
                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);

                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                }

-                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
+                    ggml_metal_encode_concurrency_reset(ctx_enc);
+                }

                ggml_metal_kargs_bin args = {
                    /*.ne00 =*/ ne00,
@@ -2558,6 +2659,9 @@ static int ggml_metal_encode_node(
                    /*.o1   =*/ { offs_src1},
                };

+                //const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
+                const id<MTLComputePipelineState> pipeline = ggml_metal_get_pipeline_bin(backend, GGML_OP_ADD, 1, false);
+
                [encoder setComputePipelineState:pipeline];
                [encoder setBytes:&args length:sizeof(args) atIndex:0];
                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
@@ -3989,6 +4093,12 @@ static int ggml_metal_encode_node(
                        default: break;
                    }

+                    // TODO: using mem pool allocations with enabled concurrency is not safe because the mem pool
+                    // reuses buffers. this can result in 2 concurrent MUL_MAT_ID ops using the same mem pool buffer.
+                    // so we add this extra barrier to prevent the race.
+                    // the correct solution is to remove mem pools and then remove this barrier [TAG_MEM_POOL_REMOVE]
+                    ggml_metal_encode_concurrency_reset(ctx_enc);
+
                    // tokens per expert
                    const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02;
                    id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
@@ -4049,6 +4159,9 @@ static int ggml_metal_encode_node(
                        [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(ne02, 1, 1)];
                    }

+                    // this barrier is always needed because the next kernel has to wait for the id maps to be computed
+                    ggml_metal_encode_concurrency_reset(ctx_enc);
+
                    {
                        id<MTLComputePipelineState> pipeline = nil;

@@ -4517,16 +4630,17 @@ static int ggml_metal_encode_node(

                if (n_fuse > 1) {
                    id_dst = ggml_metal_get_buffer(nodes[n_fuse - 1], &offs_dst);
+
+                    for (int i = 1; i < n_fuse; ++i) {
+                        if (!ggml_metal_encode_concurrency_check(ctx_enc, nodes[i])) {
+                            ggml_metal_encode_concurrency_reset(ctx_enc);
+
+                            break;
+                        }
+                    }
                }

-                id<MTLComputePipelineState> pipeline;
-
-                switch (n_fuse) {
-                    case 1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM        ].pipeline; break;
-                    case 2: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL    ].pipeline; break;
-                    case 3: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM_MUL_ADD].pipeline; break;
-                    default: GGML_ABORT("unsupported n_fuse = %d\n", n_fuse);
-                }
+                const id<MTLComputePipelineState> pipeline = ggml_metal_get_pipeline_rms_norm(backend, node, n_fuse);

                int nth = 32; // SIMD width

@@ -4660,7 +4774,6 @@ static int ggml_metal_encode_node(
            } break;
        case GGML_OP_ROPE:
            {
-
                // make sure we have one or more position id(ne10) per token(ne02)
                GGML_ASSERT(ne10 % ne02 == 0);
                GGML_ASSERT(ne10 >= ne02);
@@ -5419,6 +5532,10 @@ static int ggml_metal_encode_node(
                        GGML_ASSERT(ne01*ne02*ne03 == ne1*ne2*ne3);
                        GGML_ASSERT(ne1*ne2*ne3 <= (1u << 31));

+                        // using mem pool allocations with enabled concurrency is not safe [TAG_MEM_POOL_REMOVE]
+                        // still, we assume that concurrent FA won't happen before we do the refactor
+                        //ggml_metal_encode_concurrency_reset(ctx_enc);
+
                        const int32_t nrows = ne1*ne2*ne3;

                        // temp buffer for writing the results from each workgroup
@@ -5439,6 +5556,8 @@ static int ggml_metal_encode_node(
                        [encoder setThreadgroupMemoryLength:smem atIndex:0];
                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];

+                        ggml_metal_encode_concurrency_reset(ctx_enc);
+
                        // reduce the results from the workgroups
                        {
                            ggml_metal_kargs_flash_attn_ext_vec_reduce args0 = {
@@ -5669,7 +5788,7 @@ static int ggml_metal_encode_node(

                [encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)];
            } break;
-            case GGML_OP_ARGMAX:
+        case GGML_OP_ARGMAX:
            {
                GGML_ASSERT(src0->type == GGML_TYPE_F32);
                GGML_ASSERT(ggml_is_contiguous_1(src0));
@@ -5701,6 +5820,19 @@ static int ggml_metal_encode_node(
            }
    }

+    if (ctx_dev->debug_graph > 0) {
+        if (n_fuse > 1) {
+            GGML_LOG_DEBUG("%s:               fuse %d ops\n", __func__, n_fuse);
+        }
+    }
+
+    // update the mem ranges in the encoding context
+    for (int i = 0; i < n_fuse; ++i) {
+        if (!ggml_metal_encode_concurrency_add(ctx_enc, nodes[i])) {
+            ggml_metal_encode_concurrency_reset(ctx_enc);
+        }
+    }
+
    return n_fuse;
 }

@@ -5711,7 +5843,7 @@ static enum ggml_status ggml_metal_graph_compute(
    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;

    // number of nodes encoded by the main thread (empirically determined)
-    const int n_main = 128;
+    const int n_main = 64;

    // number of threads in addition to the main thread
    const int n_cb = ctx->n_cb;
@@ -5766,10 +5898,14 @@ static enum ggml_status ggml_metal_graph_compute(
            // cannot use commandBufferWithUnretainedReferences because the buffers from the memory pool can get destroyed
            // TODO: when the memory pools are removed, we can again use commandBufferWithUnretainedReferences
            //       https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2334215009
+            // [TAG_MEM_POOL_REMOVE]
            //id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
            [cmd_buf retain];

+            if (ctx->cmd_bufs[n_cb].obj) {
+                [ctx->cmd_bufs[n_cb].obj release];
+            }
            ctx->cmd_bufs[n_cb].obj = cmd_buf;

            [cmd_buf enqueue];
@@ -6536,6 +6672,18 @@ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend,
    return ggml_metal_graph_compute(backend, cgraph);
 }

+static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+    //const int64_t t_start = ggml_time_us();
+
+    if (ctx_dev->use_graph_optimize) {
+        ggml_metal_graph_optimize(cgraph);
+    }
+
+    //printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
+}
+
 static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
    GGML_ASSERT(ggml_backend_is_metal(backend));

@@ -6562,12 +6710,25 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {

        const int n_nodes_per_cb = ctx->n_nodes_per_cb;

-        id<MTLCommandBuffer>         cmd_buf  = ctx->cmd_bufs[cb_idx].obj;
-        struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool;
+        id<MTLCommandBuffer>         cmd_buf    = ctx->cmd_bufs[cb_idx].obj;
+        struct ggml_metal_mem_pool * mem_pool   = ctx->cmd_bufs[cb_idx].mem_pool;
+        struct ggml_mem_ranges     * mem_ranges = ctx->cmd_bufs[cb_idx].mem_ranges;

        ggml_metal_mem_pool_reset(mem_pool);

-        id<MTLComputeCommandEncoder> encoder = [cmd_buf computeCommandEncoder];
+        if (mem_ranges) {
+            ggml_mem_ranges_reset(mem_ranges);
+        }
+
+        id<MTLComputeCommandEncoder> encoder;
+
+        struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+
+        if (ctx_dev->use_concurrency) {
+            encoder = [cmd_buf computeCommandEncoderWithDispatchType: MTLDispatchTypeConcurrent];
+        } else {
+            encoder = [cmd_buf computeCommandEncoder];
+        }

        int node_start = 0;
        int node_end   = n_nodes_0;
@@ -6579,12 +6740,19 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {

        const bool should_capture = ctx->capture_next_compute;

+        struct ggml_metal_encode_context ctx_enc = {
+            /*.backend    =*/ backend,
+            /*.encoder    =*/ encoder,
+            /*.mem_pool   =*/ mem_pool,
+            /*.mem_ranges =*/ mem_ranges,
+        };
+
        for (int idx = node_start; idx < node_end;) {
            if (should_capture) {
                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
            }

-            const int res = ggml_metal_encode_node(backend, idx, node_end, encoder, mem_pool);
+            const int res = ggml_metal_encode_node(&ctx_enc, idx, node_end);
            if (idx + res > node_end) {
                GGML_ABORT("fusion error: nodes spanning multiple encoders have been fused. this indicates a bug in the fusion logic %s",
                        "https://github.com/ggml-org/llama.cpp/pull/14849");
@@ -6627,7 +6795,7 @@ static struct ggml_backend_i ggml_backend_metal_i = {
    // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
-    /* .optimize_graph          = */ NULL,
+    /* .optimize_graph          = */ ggml_backend_metal_graph_optimize,
 };

 static ggml_guid_t ggml_backend_metal_guid(void) {
@@ -928,7 +928,7 @@ kernel void kernel_add_fuse_impl(

 typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t;

-template [[host_name("kernel_add")]]        kernel kernel_add_fuse_t kernel_add_fuse_impl<1>;
+template [[host_name("kernel_add_fuse_1")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>;
 template [[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>;
 template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>;
 template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>;
@@ -937,7 +937,7 @@ template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_
 template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>;
 template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>;

-kernel void kernel_sub(
+kernel void kernel_sub_fuse_1(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
@@ -963,7 +963,7 @@ kernel void kernel_sub(
    }
 }

-kernel void kernel_mul(
+kernel void kernel_mul_fuse_1(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
@@ -996,7 +996,7 @@ kernel void kernel_mul(
    }
 }

-kernel void kernel_div(
+kernel void kernel_div_fuse_1(
        constant ggml_metal_kargs_bin & args,
        device const char * src0,
        device const char * src1,
@@ -1096,23 +1096,17 @@ kernel void kernel_add_row_c4_fuse_impl(
        device const char * src1,
        device       char * dst,
        uint tpig[[thread_position_in_grid]]) {
-
    const uint nb = args.ne00/4;
    const uint i  = tpig % nb;

    device const float4 * src0_row = (device const float4 *) (src0);
    device       float4 *  dst_row = (device       float4 *) (dst);

-    device const float4 * src1_row[F];
-    for (short j = 0; j < F; ++j) {
-        src1_row[j] = (device const float4 *) (src1 + args.o1[j]);
-    }
-
    float4 res = src0_row[tpig];

 #pragma unroll(F)
    for (short j = 0; j < F; ++j) {
-        res += src1_row[j][i];
+        res += ((device const float4 *) (src1 + args.o1[j]))[i];
    }

    dst_row[tpig] = res;
@@ -1120,7 +1114,7 @@ kernel void kernel_add_row_c4_fuse_impl(

 typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t;

-template [[host_name("kernel_add_row_c4")]]        kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>;
+template [[host_name("kernel_add_row_c4_fuse_1")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>;
 template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>;
 template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>;
 template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>;
@@ -1160,7 +1154,7 @@ kernel void kernel_sub_row_c4_fuse_impl(

 typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t;

-template [[host_name("kernel_sub_row_c4")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>;
+template [[host_name("kernel_sub_row_c4_fuse_1")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>;

 template <short F>
 kernel void kernel_mul_row_c4_fuse_impl(
@@ -1193,7 +1187,7 @@ kernel void kernel_mul_row_c4_fuse_impl(

 typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t;

-template [[host_name("kernel_mul_row_c4")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>;
+template [[host_name("kernel_mul_row_c4_fuse_1")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>;

 template <short F>
 kernel void kernel_div_row_c4_fuse_impl(
@@ -1226,7 +1220,7 @@ kernel void kernel_div_row_c4_fuse_impl(

 typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t;

-template [[host_name("kernel_div_row_c4")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>;
+template [[host_name("kernel_div_row_c4_fuse_1")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>;

 kernel void kernel_scale(
        device const float * src0,
@@ -225,9 +225,9 @@ struct bin_bcast_sycl {
                    dpct::has_capability_or_fail(stream->get_device(),
                                                 {sycl::aspect::fp16});

-                    sycl_parallel_for(
-                        stream,
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size),
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+                                              sycl::range<3>(1, 1, block_size),
                                          sycl::range<3>(1, 1, block_size)),
                        [=](sycl::nd_item<3> item_ct1) {
                            k_bin_bcast_unravel<bin_op>(
@@ -246,8 +246,9 @@ struct bin_bcast_sycl {
                dpct::has_capability_or_fail(stream->get_device(),
                                             {sycl::aspect::fp16});

-                sycl_parallel_for(
-                    stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                stream->parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
                                            ne2, ne3, ne10, ne11, ne12, ne13,
                                            s1, s2, s3, s01, s02, s03, s11, s12, s13,
@@ -89,24 +89,33 @@ static void concat_f32_sycl(const float *x, const float *y, float *dst,
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
  switch (dim) {
  case 0:
-      sycl_parallel_for(stream,
-                        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
-                                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
-                        [=](sycl::nd_item<3> item_ct1) { concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); });
-      break;
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
+        });
+    break;
  case 1:
-      sycl_parallel_for(stream,
-                        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
-                                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
-                        [=](sycl::nd_item<3> item_ct1) { concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); });
-      break;
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
+        });
+    break;
  // dim >=2 will be dispatched to the default path
  default:
-      sycl_parallel_for(stream,
-                        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
-                                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
-                        [=](sycl::nd_item<3> item_ct1) { concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); });
-      break;
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+          concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
+        });
+    break;
  }
 }

@@ -120,7 +129,7 @@ static void concat_f32_sycl_non_cont(
    int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2,
    uint64_t nb3, int32_t dim) {
  sycl::range<3> gridDim(ne3, ne2, ne1);
-  sycl_parallel_for(stream, sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+  stream->parallel_for(sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
      int64_t i3 = item_ct1.get_group(0);
      int64_t i2 = item_ct1.get_group(1);
      int64_t i1 = item_ct1.get_group(2);
@@ -59,10 +59,16 @@ static void conv_transpose_1d_f32_f32_sycl(
    const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE;
    const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE);
    const sycl::range<3> block_nums(1, 1, num_blocks);
-    sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
-        conv_transpose_1d_kernel(s0, output_size, src0_ne0, src0_ne1, src0_ne2, src1_ne0, dst_ne0, src0, src1, dst,
-                                 item_ct1);
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(
+            block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            conv_transpose_1d_kernel(
+                s0, output_size,
+                src0_ne0, src0_ne1, src0_ne2,
+                src1_ne0, dst_ne0,
+                src0, src1, dst, item_ct1);
+        });
 }

 void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
@@ -33,11 +33,14 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});
-        sycl_parallel_for(
-            stream,
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1); });
+        stream->parallel_for(
+            sycl::nd_range<3>(
+                sycl::range<3>(1, 1, num_blocks) *
+                    sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
+            });
    }
 }

@@ -50,18 +53,24 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
    }
 #else
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
    }

 #endif
@@ -76,18 +85,24 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
    }
 #else
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
    }
 #endif
 }
@@ -101,9 +116,12 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_0(vx, y, nb32, item_ct1);
+                             });
    }
 }

@@ -117,12 +135,13 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
    int constexpr WARP_K = WARP_SIZE * QK4_0;
    const int n_warp = (k + WARP_K - 1) / WARP_K;
    GGML_ASSERT(k % 2 == 0);
-    sycl_parallel_for(stream,
-                      sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE),
-                                        sycl::range<3>(1, 1, WARP_SIZE)),
-                      [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                          dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
-                      });
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
+        sycl::range<3>(1, 1, WARP_SIZE),
+        sycl::range<3>(1, 1, WARP_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
+            dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
+        });
+
 }

 template <typename dst_t>
@@ -134,9 +153,12 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_1(vx, y, nb32, item_ct1);
+                             });
    }
 }

@@ -149,13 +171,14 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler &cgh) {
            sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) {
-                    dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
-                });
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
+                             });
        });
    }
 }
@@ -168,13 +191,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i

    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-    sycl_launch(stream, [&](sycl::handler & cgh) {
+    stream->submit([&](sycl::handler & cgh) {
        sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);

-        sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
-                             [=](sycl::nd_item<1> item_ct1) {
-                                 dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
-                             });
+        cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
+                         [=](sycl::nd_item<1> item_ct1) {
+                             dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
+                         });
    });
 }

@@ -187,18 +210,24 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
    }
 #else
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
    }

 #endif
@@ -213,18 +242,24 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
    }
 #else
    {
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-            [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
    }

 #endif
@@ -236,9 +271,9 @@ static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const i

    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-    sycl_parallel_for(stream,
-                      sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
-                      [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
+        [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
 }

 template <typename dst_t>
@@ -249,10 +284,15 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_s(
+                                     vx, y, item_ct1, iq1s_grid_gpu
+                                     );
+                             });
        });
    }
 }
@@ -265,10 +305,15 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq1_m(
+                                     vx, y, item_ct1, iq1s_grid_gpu
+                                     );
+                             });
        });
    }
 }
@@ -281,12 +326,15 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) {
-                    dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
-                });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_xxs(
+                                     vx, y, item_ct1, iq2xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
        });
    }
 }
@@ -299,12 +347,15 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) {
-                    dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs);
-                });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_xs(
+                                     vx, y, item_ct1, iq2xs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
        });
    }
 }
@@ -317,10 +368,13 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_s(vx, y, item_ct1);
+                             });
        });
    }
 }
@@ -334,12 +388,15 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) {
-                    dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs);
-                });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_xxs(
+                                     vx, y, item_ct1, iq3xxs_grid,
+                                     ksigns_iq2xs, kmask_iq2xs);
+                             });
        });
    }
 }
@@ -352,10 +409,14 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq3_s(
+                                     vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
+                             });
        });
    }
 }
@@ -371,11 +432,14 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
-                sycl_parallel_for(
-                    cgh,
-                    sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                    [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); });
+            stream->submit([&](sycl::handler &cgh) {
+                  cgh.parallel_for(
+                      sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                            sycl::range<3>(1, 1, 32),
+                                        sycl::range<3>(1, 1, 32)),
+                      [=](sycl::nd_item<3> item_ct1) {
+                            dequantize_block_iq4_xs(vx, y, item_ct1);
+                      });
            });
      }
 #endif
@@ -389,11 +453,14 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
-                sycl_parallel_for(
-                    cgh,
-                    sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
-                    [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); });
+            stream->submit([&](sycl::handler &cgh) {
+                  cgh.parallel_for(
+                      sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                            sycl::range<3>(1, 1, 32),
+                                        sycl::range<3>(1, 1, 32)),
+                      [=](sycl::nd_item<3> item_ct1) {
+                            dequantize_block_iq4_nl(vx, y, item_ct1);
+                      });
            });
      }
 }
@@ -201,8 +201,7 @@ static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, co
    {
        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-        sycl_parallel_for(
-            stream,
+        stream->parallel_for(
            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
@@ -220,8 +219,7 @@ static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, co
    {
        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-        sycl_parallel_for(
-            stream,
+        stream->parallel_for(
            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
@@ -239,8 +237,7 @@ static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, co
    {
        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-        sycl_parallel_for(
-            stream,
+        stream->parallel_for(
            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
@@ -256,11 +253,11 @@ static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, c
                                   const int nb12, const int nb13, queue_ptr stream) {
    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-                                                              ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                 ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                         });
 }

 static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -268,11 +265,11 @@ static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, c
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ne;
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-                                                              ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                 ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                         });
 }

 static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -281,11 +278,11 @@ static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, c
                                   const int nb12, const int nb13, queue_ptr stream) {
    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-                                                              ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                 ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                         });
 }

 static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -293,9 +290,8 @@ static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, c
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ne;
-    sycl_parallel_for(
-        stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-        [=](sycl::nd_item<3> item_ct1) {
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
            cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
                                                                     nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
                                                                     item_ct1);
@@ -308,11 +304,11 @@ static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, c
                                   const int nb12, const int nb13, queue_ptr stream) {
    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-                                                              ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                 ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                         });
 }

 static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -320,9 +316,8 @@ static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, c
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ne;
-    sycl_parallel_for(
-        stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-        [=](sycl::nd_item<3> item_ct1) {
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
            cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
                                                                     nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
                                                                     item_ct1);
@@ -335,11 +330,11 @@ static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, c
                                   const int nb12, const int nb13, queue_ptr stream) {
    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-                                                              ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                 ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                         });
 }

 static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -347,9 +342,8 @@ static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, c
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ne;
-    sycl_parallel_for(
-        stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-        [=](sycl::nd_item<3> item_ct1) {
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
            cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
                                                                     nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
                                                                     item_ct1);
@@ -362,11 +356,11 @@ static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, c
                                   const int nb12, const int nb13, queue_ptr stream) {
    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-                                                              ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                                                 ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+                         });
 }

 static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -374,9 +368,8 @@ static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, c
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ne;
-    sycl_parallel_for(
-        stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-        [=](sycl::nd_item<3> item_ct1) {
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
            cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
                                                                     nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
                                                                     item_ct1);
@@ -389,11 +382,11 @@ static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne,
                                     const int nb12, const int nb13, queue_ptr stream) {
    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-                                                                 ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
+                                                   ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
 }

 static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
@@ -404,8 +397,7 @@ static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, co
    {
        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

-        sycl_parallel_for(
-            stream,
+        stream->parallel_for(
            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
@@ -424,8 +416,7 @@ static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, co
        // dpct::has_capability_or_fail(stream->get_device(),
        //                              {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream,
+        stream->parallel_for(
            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
@@ -444,8 +435,7 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
        // dpct::has_capability_or_fail(stream->get_device(),
        //                              {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream,
+        stream->parallel_for(
            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
            [=](sycl::nd_item<3> item_ct1) {
@@ -460,13 +450,11 @@ static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
-    sycl_parallel_for(stream,
-                      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                                        sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
-                                                     ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
 }


@@ -475,13 +463,11 @@ static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
-    sycl_parallel_for(stream,
-                      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                                        sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
-                                                     ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
 }


@@ -491,13 +477,11 @@ static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);

-    sycl_parallel_for(stream,
-                      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                                        sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
-                                                     ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
 }


@@ -506,13 +490,10 @@ static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const
                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                   const int nb12, const int nb13, queue_ptr stream) {
    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
-    sycl_parallel_for(stream,
-                      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                                        sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-                      [=](sycl::nd_item<3> item_ct1) {
-                          cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
-                                                     ne12, nb10, nb11, nb12, nb13, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
 }


@@ -522,13 +503,10 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
                                   const int nb12, const int nb13, queue_ptr stream) {

   const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
-   sycl_parallel_for(stream,
-                     sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
-                                       sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
-                     [=](sycl::nd_item<3> item_ct1) {
-                         cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11,
-                                                    ne12, nb10, nb11, nb12, nb13, item_ct1);
-                     });
+   stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
 }

 void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
@@ -208,10 +208,12 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1);
-                          });
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
+                                                          nrows, item_ct1);
+            });
    }
 }

@@ -875,11 +877,12 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(vx, y, dst, ncols,
-                                                                                                    nrows, item_ct1);
-                          });
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
    }
 }

@@ -897,10 +900,12 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(vx, y, dst, ncols, nrows, item_ct1);
-                          });
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
    }
 }

@@ -916,10 +921,12 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(vx, y, dst, ncols, nrows, item_ct1);
-                          });
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
    }
 }

@@ -935,10 +942,12 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(vx, y, dst, ncols, nrows, item_ct1);
-                          });
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
    }
 }

@@ -954,10 +963,12 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(vx, y, dst, ncols, nrows, item_ct1);
-                          });
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
    }
 }

@@ -973,10 +984,12 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(vx, y, dst, ncols, nrows, item_ct1);
-                          });
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
    }
 }

@@ -989,10 +1002,11 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
    const int block_num_y = (nrows + ny - 1) / ny;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
-    sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                      [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
-                          dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
 }

 static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
@@ -1004,10 +1018,11 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
    const int block_num_y = (nrows + ny - 1) / ny;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
-    sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                      [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
-                          dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
 }

 static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
@@ -1019,10 +1034,11 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
    const int block_num_y = (nrows + ny - 1) / ny;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
-    sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                      [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
-                          dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
 }

 static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
@@ -1031,10 +1047,11 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
                                             dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
-    sycl_parallel_for(stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-                      [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
-                          dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
+        });
 }

 static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
@@ -1046,10 +1063,11 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
    const int block_num_y = (nrows + ny - 1) / ny;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
-    sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                      [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
-                          dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
-                      });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
+            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
 }

 void ggml_sycl_op_dequantize_mul_mat_vec(
@@ -13,10 +13,10 @@
 #ifndef GGML_SYCL_DPCT_HELPER_HPP
 #define GGML_SYCL_DPCT_HELPER_HPP

-#include <map>
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
 #include <syclcompat/math.hpp>
+#include <map>

 #ifdef GGML_SYCL_USE_INTEL_ONEMKL
 #include <oneapi/mkl.hpp>
@@ -118,36 +118,6 @@ inline auto get_onemath_backend(sycl::queue& queue)
 #endif
 }

-#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
-    namespace syclex = sycl::ext::oneapi::experimental;
-#endif
-
-template <int NR, typename Func>
-__dpct_inline__ void sycl_parallel_for(sycl::handler & cgh, sycl::nd_range<NR> nd_range, Func && func) {
-#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
-    syclex::nd_launch(cgh, nd_range, func);
-#else
-    cgh.parallel_for(nd_range, func);
-#endif
-}
-
-template <int NR, typename Func>
-__dpct_inline__ void sycl_parallel_for(sycl::queue * q, sycl::nd_range<NR> nd_range, Func && func) {
-#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
-    syclex::nd_launch(*q, nd_range, func);
-#else
-    q->parallel_for(nd_range, func);
-#endif
-}
-
-template <typename Func> __dpct_inline__ void sycl_launch(sycl::queue * stream, Func && func) {
-#ifdef SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS
-    syclex::submit(*stream, func);
-#else
-    stream->submit(func);
-#endif
-}
-
 namespace dpct
 {
    typedef sycl::queue *queue_ptr;
@@ -407,7 +407,7 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
                         const int ne12, const int nb1, const int nb2,
                         const int offset, queue_ptr stream) {
    int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE);
-    sycl_parallel_for(stream,
+    stream->parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks) *
                              sycl::range<1>(SYCL_ACC_BLOCK_SIZE),
                          sycl::range<1>(SYCL_ACC_BLOCK_SIZE)),
@@ -425,8 +425,8 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
    int dst_size = ne10 * ne11 * ne12 * ne13;
    int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE);
    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
-    sycl_parallel_for<1>(
-        stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
+    stream->parallel_for(
+        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
            upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
        });
 }
@@ -437,7 +437,7 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
                         const int ne1, const int ne2, queue_ptr stream) {
    int num_blocks = ceil_div(ne0, SYCL_PAD_BLOCK_SIZE);
    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    sycl_parallel_for(stream,
+    stream->parallel_for(
                      sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
                                        sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
                      [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
@@ -639,7 +639,7 @@ static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -652,7 +652,7 @@ static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -665,7 +665,7 @@ static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -678,7 +678,7 @@ static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tenso
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -691,7 +691,7 @@ static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tenso
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -704,7 +704,7 @@ static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -717,7 +717,7 @@ static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_t
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -730,7 +730,7 @@ static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tenso
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -743,7 +743,7 @@ static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tenso
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -756,7 +756,7 @@ static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggm
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -769,7 +769,7 @@ static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -782,7 +782,7 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -795,7 +795,7 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -808,7 +808,7 @@ static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -821,7 +821,7 @@ static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tenso
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -834,7 +834,7 @@ static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_te
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -847,7 +847,7 @@ static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tenso
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -860,7 +860,7 @@ static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -873,7 +873,7 @@ static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -888,7 +888,7 @@ static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) {
            const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -901,7 +901,7 @@ static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -935,7 +935,7 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) {
            const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
@@ -967,7 +967,7 @@ static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tens
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
+            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
@@ -978,7 +978,7 @@ static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tens
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
-            sycl_parallel_for(main_stream,
+            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
@@ -989,7 +989,7 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
-            sycl_parallel_for(main_stream,
+            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
@@ -1000,7 +1000,7 @@ static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
+            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
@@ -1011,7 +1011,7 @@ static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggm
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
+            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
@@ -118,10 +118,12 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr

    GGML_ASSERT(ne00 % 2 == 0);

-    sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
-        k_get_rows<qk, qr, dq>(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12,
-                               item_ct1);
-    });
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             k_get_rows<qk, qr, dq>(
+                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+                         });

    GGML_UNUSED(dst);
    GGML_UNUSED(ctx);
@@ -154,8 +156,9 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

-        sycl_parallel_for(
-            stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
            });
@@ -1746,12 +1746,13 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
    const size_t shared_mem = ncols_pad * sizeof(int);

    if (order == GGML_SORT_ORDER_ASC) {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler &cgh) {
            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
                sycl::range<1>(shared_mem), cgh);

-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
                    k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
                        x, dst, ncols, ncols_pad, item_ct1,
                        dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
@@ -1759,12 +1760,13 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
                });
        });
    } else if (order == GGML_SORT_ORDER_DESC) {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler &cgh) {
            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(
                sycl::range<1>(shared_mem), cgh);

-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
                    k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
                        x, dst, ncols, ncols_pad, item_ct1,
                        dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>()
@@ -1782,47 +1784,50 @@ static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
    const sycl::range<3> block_nums(1, nrows, 1);
    const size_t shared_mem = 256 * sizeof(float);

-    sycl_launch(stream, [&](sycl::handler & cgh) {
+    stream->submit([&](sycl::handler &cgh) {
        sycl::local_accessor<float, 1> shared_data(
            sycl::range<1>(shared_mem/sizeof(float)), cgh);
        sycl::local_accessor<int, 1> shared_indices(
            sycl::range<1>(shared_mem/sizeof(float)), cgh);

-        sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
-            const int tid = item_ct1.get_local_id(2);
-            const int row = item_ct1.get_global_id(1);
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                const int tid = item_ct1.get_local_id(2);
+                const int row = item_ct1.get_global_id(1);

-            float max_val = -INFINITY;
-            int   max_idx = -1;
+                float max_val = -INFINITY;
+                int max_idx = -1;

-            for (int col = tid; col < ncols; col += 256) {
-                float val = x[row * ncols + col];
-                if (val > max_val) {
-                    max_val = val;
-                    max_idx = col;
-                }
-            }
-
-            shared_data[tid]    = max_val;
-            shared_indices[tid] = max_idx;
-            item_ct1.barrier(sycl::access::fence_space::local_space);
-
-            for (int stride = 256 / 2; stride > 0; stride >>= 1) {
-                if (tid < stride) {
-                    float val1 = shared_data[tid];
-                    float val2 = shared_data[tid + stride];
-                    if (val2 > val1) {
-                        shared_data[tid]    = val2;
-                        shared_indices[tid] = shared_indices[tid + stride];
+                for (int col = tid; col < ncols; col += 256) {
+                    float val = x[row * ncols + col];
+                    if (val > max_val) {
+                        max_val = val;
+                        max_idx = col;
                    }
                }
-                item_ct1.barrier(sycl::access::fence_space::local_space);
-            }

-            if (tid == 0) {
-                dst[row] = shared_indices[0];
-            }
-        });
+                shared_data[tid] = max_val;
+                shared_indices[tid] = max_idx;
+                item_ct1.barrier(sycl::access::fence_space::local_space);
+
+                for (int stride = 256/2; stride > 0; stride >>= 1) {
+                    if (tid < stride) {
+                        float val1 = shared_data[tid];
+                        float val2 = shared_data[tid + stride];
+                        if (val2 > val1) {
+                            shared_data[tid] = val2;
+                            shared_indices[tid] = shared_indices[tid + stride];
+                        }
+                    }
+                    item_ct1.barrier(sycl::access::fence_space::local_space);
+                }
+
+
+                if (tid == 0) {
+                    dst[row] = shared_indices[0];
+                }
+            });
    });
 }
 static void diag_mask_inf_f32_sycl(const float *x, float *dst,
@@ -2895,7 +2900,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
                void **       ptrs_dst_get = ptrs_dst.get();
                size_t        nb12_scaled  = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
                size_t        nb13_scaled  = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
-                sycl_parallel_for(cgh, sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
                    k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
                                           nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
                });
@@ -3403,7 +3408,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
            {
                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
                sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
-                sycl_launch(stream, [&](sycl::handler & cgh) {
+                stream->submit([&](sycl::handler &cgh) {
                    sycl::local_accessor<int, 0> src1_row_acc(cgh);

                    char *__restrict src1_contiguous_get =
@@ -3415,8 +3420,9 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
                    size_t ids_nb_ct6 = ids->nb[1];
                    size_t ids_nb_ct7 = ids->nb[0];

-                    sycl_parallel_for(
-                        cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
                            k_copy_src1_to_contiguous(
                                src1_original, src1_contiguous_get,
                                dev_cur_src1_row_get,
@@ -3447,14 +3453,15 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
            {
                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
                sycl::range<3> grid_dims(1, 1, num_src1_rows);
-                sycl_launch(stream, [&](sycl::handler & cgh) {
+                stream->submit([&](sycl::handler &cgh) {
                    const char *__restrict dst_contiguous_get =
                        dst_contiguous.get();
                    const mmid_row_mapping *__restrict dev_row_mapping_get =
                        dev_row_mapping.get();

-                    sycl_parallel_for(
-                        cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                    cgh.parallel_for(
+                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                        [=](sycl::nd_item<3> item_ct1) {
                            k_copy_dst_from_contiguous(dst_original,
                                                       dst_contiguous_get,
                                                       dev_row_mapping_get,
@@ -11,13 +11,13 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
    const u_int n_seq_tokens = T / B;
    sycl::range<1> block_dims((C / H));
    sycl::range<1> grid_dims((B * H));
-    sycl_launch(stream, [&](sycl::handler & cgh) {
+    stream->submit([&](sycl::handler & cgh) {
        /* local memory accessors*/
        auto _k  = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
        auto _r  = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
        auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);

-        sycl_parallel_for<1>(cgh, sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
+        cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
            u_int tid = item.get_local_id(0);
            u_int bid = item.get_group(0);

@@ -70,7 +70,7 @@ static void im2col_sycl_internal(const float * x, T * dst, int64_t IW, int64_t I

    const int64_t CHW = IC * KH * KW;

-    sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
+    stream->parallel_for(sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
        im2col_kernel<T>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1,
                         p0, p1, d0, d1, item_ct1);
    });
@@ -1818,7 +1818,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1829,8 +1829,9 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q4_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -1852,7 +1853,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
@@ -1863,8 +1864,9 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q4_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -1931,7 +1933,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1942,8 +1944,9 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q4_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -1965,7 +1968,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
@@ -1976,8 +1979,9 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q4_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2044,7 +2048,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2055,8 +2059,9 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q5_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2078,7 +2083,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
@@ -2089,8 +2094,9 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q5_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2157,7 +2163,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2168,8 +2174,9 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q5_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2191,7 +2198,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
@@ -2202,8 +2209,9 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q5_1<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2270,7 +2278,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2281,8 +2289,9 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q8_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2304,7 +2313,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
@@ -2315,8 +2324,9 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q8_0<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2383,7 +2393,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2396,8 +2406,9 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q2_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2420,7 +2431,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
@@ -2433,8 +2444,9 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q2_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2504,7 +2516,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2519,8 +2531,9 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q3_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2544,7 +2557,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
@@ -2559,8 +2572,9 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q3_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2630,7 +2644,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2643,8 +2657,9 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q4_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2667,7 +2682,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
@@ -2680,8 +2695,9 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q4_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2749,7 +2765,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2762,8 +2778,9 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q5_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2786,7 +2803,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
@@ -2799,8 +2816,9 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q5_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2868,7 +2886,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2881,8 +2899,9 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q6_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -2905,7 +2924,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
            dpct::has_capability_or_fail(stream->get_device(),
                                         {sycl::aspect::fp16});

-            sycl_launch(stream, [&](sycl::handler & cgh) {
+            stream->submit([&](sycl::handler &cgh) {
                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
@@ -2918,8 +2937,9 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);

-                sycl_parallel_for(
-                    cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
                        mul_mat_q6_K<need_check>(
                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                            nrows_dst, item_ct1,
@@ -544,12 +544,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy,
    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

-    sycl_launch(stream, [&](sycl::handler & cgh) {
-        sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
-                          [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
-                                                                                            nd_item);
-                          });
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
+                                                                                           nd_item);
+                         });
    });
 }

@@ -561,12 +561,12 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float *
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);

    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                                     vx, vy, dst, ncols, nrows, item_ct1);
+                             });
        });
    }
 }
@@ -580,12 +580,17 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
+                                      VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -599,12 +604,17 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
+                                      VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -618,12 +628,17 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
+                                      VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -637,12 +652,17 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
+                                      VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -656,12 +676,17 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
+                                      VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -675,12 +700,17 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
+                                      VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -694,12 +724,17 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
+                                      VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -715,12 +750,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy,
    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

-    sycl_launch(stream, [&](sycl::handler & cgh) {
-        sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
-                          [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols, nrows,
-                                                                                            nd_item);
-                          });
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                            [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
+                                                                                            nrows, nd_item);
+                            });
    });
 }

@@ -734,12 +769,17 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
+                                      VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -754,12 +794,12 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy,
    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);

-    sycl_launch(stream, [&](sycl::handler & cgh) {
-        sycl_parallel_for(cgh, sycl::nd_range<3>(global_size, workgroup_size),
-                          [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                              mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
-                                                                                            nd_item);
-                          });
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
+                                                                                           nd_item);
+                         });
    });
 }
 static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
@@ -771,12 +811,17 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
-                                      vx, vy, dst, ncols, nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
+                                      VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -791,12 +836,14 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS / 2, block_iq2_xxs, 1>(vx, vy, dst, ncols,
-                                                                                                  nrows, item_ct1);
-                              });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -810,12 +857,14 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS / 2, block_iq2_xs, 1>(vx, vy, dst, ncols,
-                                                                                               nrows, item_ct1);
-                              });
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -829,12 +878,15 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S / 2, block_iq2_s, 1>(vx, vy, dst, ncols, nrows,
-                                                                                            item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -848,12 +900,15 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS / 2, block_iq3_xxs, 1>(vx, vy, dst, ncols,
-                                                                                                  nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -867,12 +922,15 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S / 2, block_iq3_s, 1>(vx, vy, dst, ncols, nrows,
-                                                                                            item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -886,12 +944,15 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(vx, vy, dst, ncols, nrows,
-                                                                                        item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -905,12 +966,14 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(vx, vy, dst, ncols, nrows,
-                                                                                        item_ct1);
-                              });
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -924,12 +987,15 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(vx, vy, dst, ncols, nrows,
-                                                                                             item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -943,12 +1009,15 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS / 4, block_iq4_xs, 1>(vx, vy, dst, ncols,
-                                                                                               nrows, item_ct1);
-                              });
+
+        stream->submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                        mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1);
+                    });
        });
    }
 }
@@ -254,13 +254,14 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
    GGML_ASSERT(ncols % WARP_SIZE == 0);
    if (ncols < 1024) {
        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
-                                           nullptr, WARP_SIZE);
-                              });
-        });
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
+                });
+            });
    }
    else {
        const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -271,15 +272,16 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
                            sycl::range<1>(work_group_size / WARP_SIZE), cgh);
-            sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
-                                           get_pointer(s_sum_acc_ct1), work_group_size);
-                              });
-        });
+            cgh.parallel_for(
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
+                });
+            });
    }
 }

@@ -288,14 +290,18 @@ static void group_norm_f32_sycl(const float* x, float* dst,
    const int ne_elements, queue_ptr stream, int device) {
    if (group_size < 1024) {
        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            const float eps_ct4 = eps;
-            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr,
-                                                 WARP_SIZE);
-                              });
-        });
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    group_norm_f32(
+                        x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+            });
    }
    else {
        const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -307,18 +313,22 @@ static void group_norm_f32_sycl(const float* x, float* dst,
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */

-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
                cgh);

            const float eps_ct4 = eps;

-            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                                                 get_pointer(s_sum_acc_ct1), work_group_size);
-                              });
-        });
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    group_norm_f32(x, dst, group_size, ne_elements,
+                        eps_ct4, item_ct1,
+                        get_pointer(s_sum_acc_ct1), work_group_size);
+                });
+            });
    }
 }

@@ -330,13 +340,14 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
    const sycl::range<3> global_dims(nsamples, nchannels, nrows);
    if (ncols < 1024) {
        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
-                                               nullptr, WARP_SIZE);
-                              });
-        });
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
+                });
+            });
    }
    else {
        const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -347,15 +358,16 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
                cgh);
-            sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
-                                               get_pointer(s_sum_acc_ct1), work_group_size);
-                              });
-        });
+            cgh.parallel_for(
+                sycl::nd_range<3>(global_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
+                });
+            });
    }
 }

@@ -366,12 +378,16 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
    if (ncols < 1024) {
        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        sycl_launch(stream, [&](sycl::handler & cgh) {
-            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  l2_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE);
-                              });
-        });
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    l2_norm_f32(x, dst, ncols, eps, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+            });
    }
    else {
        const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -382,15 +398,18 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
                cgh);
-            sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-                              [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                  l2_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1),
-                                              work_group_size);
-                              });
-        });
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                    l2_norm_f32(x, dst, ncols, eps, item_ct1,
+                        get_pointer(s_sum_acc_ct1), work_group_size);
+                });
+            });
    }
 }

@@ -232,22 +232,20 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-                                                  attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
-                          });
+        stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
+                                theta_scale, freq_factors, item_ct1);
+        });
    } else {
        /*
        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-                                                 attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
-                          });
+        stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
+                               theta_scale, freq_factors, item_ct1);
+        });
    }
 }

@@ -266,17 +264,15 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

    if (freq_factors == nullptr) {
-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-                                                  attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
-                          });
+        stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
+                                theta_scale, freq_factors, item_ct1);
+        });
    } else {
-        sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                          [=](sycl::nd_item<3> item_ct1) {
-                              rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
-                                                 attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
-                          });
+        stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
+                               theta_scale, freq_factors, item_ct1);
+        });
    }
 }

@@ -299,12 +295,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1,
    }
    // launch kernel
    if (freq_factors == nullptr) {
-        sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
+        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
            rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
                                  corr_dims, theta_scale, freq_factors, sections, item_ct1);
        });
    } else {
-        sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
+        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
            rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
                                 corr_dims, theta_scale, freq_factors, sections, item_ct1);
        });
@@ -334,12 +330,12 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
    }
    // launch kernel
    if (freq_factors == nullptr) {
-        sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
+        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
            rope_vision<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
                                  corr_dims, theta_scale, freq_factors, sections, item_ct1);
        });
    } else {
-        sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
+        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
            rope_vision<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
                                 corr_dims, theta_scale, freq_factors, sections, item_ct1);
        });
@@ -48,7 +48,7 @@ static void set_rows_sycl_q(const char * __restrict__ src0_d,
    constexpr int block_size   = 256;
    const int64_t grid_size    = ceil_div(total_blocks, block_size);

-    sycl_parallel_for(stream, sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
+    stream->parallel_for(sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
        const int64_t i = item_ct1.get_global_linear_id();
        if (i >= total_blocks) {
            return;
@@ -129,8 +129,7 @@ static void set_rows_sycl(
    constexpr int block_size = 64;
    const int64_t grid_size = ceil_div(total_elements, block_size);

-    sycl_parallel_for(
-        stream,
+    stream->parallel_for(
        sycl::nd_range<1>(grid_size * block_size, block_size),
        [=](sycl::nd_item<1> item_ct1) {
            k_set_rows<TIn, TOut>(
@@ -127,11 +127,11 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
                                   const int nrows_y, const float scale, const float max_bias, const float m0,
                                   const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                   const size_t n_local_scratch, queue_ptr stream) {
-    sycl_launch(stream, [&](sycl::handler & cgh) {
+    stream->submit([&](sycl::handler &cgh) {
        sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);

-        sycl_parallel_for(
-            cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                             nrows_y, scale, max_bias, m0,
@@ -45,9 +45,14 @@ static void timestep_embedding_f32_sycl(
    int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
    sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
    sycl::range<3> gridDim(1, ne00, num_blocks);
-    sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
-        timestep_embedding_f32(x, dst, nb1, dim, max_period, item_ct1);
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(
+            gridDim * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) {
+            timestep_embedding_f32(
+                x, dst, nb1, dim, max_period, item_ct1
+            );
+        });
 }

 void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -207,11 +207,12 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

    // Submit kernel
    if (C / H == WKV_BLOCK_SIZE) {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);

-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
                    rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE>(
                        B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
                        item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
@@ -219,11 +220,12 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
                });
        });
    } else {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);

-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
                    rwkv_wkv6_f32_kernel<WKV_BLOCK_SIZE * 2>(
                        B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d,
                        item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
@@ -262,11 +264,12 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {

    // Submit kernel
    if (C / H == WKV_BLOCK_SIZE) {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);

-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
                    rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE>(
                        B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
                        item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
@@ -274,11 +277,12 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
                });
        });
    } else {
-        sycl_launch(stream, [&](sycl::handler & cgh) {
+        stream->submit([&](sycl::handler& cgh) {
            sycl::local_accessor<float, 1> shared_mem_acc(shared_mem_size, cgh);

-            sycl_parallel_for(
-                cgh, sycl::nd_range<3>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
                    rwkv_wkv7_f32_kernel<WKV_BLOCK_SIZE * 2>(
                        B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d,
                        item_ct1, (float*)shared_mem_acc.get_multi_ptr<sycl::access::decorated::no>().get()
@@ -5,8 +5,14 @@
 #include "ggml-cpu.h"
 #endif

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
+#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
+
 #include <vulkan/vulkan.hpp>

+// See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
+
 #include <algorithm>
 #include <cmath>
 #include <iomanip>
@@ -121,6 +127,8 @@ struct vk_pipeline_struct {
    bool needed {};
    // set to true when the shader has been compiled
    bool compiled {};
+    // number of registers used, extracted from pipeline executable properties
+    uint32_t register_count {};
 };

 typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
@@ -429,6 +437,8 @@ struct vk_device_struct {

    bool coopmat2;

+    bool pipeline_executable_properties_support {};
+
    size_t idx;

    bool mul_mat_l[GGML_TYPE_COUNT];
@@ -1221,8 +1231,6 @@ static std::string format_size(size_t size) {
    return oss.str();
 }

-static std::mutex log_mutex;
-
 class vk_memory_logger {
 public:
    vk_memory_logger(): total_device(0), total_host(0) {}
@@ -1412,6 +1420,8 @@ struct ggml_backend_vk_buffer_context {
 };

 #ifdef GGML_VULKAN_MEMORY_DEBUG
+static std::mutex log_mutex;
+
 void vk_memory_logger::log_allocation(vk_buffer_ref buf_ref, size_t size) {
    std::lock_guard<std::mutex> guard(log_mutex);
    vk_buffer buf = buf_ref.lock();
@@ -1603,6 +1613,20 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
        vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
    }

+    if (device->pipeline_executable_properties_support) {
+        vk::PipelineExecutableInfoKHR executableInfo;
+        executableInfo.pipeline = pipeline->pipeline;
+
+        auto statistics = device->device.getPipelineExecutableStatisticsKHR(executableInfo);
+        for (auto & s : statistics) {
+            // "Register Count" is reported by NVIDIA drivers.
+            if (strcmp(s.name, "Register Count") == 0) {
+                VK_LOG_DEBUG(pipeline->name << " " << s.name << ": " << s.value.u64 << " registers");
+                pipeline->register_count = (uint32_t)s.value.u64;
+            }
+        }
+    }
+
    {
        std::lock_guard<std::recursive_mutex> guard(device->mutex);
        device->all_pipelines.push_back(pipeline);
@@ -1960,7 +1984,7 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
        }
    }

-    if (buf->device_memory == VK_NULL_HANDLE) {
+    if (!buf->device_memory) {
        device->device.destroyBuffer(buf->buffer);
        throw vk::OutOfDeviceMemoryError("No suitable memory type found");
    }
@@ -3610,6 +3634,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
        bool amd_shader_core_properties2 = false;
        bool pipeline_robustness = false;
        bool coopmat2_support = false;
+        bool pipeline_executable_properties_support = false;
        device->coopmat_support = false;
        device->integer_dot_product = false;
        bool bfloat16_support = false;
@@ -3652,6 +3677,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
                       !getenv("GGML_VK_DISABLE_BFLOAT16")) {
                bfloat16_support = true;
 #endif
+            } else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
+                pipeline_executable_properties_support = true;
            }
        }

@@ -3878,8 +3905,18 @@ static vk_device ggml_vk_get_device(size_t idx) {
            device_extensions.push_back("VK_KHR_shader_integer_dot_product");
        }

+        VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR pep_features {};
+        pep_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
+        if (pipeline_executable_properties_support) {
+            last_struct->pNext = (VkBaseOutStructure *)&pep_features;
+            last_struct = (VkBaseOutStructure *)&pep_features;
+            device_extensions.push_back("VK_KHR_pipeline_executable_properties");
+        }
+
        vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);

+        device->pipeline_executable_properties_support = pipeline_executable_properties_support;
+
        device->fp16 = device->fp16 && vk12_features.shaderFloat16;

 #if defined(VK_KHR_shader_bfloat16)
@@ -4395,6 +4432,9 @@ static void ggml_vk_instance_init() {
    }
    VK_LOG_DEBUG("ggml_vk_instance_init()");

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+
    uint32_t api_version = vk::enumerateInstanceVersion();

    if (api_version < VK_API_VERSION_1_2) {
@@ -4462,6 +4502,9 @@ static void ggml_vk_instance_init() {

    vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;

+    // See https://github.com/KhronosGroup/Vulkan-Hpp?tab=readme-ov-file#extensions--per-device-function-pointers-
+    VULKAN_HPP_DEFAULT_DISPATCHER.init(vk_instance.instance);
+
    std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();

    // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -4497,7 +4540,7 @@ static void ggml_vk_instance_init() {
            new_driver.pNext = &new_id;
            devices[i].getProperties2(&new_props);

-            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+            if (new_props.properties.deviceType == vk::PhysicalDeviceType::eDiscreteGpu || new_props.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
                // Check if there are two physical devices corresponding to the same GPU
                auto old_device = std::find_if(
                    vk_instance.device_indices.begin(),
@@ -4567,7 +4610,7 @@ static void ggml_vk_instance_init() {
            }
        }

-        // If no dedicated GPUs found, fall back to the first non-CPU device.
+        // If no GPUs found, fall back to the first non-CPU device.
        // If only CPU devices are available, return without devices.
        if (vk_instance.device_indices.empty()) {
            for (size_t i = 0; i < devices.size(); i++) {
@@ -12078,12 +12121,63 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
    }
 }

+static vk::PhysicalDeviceType ggml_backend_vk_get_device_type(int device_idx) {
+    GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
+
+    vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
+
+    vk::PhysicalDeviceProperties2 props = {};
+    device.getProperties2(&props);
+
+    return props.properties.deviceType;
+}
+
+static std::string ggml_backend_vk_get_device_pci_id(int device_idx) {
+    GGML_ASSERT(device_idx >= 0 && device_idx < (int) vk_instance.device_indices.size());
+
+    vk::PhysicalDevice device = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device_idx]];
+
+    const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
+
+    bool ext_support = false;
+
+    for (const auto& properties : ext_props) {
+        if (strcmp("VK_EXT_pci_bus_info", properties.extensionName) == 0) {
+            ext_support = true;
+            break;
+        }
+    }
+
+    if (!ext_support) {
+        return "";
+    }
+
+    vk::PhysicalDeviceProperties2 props = {};
+    vk::PhysicalDevicePCIBusInfoPropertiesEXT pci_bus_info = {};
+
+    props.pNext = &pci_bus_info;
+
+    device.getProperties2(&props);
+
+    const uint32_t pci_domain = pci_bus_info.pciDomain;
+    const uint32_t pci_bus = pci_bus_info.pciBus;
+    const uint32_t pci_device = pci_bus_info.pciDevice;
+    const uint8_t pci_function = (uint8_t) pci_bus_info.pciFunction; // pci function is between 0 and 7, prevent printf overflow warning
+
+    char pci_bus_id[16] = {};
+    snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.%x", pci_domain, pci_bus, pci_device, pci_function);
+
+    return std::string(pci_bus_id);
+}
+
 //////////////////////////

 struct ggml_backend_vk_device_context {
    size_t device;
    std::string name;
    std::string description;
+    bool is_integrated_gpu;
+    std::string pci_bus_id;
 };

 static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
@@ -12112,16 +12206,18 @@ static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(gg
 }

 static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
-    UNUSED(dev);
-    // TODO: return GGML_BACKEND_DEVICE_TYPE_IGPU for integrated GPUs
-    return GGML_BACKEND_DEVICE_TYPE_GPU;
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+
+    return ctx->is_integrated_gpu ? GGML_BACKEND_DEVICE_TYPE_IGPU : GGML_BACKEND_DEVICE_TYPE_GPU;
 }

 static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+
    props->name        = ggml_backend_vk_device_get_name(dev);
    props->description = ggml_backend_vk_device_get_description(dev);
    props->type        = ggml_backend_vk_device_get_type(dev);
-    // TODO: set props->device_id to PCI bus id
+    props->device_id   = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
    ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = {
        /* .async                 = */ false,
@@ -12388,8 +12484,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                }

                if (
-                    src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32 ||
-                    src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32
+                    (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) ||
+                    (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32)
                ) {
                    return true;
                }
@@ -12554,6 +12650,8 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                ctx->device = i;
                ctx->name = GGML_VK_NAME + std::to_string(i);
                ctx->description = desc;
+                ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
+                ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
                devices.push_back(new ggml_backend_device {
                    /* .iface   = */ ggml_backend_vk_device_i,
                    /* .reg     = */ reg,
@@ -13040,16 +13138,16 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
    } else if (tensor->op == GGML_OP_IM2COL_3D) {
        const int32_t s0 = tensor->op_params[0];
        const int32_t s1 = tensor->op_params[1];
-        const int32_t s1 = tensor->op_params[2];
+        const int32_t s2 = tensor->op_params[2];
        const int32_t p0 = tensor->op_params[3];
        const int32_t p1 = tensor->op_params[4];
-        const int32_t p1 = tensor->op_params[5];
+        const int32_t p2 = tensor->op_params[5];
        const int32_t d0 = tensor->op_params[6];
        const int32_t d1 = tensor->op_params[7];
-        const int32_t d1 = tensor->op_params[8];
+        const int32_t d2 = tensor->op_params[8];
        const int32_t IC = tensor->op_params[9];

-        tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
+        tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type);
    } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) {
        const int32_t dim = tensor->op_params[0];
        const int32_t max_period = tensor->op_params[1];
@@ -29,7 +29,7 @@ void main() {
        uint qs = data_a[ib].qs[4 * ib32 + l];
        const uint8_t sign = data_a[ib].qs[QUANT_K / 8 + 4 * ib32 + l];
        qs |= (qh << (8 - 2 * l)) & 0x300;
-        const uvec2 grid = iq2s_grid[qs & 511];
+        const uvec2 grid = iq2s_grid[qs];
        const u8vec4 grid0 = unpack8(grid.x);
        const u8vec4 grid1 = unpack8(grid.y);
        data_b[b_idx + 8 * l + 0] = D_TYPE(db[l/2] * grid0.x * ((sign & 1) != 0 ? -1.0 : 1.0));
@@ -33,7 +33,8 @@ void main() {
    [[unroll]] for (uint l = 0; l < 4; ++l) {
        const uint sign7 = bitfieldExtract(signscale, 7 * int(l), 7);
        const uint sign8 = sign7 | (bitCount(sign7) << 7); // parity bit
-        const uvec2 grid = iq2xxs_grid[data_a[ib].qs[8 * is + l]];
+        const uint qs = data_a[ib].qs[8 * is + l];
+        const uvec2 grid = iq2xxs_grid[qs];
        const u8vec4 grid0 = unpack8(grid.x);
        const u8vec4 grid1 = unpack8(grid.y);
        data_b[b_idx + 8 * l + 0] = D_TYPE(db * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0));
@@ -22,15 +22,16 @@ void main() {
    const uint b_idx = 256 * ib + 32 * is;

    const float d = float(data_a[ib].d);
-    const float db = d * (1 + 2 * ((data_a[ib].scales[is] >> (4 * (is % 2))) & 0xf));
+    const float db = d * (1 + 2 * ((data_a[ib].scales[is / 2] >> (4 * (is % 2))) & 0xf));

    // We must produce 32 values using 4 sign bytes, 1 qh byte, 8 qs bytes.
    uint qh = data_a[ib].qh[is];
    [[unroll]] for (uint l = 0; l < 8; ++l) {
-        uint qs = data_a[ib].qs[8 * is + l];
-        uint gidx = qs | ((qh << (8 - l)) & 256);
-        uint8_t signs = data_a[ib].signs[8 * is + l / 2] >> (4 * (l & 1));
-        u8vec4 grid = unpack8(iq3s_grid[gidx]);
+        const uint iqs = 8 * is + l;
+        const uint qs = data_a[ib].qs[iqs];
+        const uint gidx = qs | ((qh << (8 - l)) & 256);
+        const uint8_t signs = data_a[ib].signs[iqs / 2] >> (4 * (l & 1));
+        const u8vec4 grid = unpack8(iq3s_grid[gidx]);
        data_b[b_idx + 4 * l + 0] = D_TYPE(db * grid.x * ((signs & 1) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 4 * l + 1] = D_TYPE(db * grid.y * ((signs & 2) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 4 * l + 2] = D_TYPE(db * grid.z * ((signs & 4) != 0 ? -1.0 : 1.0));
@@ -35,8 +35,10 @@ void main() {
        const uint sign7 = bitfieldExtract(signscale, 7 * int(l), 7);
        // Restore parity bit.
        const uint sign8 = sign7 | (bitCount(sign7) << 7);
-        const u8vec4 grid0 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l]]);
-        const u8vec4 grid1 = unpack8(iq3xxs_grid[data_a[ib].qs[8 * is + 2 * l + 1]]);
+        const uint qs0 = data_a[ib].qs[8 * is + 2 * l];
+        const uint qs1 = data_a[ib].qs[8 * is + 2 * l + 1];
+        const u8vec4 grid0 = unpack8(iq3xxs_grid[qs0]);
+        const u8vec4 grid1 = unpack8(iq3xxs_grid[qs1]);
        data_b[b_idx + 8 * l + 0] = D_TYPE(db * grid0.x * ((sign8 & 1) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 1] = D_TYPE(db * grid0.y * ((sign8 & 2) != 0 ? -1.0 : 1.0));
        data_b[b_idx + 8 * l + 2] = D_TYPE(db * grid0.z * ((sign8 & 4) != 0 ? -1.0 : 1.0));
@@ -183,6 +183,8 @@ void load_row_ids(uint expert_idx, bool nei0_is_pow2, uint ic) {
 shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
 #endif

+#include "mul_mm_funcs.comp"
+
 void main() {
 #ifdef NEEDS_INIT_IQ_SHMEM
    init_iq_shmem(gl_WorkGroupSize);
@@ -310,550 +312,13 @@ void main() {

    for (uint block = start_k; block < end_k; block += BK) {
        [[unroll]] for (uint l = 0; l < BM; l += loadstride_a) {
-
-#if defined(DATA_A_F32) || defined(DATA_A_F16)
-#if LOAD_VEC_A == 8
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-            A_TYPE32 aa = A_TYPE32(data_a[idx]);
-            buf_a[buf_idx    ] = FLOAT_TYPE(aa[0].x);
-            buf_a[buf_idx + 1] = FLOAT_TYPE(aa[0].y);
-            buf_a[buf_idx + 2] = FLOAT_TYPE(aa[0].z);
-            buf_a[buf_idx + 3] = FLOAT_TYPE(aa[0].w);
-            buf_a[buf_idx + 4] = FLOAT_TYPE(aa[1].x);
-            buf_a[buf_idx + 5] = FLOAT_TYPE(aa[1].y);
-            buf_a[buf_idx + 6] = FLOAT_TYPE(aa[1].z);
-            buf_a[buf_idx + 7] = FLOAT_TYPE(aa[1].w);
-#elif LOAD_VEC_A == 4
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-            A_TYPE32 aa = A_TYPE32(data_a[idx]);
-            buf_a[buf_idx    ] = FLOAT_TYPE(aa.x);
-            buf_a[buf_idx + 1] = FLOAT_TYPE(aa.y);
-            buf_a[buf_idx + 2] = FLOAT_TYPE(aa.z);
-            buf_a[buf_idx + 3] = FLOAT_TYPE(aa.w);
-#else
-            if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
-                buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
-            } else {
-                buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f);
-            }
-#endif
-#elif defined(DATA_A_BF16)
-#if LOAD_VEC_A == 4
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-            buf_a[buf_idx    ] = TO_FLOAT_TYPE(data_a[idx].x);
-            buf_a[buf_idx + 1] = TO_FLOAT_TYPE(data_a[idx].y);
-            buf_a[buf_idx + 2] = TO_FLOAT_TYPE(data_a[idx].z);
-            buf_a[buf_idx + 3] = TO_FLOAT_TYPE(data_a[idx].w);
-#else
-            if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) {
-                buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]);
-            } else {
-                buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = TO_FLOAT_TYPE(uint16_t(0));
-            }
-#endif
-#elif defined(DATA_A_Q4_0)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a;
-
-            const uint ib = idx / 4;
-            const uint iqs = idx & 0x03;
-
-            const float d = float(data_a_packed16[ib].d);
-            const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
-            const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
-            const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
-
-            buf_a[buf_idx     ] = FLOAT_TYPE(v0.x);
-            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
-            buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
-            buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
-            buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
-            buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
-            buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
-            buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
-#elif defined(DATA_A_Q4_1)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a;
-
-            const uint ib = idx / 4;
-            const uint iqs = idx & 0x03;
-
-            const float d = float(data_a_packed16[ib].d);
-            const float m = float(data_a_packed16[ib].m);
-            const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
-            const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
-            const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
-
-            buf_a[buf_idx     ] = FLOAT_TYPE(v0.x);
-            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
-            buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
-            buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
-            buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
-            buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
-            buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
-            buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
-#elif defined(DATA_A_Q5_0)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
-
-            const uint ib = idx / 8;
-            const uint iqs = idx & 0x07;
-
-            const float d = float(data_a_packed16[ib].d);
-            const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]);
-            const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
-            const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
-
-            const uint vui = uint(data_a_packed16[ib].qs[iqs]);
-            const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d;
-
-            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
-            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
-            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
-            buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
-#elif defined(DATA_A_Q5_1)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
-
-            const uint ib = idx / 8;
-            const uint iqs = idx & 0x07;
-
-            const float d = float(data_a_packed16[ib].d);
-            const float m = float(data_a_packed16[ib].m);
-            const uint uint_qh = data_a_packed16[ib].qh;
-            const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
-            const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
-
-            const uint vui = uint(data_a_packed16[ib].qs[iqs]);
-            const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m;
-
-            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
-            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
-            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
-            buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
-#elif defined(DATA_A_Q8_0)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 8;
-            const uint iqs = idx & 0x07;
-
-            const float d = float(data_a_packed16[ib].d);
-            const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147
-            const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy;
-            const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;
-
-            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
-            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
-            buf_a[buf_idx + 2] = FLOAT_TYPE(v.z);
-            buf_a[buf_idx + 3] = FLOAT_TYPE(v.w);
-#elif defined(DATA_A_Q2_K)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 128;                         // 2 values per idx
-            const uint iqs = idx % 128;                        // 0..127
-
-            const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
-            const uint scalesi = iqs / 8;                      // 0..15
-            const uint qsshift = ((iqs % 64) / 16) * 2;        // 0,2,4,6
-
-            const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
-            const uint scales = data_a[ib].scales[scalesi];
-            const vec2 d = vec2(data_a[ib].d);
-
-            const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
-
-            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
-            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
-#elif defined(DATA_A_Q3_K)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 128;                   // 2 values per idx
-            const uint iqs = idx % 128;                  // 0..127
-
-            const uint n = iqs / 64;                     // 0,1
-            const uint qsi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
-            const uint hmi =          (iqs % 16) * 2;    // 0,2,4..30
-            const uint j = (iqs % 64) / 4;               // 0..3
-            const uint is = iqs / 8;                     // 0..15
-            const uint halfsplit = ((iqs % 64) / 16);    // 0,1,2,3
-            const uint qsshift = halfsplit * 2;          // 0,2,4,6
-            const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128
-
-            const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
-                                  | (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
-            const float dl = float(data_a[ib].d) * float(us - 32);
-
-            buf_a[buf_idx    ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi    ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi    ] & m) != 0) ? 0 : 4)));
-            buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
-#elif defined(DATA_A_Q4_K)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 128;                 // 2 values per idx
-            const uint iqs = idx % 128;                // 0..127
-
-            const uint n = iqs / 32;                   // 0,1,2,3
-            const uint b = (iqs % 32) / 16;            // 0,1
-            const uint is = 2 * n + b;                 // 0..7
-            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
-
-            const vec2 loadd = vec2(data_a[ib].d);
-
-            const uint scidx0 = (is < 4) ? is : (is + 4);
-            const uint scidx1 = (is < 4) ? is : (is - 4);
-            const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
-            const uint scidxshift1 = (is < 4) ? 0 : 2;
-            const uint mbidx0 = is + 4;
-            const uint mbidx1 = (is < 4) ? is + 4 : is;
-            const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
-            const uint mbidxshift0 = (is < 4) ? 0 : 4;
-            const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
-            const uint mbidxshift1 = (is < 4) ? 0 : 2;
-
-            const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
-            const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
-
-            const float d = loadd.x * sc;
-            const float m = -loadd.y * mbyte;
-
-            buf_a[buf_idx    ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF), m));
-            buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
-#elif defined(DATA_A_Q5_K)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 128;                 // 2 values per idx
-            const uint iqs = idx % 128;                // 0..127
-
-            const uint n = iqs / 32;                   // 0,1,2,3
-            const uint b = (iqs % 32) / 16;            // 0,1
-            const uint is = 2 * n + b;                 // 0..7
-            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
-            const uint qhi = (iqs % 16) * 2;           // 0,2,4..30
-
-            const uint8_t hm = uint8_t(1 << (iqs / 16));
-
-            const vec2 loadd = vec2(data_a[ib].d);
-
-            const uint scidx0 = (is < 4) ? is : (is + 4);
-            const uint scidx1 = (is < 4) ? is : (is - 4);
-            const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
-            const uint scidxshift1 = (is < 4) ? 0 : 2;
-            const uint mbidx0 = is + 4;
-            const uint mbidx1 = (is < 4) ? is + 4 : is;
-            const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
-            const uint mbidxshift0 = (is < 4) ? 0 : 4;
-            const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
-            const uint mbidxshift1 = (is < 4) ? 0 : 2;
-
-            const uint8_t sc    = uint8_t((data_a[ib].scales[scidx0] & 0xF)                         | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
-            const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
-
-            const float d = loadd.x * sc;
-            const float m = -loadd.y * mbyte;
-
-            buf_a[buf_idx    ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi    ] & hm) != 0 ? 16 : 0), m));
-            buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
-#elif defined(DATA_A_Q6_K)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 128;                  // 2 values per idx
-            const uint iqs = idx % 128;                 // 0..127
-
-            const uint n = iqs / 64;                    // 0,1
-            const uint b = (iqs % 64) / 32;             // 0,1
-            const uint is_b = (iqs % 16) / 8;           // 0,1
-            const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
-            const uint is = 8 * n + qhshift + is_b;     // 0..15
-            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
-            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
-
-            const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
-
-            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
-            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
-#elif defined(DATA_A_IQ1_S)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 32;                  // 8 values per idx
-            const uint ib32 = (idx % 32) / 4;         // 0..7
-            const uint ib8 = idx % 32;
-
-            const float d = float(data_a[ib].d);
-            const uint qh = data_a[ib].qh[ib32];
-            const uint qs = data_a[ib].qs[ib8];
-            const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1);
-            const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
-            const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]);
-
-            [[unroll]] for (int k = 0; k < 8; ++k) {
-                buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
-            }
-#elif defined(DATA_A_IQ1_M)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 32;  // 8 values per idx
-            const uint ib8 = idx % 32;
-            const uint ib16 = ib8 / 2;
-
-            const uint16_t[4] scales = data_a[ib].scales;
-            const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
-            const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
-            const uint sc = scales[ib8 / 8];
-            const uint qs = data_a[ib].qs[ib8];
-            const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1));
-            const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1);
-            const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
-            const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
-
-            [[unroll]] for (int k = 0; k < 8; ++k) {
-                buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
-            }
-#elif defined(DATA_A_IQ2_XXS)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 32;                 // 8 values per idx
-            const uint ib32 = (idx % 32) / 4;         // 0..7
-            const uint ib8 = idx % 4;
-
-            const float d = float(data_a[ib].d);
-            const uint qs = data_a[ib].qs[8 * ib32 + ib8];
-            const uint signs = pack32(u8vec4(
-                data_a[ib].qs[8*ib32 + 4],
-                data_a[ib].qs[8*ib32 + 5],
-                data_a[ib].qs[8*ib32 + 6],
-                data_a[ib].qs[8*ib32 + 7]
-            ));
-            const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28)));
-            const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
-            const uint sign = sign7 | (bitCount(sign7) << 7);
-            const uvec2 grid = iq2xxs_grid[qs];
-            const vec4 grid0 = vec4(unpack8(grid.x));
-            const vec4 grid1 = vec4(unpack8(grid.y));
-
-            buf_a[buf_idx    ] = db * FLOAT_TYPE((sign &   1) != 0 ? -grid0.x : grid0.x);
-            buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign &   2) != 0 ? -grid0.y : grid0.y);
-            buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign &   4) != 0 ? -grid0.z : grid0.z);
-            buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign &   8) != 0 ? -grid0.w : grid0.w);
-            buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign &  16) != 0 ? -grid1.x : grid1.x);
-            buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign &  32) != 0 ? -grid1.y : grid1.y);
-            buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign &  64) != 0 ? -grid1.z : grid1.z);
-            buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
-#elif defined(DATA_A_IQ2_XS)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 32;            // 8 values per idx
-            const uint ib32 = (idx % 32) / 4;    // 0..7
-            const uint ib8 = idx % 4;            // 0..3
-
-            const float d = float(data_a[ib].d);
-            const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
-            const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
-            const uint qs = data_a[ib].qs[4 * ib32 + ib8];
-            const uint sign7 = qs >> 9;
-            const uint sign = sign7 | (bitCount(sign7) << 7);
-            const uvec2 grid = iq2xs_grid[qs & 511];
-            const vec4 grid0 = vec4(unpack8(grid.x));
-            const vec4 grid1 = vec4(unpack8(grid.y));
-
-            buf_a[buf_idx    ] = db * FLOAT_TYPE((sign &   1) != 0 ? -grid0.x : grid0.x);
-            buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign &   2) != 0 ? -grid0.y : grid0.y);
-            buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign &   4) != 0 ? -grid0.z : grid0.z);
-            buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign &   8) != 0 ? -grid0.w : grid0.w);
-            buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign &  16) != 0 ? -grid1.x : grid1.x);
-            buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign &  32) != 0 ? -grid1.y : grid1.y);
-            buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign &  64) != 0 ? -grid1.z : grid1.z);
-            buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
-#elif defined(DATA_A_IQ2_S)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 32;  // 8 values per idx
-            const uint ib8 = idx % 32; // 0..31
-            const uint ib32 = ib8 / 4; // 0..7
-
-            const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
-            const uint qs = data_a[ib].qs[ib8];
-            const uint qh = data_a[ib].qh[ib32];
-            const uint qhshift = 2 * (ib8 % 4);
-            const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8];
-
-            const float d = float(data_a[ib].d);
-            const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
-            const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)];
-            const vec4 grid0 = vec4(unpack8(grid.x));
-            const vec4 grid1 = vec4(unpack8(grid.y));
-
-            buf_a[buf_idx    ] = db * FLOAT_TYPE((sign &   1) != 0 ? -grid0.x : grid0.x);
-            buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign &   2) != 0 ? -grid0.y : grid0.y);
-            buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign &   4) != 0 ? -grid0.z : grid0.z);
-            buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign &   8) != 0 ? -grid0.w : grid0.w);
-            buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign &  16) != 0 ? -grid1.x : grid1.x);
-            buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign &  32) != 0 ? -grid1.y : grid1.y);
-            buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign &  64) != 0 ? -grid1.z : grid1.z);
-            buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
-#elif defined(DATA_A_IQ3_XXS)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 64;            // 4 values per idx
-            const uint iqs = idx % 64;           // 0..63
-            const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
-
-            const float d = float(data_a[ib].d);
-            const uint qs = data_a[ib].qs[iqs];
-            const uint signs = pack32(u8vec4(
-                data_a[ib].qs[is+0],
-                data_a[ib].qs[is+1],
-                data_a[ib].qs[is+2],
-                data_a[ib].qs[is+3]
-            ));
-            const float db = d * 0.5 * (0.5 + (signs >> 28));
-            const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
-            const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2));
-            const uint grid = iq3xxs_grid[qs];
-            const vec4 v = db * vec4(unpack8(grid));
-
-            buf_a[buf_idx    ] = FLOAT_TYPE((sign &   1) != 0 ? -v.x : v.x);
-            buf_a[buf_idx + 1] = FLOAT_TYPE((sign &   2) != 0 ? -v.y : v.y);
-            buf_a[buf_idx + 2] = FLOAT_TYPE((sign &   4) != 0 ? -v.z : v.z);
-            buf_a[buf_idx + 3] = FLOAT_TYPE((sign &   8) != 0 ? -v.w : v.w);
-#elif defined(DATA_A_IQ3_S)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 64;            // 4 values per idx
-            const uint iqs = idx % 64;           // 0..63
-            const uint iqh = iqs / 8;
-
-            const float d = float(data_a[ib].d);
-            const uint qs = data_a[ib].qs[iqs];
-            const uint qh = data_a[ib].qh[iqh];
-            const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2)));
-            const uint scale = data_a[ib].scales[iqs / 16];
-            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
-            const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
-            const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
-            const vec4 v = db * vec4(unpack8(grid));
-
-            buf_a[buf_idx    ] = FLOAT_TYPE((sign &   1) != 0 ? -v.x : v.x);
-            buf_a[buf_idx + 1] = FLOAT_TYPE((sign &   2) != 0 ? -v.y : v.y);
-            buf_a[buf_idx + 2] = FLOAT_TYPE((sign &   4) != 0 ? -v.z : v.z);
-            buf_a[buf_idx + 3] = FLOAT_TYPE((sign &   8) != 0 ? -v.w : v.w);
-#elif defined(DATA_A_IQ4_XS)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
-
-            const uint ib = idx / 128;                  // 2 values per idx
-            const uint ib32 = (idx % 128) / 16;         // 0..7
-            const uint iq = 16 * ib32 + 2 * (idx % 8);
-
-            const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
-            const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
-            const uint qshift = (idx & 8) >> 1;
-            u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
-            qs = (qs >> qshift) & uint8_t(0xF);
-
-            const float d = float(data_a[ib].d);
-            const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
-
-            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
-            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
-#elif defined(DATA_A_IQ4_NL)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
-
-            const uint ib = idx / 8;
-            const uint iqs = idx & 0x07;
-
-            const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d);
-            const uint vui = uint(data_a_packed16[ib].qs[iqs]);
-
-            buf_a[buf_idx     ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d;
-            buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d;
-            buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d;
-            buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d;
-#elif defined(DATA_A_MXFP4)
-            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
-            const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
-
-            const uint ib = idx / 8;
-            const uint iqs = (idx & 0x07) * 2;
-
-            const float d = e8m0_to_fp32(data_a[ib].e);
-            const uint vui = uint(data_a[ib].qs[iqs]);
-            const uint vui2 = uint(data_a[ib].qs[iqs+1]);
-
-            buf_a[buf_idx     ] = FLOAT_TYPE(kvalues_mxfp4[vui & 0xF] * d);
-            buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_mxfp4[vui >>  4] * d);
-            buf_a[buf_idx +  1] = FLOAT_TYPE(kvalues_mxfp4[vui2 & 0xF] * d);
-            buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_mxfp4[vui2 >>  4] * d);
-#endif
+            load_a_to_shmem(pos_a, loadr_a, loadc_a + l, ir * BM + loadc_a + l, block + loadr_a, end_k);
        }
        [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
-#if LOAD_VEC_B == 8
-#ifdef MUL_MAT_ID
-            const u16vec2 row_idx = row_ids[loadc_b + l];
-            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
+#if !defined(MUL_MAT_ID)
+            load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic * BN + loadc_b + l, block + loadr_b, end_k);
 #else
-            const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
-#endif
-            const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
-#if defined(DATA_B_BF16)
-            B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
-#else
-            B_TYPE32 bb = B_TYPE32(data_b[idx]);
-#endif
-            buf_b[buf_idx + 0] = FLOAT_TYPE(bb[0].x);
-            buf_b[buf_idx + 1] = FLOAT_TYPE(bb[0].y);
-            buf_b[buf_idx + 2] = FLOAT_TYPE(bb[0].z);
-            buf_b[buf_idx + 3] = FLOAT_TYPE(bb[0].w);
-            buf_b[buf_idx + 4] = FLOAT_TYPE(bb[1].x);
-            buf_b[buf_idx + 5] = FLOAT_TYPE(bb[1].y);
-            buf_b[buf_idx + 6] = FLOAT_TYPE(bb[1].z);
-            buf_b[buf_idx + 7] = FLOAT_TYPE(bb[1].w);
-#elif LOAD_VEC_B == 4
-#ifdef MUL_MAT_ID
-            const u16vec2 row_idx = row_ids[loadc_b + l];
-            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b;
-#else
-            const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b;
-#endif
-            const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B;
-#if defined(DATA_B_BF16)
-            B_TYPE32 bb = TO_FLOAT_TYPE(data_b[idx]);
-#else
-            B_TYPE32 bb = B_TYPE32(data_b[idx]);
-#endif
-            buf_b[buf_idx + 0] = FLOAT_TYPE(bb.x);
-            buf_b[buf_idx + 1] = FLOAT_TYPE(bb.y);
-            buf_b[buf_idx + 2] = FLOAT_TYPE(bb.z);
-            buf_b[buf_idx + 3] = FLOAT_TYPE(bb.w);
-#elif !MUL_MAT_ID
-            if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) {
-                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]);
-            } else {
-                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
-            }
-#else
-            const uint row_i = ic * BN + loadc_b + l;
-            if (row_i < _ne1 && block + loadr_b < end_k) {
-                const u16vec2 row_idx = row_ids[loadc_b + l];
-                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]);
-            } else {
-                buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f);
-            }
+            load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic, _ne1, block + loadr_b, end_k);
 #endif
        }

@@ -0,0 +1,568 @@
+void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint idx_k, const uint end_k) {
+#if defined(DATA_A_F32) || defined(DATA_A_F16)
+#if LOAD_VEC_A == 8
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+            FLOAT_TYPE_VEC8 aa = FLOAT_TYPE_VEC8(data_a[idx]);
+            buf_a[buf_idx    ] = aa[0].x;
+            buf_a[buf_idx + 1] = aa[0].y;
+            buf_a[buf_idx + 2] = aa[0].z;
+            buf_a[buf_idx + 3] = aa[0].w;
+            buf_a[buf_idx + 4] = aa[1].x;
+            buf_a[buf_idx + 5] = aa[1].y;
+            buf_a[buf_idx + 6] = aa[1].z;
+            buf_a[buf_idx + 7] = aa[1].w;
+#elif LOAD_VEC_A == 4
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+            FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(data_a[idx]);
+            buf_a[buf_idx    ] = aa.x;
+            buf_a[buf_idx + 1] = aa.y;
+            buf_a[buf_idx + 2] = aa.z;
+            buf_a[buf_idx + 3] = aa.w;
+#else
+            if (idx_m < p.M && idx_k < end_k) {
+                buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE(data_a[pos_a + col * p.stride_a + row]);
+            } else {
+                buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f);
+            }
+#endif
+#elif defined(DATA_A_BF16)
+#if LOAD_VEC_A == 4
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+            FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_a[idx]));
+            buf_a[buf_idx    ] = aa.x;
+            buf_a[buf_idx + 1] = aa.y;
+            buf_a[buf_idx + 2] = aa.z;
+            buf_a[buf_idx + 3] = aa.w;
+#else
+            if (idx_m < p.M && idx_k < end_k) {
+                buf_a[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_a[pos_a + col * p.stride_a + row]);
+            } else {
+                buf_a[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(uint16_t(0));
+            }
+#endif
+#elif defined(DATA_A_Q4_0)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + 4 * row;
+
+            const uint ib = idx / 4;
+            const uint iqs = idx & 0x03;
+
+            const float d = float(data_a_packed16[ib].d);
+            const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
+            const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
+            const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v0.x);
+            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
+            buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
+            buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
+            buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
+            buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
+            buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
+#elif defined(DATA_A_Q4_1)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + 4 * row;
+
+            const uint ib = idx / 4;
+            const uint iqs = idx & 0x03;
+
+            const float d = float(data_a_packed16[ib].d);
+            const float m = float(data_a_packed16[ib].m);
+            const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
+            const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
+            const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v0.x);
+            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
+            buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
+            buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
+            buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
+            buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
+            buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
+#elif defined(DATA_A_Q5_0)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
+
+            const uint ib = idx / 8;
+            const uint iqs = idx & 0x07;
+
+            const float d = float(data_a_packed16[ib].d);
+            const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]);
+            const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
+            const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
+
+            const uint vui = uint(data_a_packed16[ib].qs[iqs]);
+            const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
+            buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
+#elif defined(DATA_A_Q5_1)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
+
+            const uint ib = idx / 8;
+            const uint iqs = idx & 0x07;
+
+            const float d = float(data_a_packed16[ib].d);
+            const float m = float(data_a_packed16[ib].m);
+            const uint uint_qh = data_a_packed16[ib].qh;
+            const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
+            const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
+
+            const uint vui = uint(data_a_packed16[ib].qs[iqs]);
+            const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m;
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
+            buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
+#elif defined(DATA_A_Q8_0)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 8;
+            const uint iqs = idx & 0x07;
+
+            const float d = float(data_a_packed16[ib].d);
+            const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147
+            const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy;
+            const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+            buf_a[buf_idx + 2] = FLOAT_TYPE(v.z);
+            buf_a[buf_idx + 3] = FLOAT_TYPE(v.w);
+#elif defined(DATA_A_Q2_K)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                         // 2 values per idx
+            const uint iqs = idx % 128;                        // 0..127
+
+            const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30
+            const uint scalesi = iqs / 8;                      // 0..15
+            const uint qsshift = ((iqs % 64) / 16) * 2;        // 0,2,4,6
+
+            const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]);
+            const uint scales = data_a[ib].scales[scalesi];
+            const vec2 d = vec2(data_a[ib].d);
+
+            const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_Q3_K)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                   // 2 values per idx
+            const uint iqs = idx % 128;                  // 0..127
+
+            const uint n = iqs / 64;                     // 0,1
+            const uint qsi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
+            const uint hmi =          (iqs % 16) * 2;    // 0,2,4..30
+            const uint j = (iqs % 64) / 4;               // 0..3
+            const uint is = iqs / 8;                     // 0..15
+            const uint halfsplit = ((iqs % 64) / 16);    // 0,1,2,3
+            const uint qsshift = halfsplit * 2;          // 0,2,4,6
+            const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128
+
+            const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
+                                  | (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
+            const float dl = float(data_a[ib].d) * float(us - 32);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi    ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi    ] & m) != 0) ? 0 : 4)));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
+#elif defined(DATA_A_Q4_K)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            const uint scidx0 = (is < 4) ? is : (is + 4);
+            const uint scidx1 = (is < 4) ? is : (is - 4);
+            const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint scidxshift1 = (is < 4) ? 0 : 2;
+            const uint mbidx0 = is + 4;
+            const uint mbidx1 = (is < 4) ? is + 4 : is;
+            const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+            const uint mbidxshift0 = (is < 4) ? 0 : 4;
+            const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+            const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+            const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+            const float d = loadd.x * sc;
+            const float m = -loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF), m));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
+#elif defined(DATA_A_Q5_K)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                 // 2 values per idx
+            const uint iqs = idx % 128;                // 0..127
+
+            const uint n = iqs / 32;                   // 0,1,2,3
+            const uint b = (iqs % 32) / 16;            // 0,1
+            const uint is = 2 * n + b;                 // 0..7
+            const uint qsi = n * 32 + (iqs % 16) * 2;  // 0,2,4..126
+            const uint qhi = (iqs % 16) * 2;           // 0,2,4..30
+
+            const uint8_t hm = uint8_t(1 << (iqs / 16));
+
+            const vec2 loadd = vec2(data_a[ib].d);
+
+            const uint scidx0 = (is < 4) ? is : (is + 4);
+            const uint scidx1 = (is < 4) ? is : (is - 4);
+            const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint scidxshift1 = (is < 4) ? 0 : 2;
+            const uint mbidx0 = is + 4;
+            const uint mbidx1 = (is < 4) ? is + 4 : is;
+            const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+            const uint mbidxshift0 = (is < 4) ? 0 : 4;
+            const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+            const uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+            const uint8_t sc    = uint8_t((data_a[ib].scales[scidx0] & 0xF)                         | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+            const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+            const float d = loadd.x * sc;
+            const float m = -loadd.y * mbyte;
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi    ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi    ] & hm) != 0 ? 16 : 0), m));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
+#elif defined(DATA_A_Q6_K)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint iqs = idx % 128;                 // 0..127
+
+            const uint n = iqs / 64;                    // 0,1
+            const uint b = (iqs % 64) / 32;             // 0,1
+            const uint is_b = (iqs % 16) / 8;           // 0,1
+            const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
+            const uint is = 8 * n + qhshift + is_b;     // 0..15
+            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
+            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+
+            const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
+            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+#elif defined(DATA_A_IQ1_S)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 32;                  // 8 values per idx
+            const uint ib32 = (idx % 32) / 4;         // 0..7
+            const uint ib8 = idx % 32;
+
+            const float d = float(data_a[ib].d);
+            const uint qh = data_a[ib].qh[ib32];
+            const uint qs = data_a[ib].qs[ib8];
+            const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1);
+            const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
+            const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]);
+
+            [[unroll]] for (int k = 0; k < 8; ++k) {
+                buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
+            }
+#elif defined(DATA_A_IQ1_M)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 32;  // 8 values per idx
+            const uint ib8 = idx % 32;
+            const uint ib16 = ib8 / 2;
+
+            const uint16_t[4] scales = data_a[ib].scales;
+            const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
+            const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
+            const uint sc = scales[ib8 / 8];
+            const uint qs = data_a[ib].qs[ib8];
+            const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1));
+            const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1);
+            const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
+            const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
+
+            [[unroll]] for (int k = 0; k < 8; ++k) {
+                buf_a[buf_idx + k] = FLOAT_TYPE(dl * (bitfieldExtract(grid, 2 * k, 2) + delta));
+            }
+#elif defined(DATA_A_IQ2_XXS)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 32;                 // 8 values per idx
+            const uint ib32 = (idx % 32) / 4;         // 0..7
+            const uint ib8 = idx % 4;
+
+            const float d = float(data_a[ib].d);
+            const uint qs = data_a[ib].qs[8 * ib32 + ib8];
+            const uint signs = pack32(u8vec4(
+                data_a[ib].qs[8*ib32 + 4],
+                data_a[ib].qs[8*ib32 + 5],
+                data_a[ib].qs[8*ib32 + 6],
+                data_a[ib].qs[8*ib32 + 7]
+            ));
+            const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + (signs >> 28)));
+            const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7);
+            const uint sign = sign7 | (bitCount(sign7) << 7);
+            const uvec2 grid = iq2xxs_grid[qs];
+            const vec4 grid0 = vec4(unpack8(grid.x));
+            const vec4 grid1 = vec4(unpack8(grid.y));
+
+            buf_a[buf_idx    ] = db * FLOAT_TYPE((sign &   1) != 0 ? -grid0.x : grid0.x);
+            buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign &   2) != 0 ? -grid0.y : grid0.y);
+            buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign &   4) != 0 ? -grid0.z : grid0.z);
+            buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign &   8) != 0 ? -grid0.w : grid0.w);
+            buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign &  16) != 0 ? -grid1.x : grid1.x);
+            buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign &  32) != 0 ? -grid1.y : grid1.y);
+            buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign &  64) != 0 ? -grid1.z : grid1.z);
+            buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
+#elif defined(DATA_A_IQ2_XS)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 32;            // 8 values per idx
+            const uint ib32 = (idx % 32) / 4;    // 0..7
+            const uint ib8 = idx % 4;            // 0..3
+
+            const float d = float(data_a[ib].d);
+            const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
+            const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
+            const uint qs = data_a[ib].qs[4 * ib32 + ib8];
+            const uint sign7 = qs >> 9;
+            const uint sign = sign7 | (bitCount(sign7) << 7);
+            const uvec2 grid = iq2xs_grid[qs & 511];
+            const vec4 grid0 = vec4(unpack8(grid.x));
+            const vec4 grid1 = vec4(unpack8(grid.y));
+
+            buf_a[buf_idx    ] = db * FLOAT_TYPE((sign &   1) != 0 ? -grid0.x : grid0.x);
+            buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign &   2) != 0 ? -grid0.y : grid0.y);
+            buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign &   4) != 0 ? -grid0.z : grid0.z);
+            buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign &   8) != 0 ? -grid0.w : grid0.w);
+            buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign &  16) != 0 ? -grid1.x : grid1.x);
+            buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign &  32) != 0 ? -grid1.y : grid1.y);
+            buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign &  64) != 0 ? -grid1.z : grid1.z);
+            buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
+#elif defined(DATA_A_IQ2_S)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 32;  // 8 values per idx
+            const uint ib8 = idx % 32; // 0..31
+            const uint ib32 = ib8 / 4; // 0..7
+
+            const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf;
+            const uint qs = data_a[ib].qs[ib8];
+            const uint qh = data_a[ib].qh[ib32];
+            const uint qhshift = 2 * (ib8 % 4);
+            const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8];
+
+            const float d = float(data_a[ib].d);
+            const FLOAT_TYPE db = FLOAT_TYPE(d * 0.25 * (0.5 + scale));
+            const uvec2 grid = iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)];
+            const vec4 grid0 = vec4(unpack8(grid.x));
+            const vec4 grid1 = vec4(unpack8(grid.y));
+
+            buf_a[buf_idx    ] = db * FLOAT_TYPE((sign &   1) != 0 ? -grid0.x : grid0.x);
+            buf_a[buf_idx + 1] = db * FLOAT_TYPE((sign &   2) != 0 ? -grid0.y : grid0.y);
+            buf_a[buf_idx + 2] = db * FLOAT_TYPE((sign &   4) != 0 ? -grid0.z : grid0.z);
+            buf_a[buf_idx + 3] = db * FLOAT_TYPE((sign &   8) != 0 ? -grid0.w : grid0.w);
+            buf_a[buf_idx + 4] = db * FLOAT_TYPE((sign &  16) != 0 ? -grid1.x : grid1.x);
+            buf_a[buf_idx + 5] = db * FLOAT_TYPE((sign &  32) != 0 ? -grid1.y : grid1.y);
+            buf_a[buf_idx + 6] = db * FLOAT_TYPE((sign &  64) != 0 ? -grid1.z : grid1.z);
+            buf_a[buf_idx + 7] = db * FLOAT_TYPE((sign & 128) != 0 ? -grid1.w : grid1.w);
+#elif defined(DATA_A_IQ3_XXS)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 64;            // 4 values per idx
+            const uint iqs = idx % 64;           // 0..63
+            const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values
+
+            const float d = float(data_a[ib].d);
+            const uint qs = data_a[ib].qs[iqs];
+            const uint signs = pack32(u8vec4(
+                data_a[ib].qs[is+0],
+                data_a[ib].qs[is+1],
+                data_a[ib].qs[is+2],
+                data_a[ib].qs[is+3]
+            ));
+            const float db = d * 0.5 * (0.5 + (signs >> 28));
+            const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7);
+            const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (4 * (idx % 2));
+            const uint grid = iq3xxs_grid[qs];
+            const vec4 v = db * vec4(unpack8(grid));
+
+            buf_a[buf_idx    ] = FLOAT_TYPE((sign &   1) != 0 ? -v.x : v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE((sign &   2) != 0 ? -v.y : v.y);
+            buf_a[buf_idx + 2] = FLOAT_TYPE((sign &   4) != 0 ? -v.z : v.z);
+            buf_a[buf_idx + 3] = FLOAT_TYPE((sign &   8) != 0 ? -v.w : v.w);
+#elif defined(DATA_A_IQ3_S)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 64;            // 4 values per idx
+            const uint iqs = idx % 64;           // 0..63
+            const uint iqh = iqs / 8;
+
+            const float d = float(data_a[ib].d);
+            const uint qs = data_a[ib].qs[iqs];
+            const uint qh = data_a[ib].qh[iqh];
+            const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (4 * (idx % 2)));
+            const uint scale = data_a[ib].scales[iqs / 16];
+            const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign)));
+            const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf));
+            const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)];
+            const vec4 v = db * vec4(unpack8(grid));
+
+            buf_a[buf_idx    ] = FLOAT_TYPE((sign &   1) != 0 ? -v.x : v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE((sign &   2) != 0 ? -v.y : v.y);
+            buf_a[buf_idx + 2] = FLOAT_TYPE((sign &   4) != 0 ? -v.z : v.z);
+            buf_a[buf_idx + 3] = FLOAT_TYPE((sign &   8) != 0 ? -v.w : v.w);
+#elif defined(DATA_A_IQ4_XS)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A;
+
+            const uint ib = idx / 128;                  // 2 values per idx
+            const uint ib32 = (idx % 128) / 16;         // 0..7
+            const uint iq = 16 * ib32 + 2 * (idx % 8);
+
+            const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
+            const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
+            const uint qshift = (idx & 8) >> 1;
+            u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]);
+            qs = (qs >> qshift) & uint8_t(0xF);
+
+            const float d = float(data_a[ib].d);
+            const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
+
+            buf_a[buf_idx    ] = FLOAT_TYPE(v.x);
+            buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
+#elif defined(DATA_A_IQ4_NL)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
+
+            const uint ib = idx / 8;
+            const uint iqs = idx & 0x07;
+
+            const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d);
+            const uint vui = uint(data_a_packed16[ib].qs[iqs]);
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d;
+            buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d;
+            buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d;
+            buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d;
+#elif defined(DATA_A_MXFP4)
+            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
+            const uint buf_idx = col * SHMEM_STRIDE + 2 * row;
+
+            const uint ib = idx / 8;
+            const uint iqs = (idx & 0x07) * 2;
+
+            const float d = e8m0_to_fp32(data_a[ib].e);
+            const uint vui = uint(data_a[ib].qs[iqs]);
+            const uint vui2 = uint(data_a[ib].qs[iqs+1]);
+
+            buf_a[buf_idx     ] = FLOAT_TYPE(kvalues_mxfp4[vui & 0xF] * d);
+            buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_mxfp4[vui >>  4] * d);
+            buf_a[buf_idx +  1] = FLOAT_TYPE(kvalues_mxfp4[vui2 & 0xF] * d);
+            buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_mxfp4[vui2 >>  4] * d);
+#endif
+}
+
+#if !defined(MUL_MAT_ID)
+void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint idx_n, const uint idx_k, const uint end_k) {
+#if LOAD_VEC_B == 8
+            // Not supported for b_type bf16 because bf16mat2x4 does not exist
+            const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
+            FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]);
+            buf_b[buf_idx + 0] = bb[0].x;
+            buf_b[buf_idx + 1] = bb[0].y;
+            buf_b[buf_idx + 2] = bb[0].z;
+            buf_b[buf_idx + 3] = bb[0].w;
+            buf_b[buf_idx + 4] = bb[1].x;
+            buf_b[buf_idx + 5] = bb[1].y;
+            buf_b[buf_idx + 6] = bb[1].z;
+            buf_b[buf_idx + 7] = bb[1].w;
+#elif LOAD_VEC_B == 4
+            const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
+#if defined(DATA_B_BF16)
+            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx]));
+#else
+            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]);
+#endif
+            buf_b[buf_idx + 0] = bb.x;
+            buf_b[buf_idx + 1] = bb.y;
+            buf_b[buf_idx + 2] = bb.z;
+            buf_b[buf_idx + 3] = bb.w;
+#else // LOAD_VEC_B == 1
+            if (idx_n < p.N && idx_k < end_k) {
+                buf_b[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_b[pos_b + col * p.stride_b + row]);
+            } else {
+                buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f);
+            }
+#endif
+}
+#else
+void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint ic, const uint _ne1, const uint idx_k, const uint end_k) {
+#if LOAD_VEC_B == 8
+            // Not supported for b_type bf16 because bf16mat2x4 does not exist
+            const u16vec2 row_idx = row_ids[col];
+            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
+            FLOAT_TYPE_VEC8 bb = FLOAT_TYPE_VEC8(data_b[idx]);
+            buf_b[buf_idx + 0] = bb[0].x;
+            buf_b[buf_idx + 1] = bb[0].y;
+            buf_b[buf_idx + 2] = bb[0].z;
+            buf_b[buf_idx + 3] = bb[0].w;
+            buf_b[buf_idx + 4] = bb[1].x;
+            buf_b[buf_idx + 5] = bb[1].y;
+            buf_b[buf_idx + 6] = bb[1].z;
+            buf_b[buf_idx + 7] = bb[1].w;
+#elif LOAD_VEC_B == 4
+            const u16vec2 row_idx = row_ids[col];
+            const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row;
+            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B;
+#if defined(DATA_B_BF16)
+            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_b[idx]));
+#else
+            FLOAT_TYPE_VEC4 bb = FLOAT_TYPE_VEC4(data_b[idx]);
+#endif
+            buf_b[buf_idx + 0] = bb.x;
+            buf_b[buf_idx + 1] = bb.y;
+            buf_b[buf_idx + 2] = bb.z;
+            buf_b[buf_idx + 3] = bb.w;
+#else // LOAD_VEC_B == 1
+            const uint row_i = ic * BN + col;
+            if (row_i < _ne1 && idx_k < end_k) {
+                const u16vec2 row_idx = row_ids[col];
+                buf_b[col * SHMEM_STRIDE + row] = TO_FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row]);
+            } else {
+                buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE(0.0f);
+            }
+#endif
+}
+#endif
@@ -13,13 +13,10 @@

 #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
 #define A_TYPE float
-#define A_TYPE32 float
 #elif LOAD_VEC_A == 4
 #define A_TYPE vec4
-#define A_TYPE32 vec4
 #elif LOAD_VEC_A == 8
 #define A_TYPE mat2x4
-#define A_TYPE32 mat2x4
 #endif
 #endif

@@ -29,13 +26,10 @@

 #if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
 #define A_TYPE float16_t
-#define A_TYPE32 float
 #elif LOAD_VEC_A == 4
 #define A_TYPE f16vec4
-#define A_TYPE32 vec4
 #elif LOAD_VEC_A == 8
 #define A_TYPE f16mat2x4
-#define A_TYPE32 mat2x4
 #endif
 #endif

@@ -320,9 +320,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
    std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4";
    std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4";

-    std::map<std::string, std::string> base_dict = {
-        {"FLOAT_TYPE_VEC2", (coopmat2 || fp16) ? "f16vec2" : "vec2"},
-    };
+    std::map<std::string, std::string> base_dict;
    std::string shader_name = "matmul";

    if (matmul_id_type == MatMulIdType::DEFAULT) {
@@ -349,26 +347,74 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c

    const std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp";

-    auto const &FLOAT_TYPE = [&](const std::string &t) -> std::string {
-        if (t == "bf16") {
-            // scalar path promotes to float
-            if (!coopmat && !coopmat2) {
-                return "float";
+    auto const &FLOAT_TYPE = [&](int vec, const std::string &t) -> std::string {
+        switch (vec) {
+        case 1:
+            if (t == "bf16") {
+                // scalar path promotes to float
+                if (!coopmat && !coopmat2) {
+                    return "float";
+                }
+                return "bfloat16_t";
            }
-            return "bfloat16_t";
+            if (coopmat2 || fp16) {
+                return "float16_t";
+            }
+            return "float";
+        case 2:
+            if (t == "bf16") {
+                // scalar path promotes to float
+                if (!coopmat && !coopmat2) {
+                    return "vec2";
+                }
+                return "bf16vec2";
+            }
+            if (coopmat2 || fp16) {
+                return "f16vec2";
+            }
+            return "vec2";
+        case 4:
+            if (t == "bf16") {
+                // scalar path promotes to float
+                if (!coopmat && !coopmat2) {
+                    return "vec4";
+                }
+                return "bf16vec4";
+            }
+            if (coopmat2 || fp16) {
+                return "f16vec4";
+            }
+            return "vec4";
+        case 8:
+            if (t == "bf16") {
+                // scalar path promotes to float
+                if (!coopmat && !coopmat2) {
+                    return "mat2x4";
+                }
+                throw std::runtime_error("bf16 vec8 not supported");
+            }
+            if (coopmat2 || fp16) {
+                return "f16mat2x4";
+            }
+            return "mat2x4";
+        default:
+            throw std::runtime_error("invalid vector size");
        }
-        if (coopmat2 || fp16) {
-            return "float16_t";
-        }
-        return "float";
+    };
+
+    const std::map<std::string, std::string> float_type_dict_f16 = {
+        {"FLOAT_TYPE",      FLOAT_TYPE(1, "f16")},
+        {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "f16")},
+        {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "f16")},
+        {"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, "f16")},
    };

    // Shaders with f16 B_TYPE
-    string_to_spv(shader_name + "_f32_f16",         source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"},                                                     {"B_TYPE", "float16_t"},                                          {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
-    string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+    string_to_spv(shader_name + "_f32_f16",         source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"},                                                     {"B_TYPE", "float16_t"},        {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc);
+    string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);

-    string_to_spv(shader_name + "_f16_aligned",     source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
-    string_to_spv(shader_name + "_f16",             source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("f16")}, {"DATA_A_F16", "1"},                                                     {"B_TYPE", "float16_t"},                                          {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
+    string_to_spv(shader_name + "_f16",             source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"},                                                     {"B_TYPE", "float16_t"},        {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
+    string_to_spv(shader_name + "_f16_aligned",     source_name, merge_maps(merge_maps(base_dict, float_type_dict_f16), {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);

    // bf16
    {
@@ -379,13 +425,19 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
        // scalar path promotes to float
        std::string to_float_type = (coopmat || coopmat2) ? "uintBitsToBFloat16EXT" : "bf16_to_fp32";

+        const std::map<std::string, std::string> float_type_dict_bf16 = {
+            {"FLOAT_TYPE",      FLOAT_TYPE(1, "bf16")},
+            {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, "bf16")},
+            {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, "bf16")},
+        };
+
        // If bfloat16 is not supported, then only compile the scalar (promote to fp32) shader
 #if !defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
        if (!(coopmat || coopmat2))
 #endif
        {
-            string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"},   {"B_TYPE32", "vec4"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
-            string_to_spv(shader_name + "_bf16",         source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE("bf16")}, {"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                      {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"},                        {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}),                   fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_bf16_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", "4"}, {"B_TYPE", coopmat2 ? "bfloat16_t" : "u16vec4"},  {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_bf16",         source_name, merge_maps(merge_maps(base_dict, float_type_dict_bf16), {{"TO_FLOAT_TYPE", to_float_type}, {"DATA_A_BF16", "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                      {"B_TYPE", coopmat2 ? "bfloat16_t" : "uint16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"DATA_B_BF16", "1"}}),                   fp16, coopmat, coopmat2, f16acc);
        }
    }

@@ -406,20 +458,27 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
        // For aligned matmul loads
        std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16" || tname == "bf16") ? load_vec : load_vec_quant;

+        const std::map<std::string, std::string> float_type_dict = {
+            {"FLOAT_TYPE",      FLOAT_TYPE(1, tname)},
+            {"FLOAT_TYPE_VEC2", FLOAT_TYPE(2, tname)},
+            {"FLOAT_TYPE_VEC4", FLOAT_TYPE(4, tname)},
+            {"FLOAT_TYPE_VEC8", FLOAT_TYPE(8, tname)},
+        };
+
        // don't generate f32 variants for coopmat2
        if (!coopmat2) {
-            string_to_spv(shader_name + "_" + tname + "_f32",         source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                           {"B_TYPE", "float"},                                              {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
-            string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f32",         source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                           {"B_TYPE", "float"},            {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
        }

        if (tname != "f16" && tname != "f32") {
-            string_to_spv(shader_name + "_" + tname + "_f16",         source_name,  merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                           {"B_TYPE", "float16_t"},                                          {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
-            string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name,  merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"B_TYPE32", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f16",         source_name,  merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned},                           {"B_TYPE", "float16_t"},        {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name,  merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a},           {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
        }

 #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
        if (!coopmat && !coopmat2 && matmul_id_type == MatMulIdType::NONE && is_legacy_quant(tname)) {
-            string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(base_dict, {{"FLOAT_TYPE", FLOAT_TYPE(tname)}, {data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
+            string_to_spv(shader_name + "_" + tname + "_q8_1", "mul_mmq.comp", merge_maps(merge_maps(base_dict, float_type_dict), {{data_a_key, "1"}, {"D_TYPE", "float"},}), fp16, coopmat, coopmat2, f16acc);
        }
 #endif
    }
@@ -76,6 +76,7 @@ struct ggml_backend_zdnn_context {

 struct ggml_backend_zdnn_buffer {
    void * data;
+    ggml_backend_zdnn_buffer * extra;  // for bias, etc.
    size_t size;

    zdnn_tensor_desc pre_tfm_desc;
@@ -115,9 +115,7 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
    ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
    ggml_backend_zdnn_buffer * inputs_extra  = (ggml_backend_zdnn_buffer *)inputs->extra;
    ggml_backend_zdnn_buffer * output_extra  = (ggml_backend_zdnn_buffer *)output->extra;
-
-    zdnn_tensor_desc ptd_bias, td_bias;
-    zdnn_ztensor zt_bias;
+    ggml_backend_zdnn_buffer * bias_extra    = (ggml_backend_zdnn_buffer *)output_extra->extra;

    const int64_t weights_rows = ne01;
    const int64_t weights_cols = ne00;
@@ -129,14 +127,6 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
    const int64_t output_rows = ne1;
    const int64_t output_cols = ne0;

-    const int64_t bias_dim  [GGML_MAX_DIMS]  = { 1, 1, 1, output_cols };
-    ggml_zdnn_create_tensor(ptd_bias, td_bias, zt_bias, output, bias_dim, ZDNN_1D);
-
-    void * bias_data = (void *)calloc(ne0, ggml_element_size(output));
-    if (weights_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(weights_extra->ztensor, weights->data);
-    if (inputs_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(inputs_extra->ztensor, inputs->data);
-    ggml_zdnn_load_tensor(zt_bias, bias_data);
-
    // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
    //               __func__, weights_extra->name,
    //               weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
@@ -158,29 +148,21 @@ static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_ten
    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1  == inputs->ne[0]  && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
    GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2  == inputs->ne[1]  && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");

-    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &zt_bias,
+    ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor,
                                        false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
    // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
    ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));

-    ZDNN_CHECK(zdnn_free_ztensor_buffer(&zt_bias));
-    free(bias_data);
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(weights_rows);
+    GGML_UNUSED(weights_cols);
+    GGML_UNUSED(inputs_rows);
+    GGML_UNUSED(inputs_cols);
+    GGML_UNUSED(output_rows);
+    GGML_UNUSED(output_cols);
 }

 static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    bool use_mul_mat_vec =
-        (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
-
-    bool use_mul_mat_vec_q =
-        ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-
-    bool use_mul_mat_q =
-        ggml_is_quantized(src0->type)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
-
    // debug helpers
    // GGML_LOG_INFO("%s: use_mul_mat_vec   = %d\n", __func__, use_mul_mat_vec);
    // GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q);
@@ -192,25 +174,7 @@ static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const gg
    // GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    // GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16
-        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
-        && src1->ne[2] * src1->ne[3] > 1) {
-        // general KQ + KQV multi-batch
-        GGML_LOG_INFO("%s: using zdnn_mul_mat_batched for KQ + KQV multi-batch\n", __func__);
-        // ggml_zdnn_mul_mat_batched(ctx, src0, src1, dst);
-    } else if (use_mul_mat_vec) {
-        GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec for vector multiplication\n", __func__);
-        // ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec, nullptr);
-    } else if (use_mul_mat_vec_q) {
-        GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec_q for quantized vector multiplication\n", __func__);
-        // ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec_q, ggml_zdnn_quantize_row_q8_1);
-    } else if (use_mul_mat_q) {
-        GGML_LOG_INFO("%s: using zdnn_op_mul_mat_q for quantized matrix multiplication\n", __func__);
-        // ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_q, ggml_zdnn_quantize_mmq_q8_1);
-    } else {
-        // GGML_LOG_INFO("%s: using zdnn_op_mul_mat for general matrix multiplication\n", __func__);
-        ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
-    }
+    ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
 }

 static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
@@ -253,6 +217,8 @@ static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgr
    }

    return GGML_STATUS_SUCCESS;
+
+    GGML_UNUSED(ctx_dev);
 }

 static bool ggml_zdnn_supports_op(const ggml_backend_zdnn_device_context * ctx_dev, const ggml_tensor * op) {
@@ -266,22 +232,30 @@ static bool ggml_zdnn_supports_op(const ggml_backend_zdnn_device_context * ctx_d

        case GGML_OP_MUL_MAT:
            {
-                const ggml_tensor * src0 = op->src[0];
-                const ggml_tensor * src1 = op->src[1];
+                const ggml_tensor * weights = op->src[0];
+                const ggml_tensor * inputs  = op->src[1];

-                const int64_t ne10 = src1->ne[0];
-                const int64_t ne0 = op->ne[0];
-                const int64_t ne1 = op->ne[1];
+                const int64_t ne10 = inputs->ne[0];
+                const int64_t ne0  = op->ne[0];
+                const int64_t ne1  = op->ne[1];

                const int64_t max_batch = ctx_dev->max_size;

-                return ggml_is_matrix(src0) &&
-                       ggml_is_matrix(src1) &&
-                       ggml_is_contiguous(src0) &&
-                       ggml_is_contiguous(src1) &&
-                       src0->view_src == nullptr && src1->view_src == nullptr &&
-                       src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
-                       (ne0 <= max_batch && ne1 <= max_batch && ne10 <= max_batch);
+                if (!ggml_is_matrix(weights) || !ggml_is_matrix(inputs) ||
+                    !ggml_is_contiguous(weights) || !ggml_is_contiguous(inputs) ||
+                    weights->view_src != nullptr || inputs->view_src != nullptr ||
+                    ne0 > max_batch || ne1 > max_batch || ne10 > max_batch) {
+                        return false;
+                }
+
+                switch (weights->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_BF16:
+                        return true;
+                    default:
+                        return false;
+                }
            } break;

        default:
@@ -374,10 +348,12 @@ static void ggml_zdnn_free(ggml_backend_zdnn_context * ctx) {
 static void ggml_backend_zdnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;

-    for (int i = 0; i < ctx->n_buffers; i++) {
-        if (ctx->buffers[i]->ztensor.buffer != NULL && ctx->buffers[i]->ztensor.is_transformed) {
-            ZDNN_CHECK(zdnn_free_ztensor_buffer(&ctx->buffers[i]->ztensor));
-        }
+    for (const auto & buf_ptr : ctx->buffers) {
+        ggml_backend_zdnn_buffer * buf = buf_ptr.get();
+
+        // Free any extra buffer allocated for the tensor. E.g., bias for GGML_OP_MUL_MAT
+        if (buf->extra != nullptr) free(buf->extra->data);
+        if (buf->ztensor.buffer_size > 0) ZDNN_CHECK(zdnn_free_ztensor_buffer(&buf->ztensor));
    }

    delete ctx;
@@ -402,11 +378,37 @@ static enum ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer
    std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
    zdnn_buffer->data = tensor->data;
    zdnn_buffer->size = tsize;
-    strncpy(zdnn_buffer->name, tensor->name, GGML_MAX_NAME - 1);
+    zdnn_buffer->extra = nullptr;
+    snprintf(zdnn_buffer->name, GGML_MAX_NAME, "%s", tensor->name);

    ggml_zdnn_init_tensor(zdnn_buffer.get(), tensor);
    tensor->extra = zdnn_buffer.get();

+    switch (tensor->op) {
+        case GGML_OP_MUL_MAT:
+            {
+                std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_bias_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
+                zdnn_bias_buffer->data = (void *)calloc(tensor->ne[0], ggml_element_size(tensor));
+                zdnn_bias_buffer->size = ggml_element_size(tensor) * tensor->ne[0];
+                snprintf(zdnn_bias_buffer->name, GGML_MAX_NAME, "%.*s (bias)",
+                         GGML_MAX_NAME - (int)sizeof(" (bias)"), tensor->name);
+
+                const int64_t bias_dim[GGML_MAX_DIMS] = { 1, 1, 1, tensor->ne[0] };
+                ggml_zdnn_create_tensor(zdnn_bias_buffer->pre_tfm_desc,
+                                        zdnn_bias_buffer->tfm_desc,
+                                        zdnn_bias_buffer->ztensor,
+                                        tensor, bias_dim, ZDNN_1D);
+
+                ggml_zdnn_load_tensor(zdnn_bias_buffer->ztensor, zdnn_bias_buffer->data);
+                zdnn_buffer->extra = zdnn_bias_buffer.get();
+
+                ctx->buffers.push_back(std::move(zdnn_bias_buffer));
+                ctx->n_buffers++;
+            } break;
+        default:
+            break;
+    }
+
    ctx->buffers.push_back(std::move(zdnn_buffer));
    ctx->n_buffers++;

@@ -414,6 +416,8 @@ static enum ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer
    //               __func__, tensor->name, buffer_idx, tsize);

    return GGML_STATUS_SUCCESS;
+
+    GGML_UNUSED(buffer_idx);
 }

 static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -425,6 +429,13 @@ static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer,
 static void ggml_backend_zdnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    memcpy((char *)tensor->data + offset, data, size);

+    ggml_backend_zdnn_buffer * extra = (ggml_backend_zdnn_buffer *)tensor->extra;
+
+    // Fixes the LLAMA_SET_ROWS bug
+    // see: https://github.com/ggml-org/llama.cpp/issues/15414
+    if (tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_COMPUTE && extra->ztensor.is_transformed) zdnn_reset_ztensor(&extra->ztensor);
+    if (extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(extra->ztensor, tensor->data);
+
    GGML_UNUSED(buffer);
 }

@@ -528,29 +539,6 @@ ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void) {
    return &ggml_backend_buffer_type_zdnn;
 }

-static const char * ggml_backend_zdnn_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
-    return GGML_ZDNN_NAME "_Mapped";
-
-    GGML_UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_from_ptr_type(void) {
-    static ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_zdnn = {
-        /* .iface = */ {
-            /* .get_name       = */ ggml_backend_zdnn_buffer_from_ptr_type_get_name,
-            /* .alloc_buffer   = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
-            /* .get_alignment  = */ ggml_backend_zdnn_buffer_type_get_alignment,
-            /* .get_max_size   = */ NULL,
-            /* .get_alloc_size = */ NULL,  // defaults to ggml_nbytes
-            /* .is_host        = */ ggml_backend_zdnn_buffer_type_is_host,
-        },
-        /* .device  = */ &g_ggml_backend_zdnn_device,
-        /* .context = */ NULL,
-    };
-
-    return &ggml_backend_buffer_from_ptr_type_zdnn;
-}
-
 //
 // backend
 //
@@ -594,27 +582,6 @@ static ggml_guid_t ggml_backend_zdnn_guid(void) {
    return reinterpret_cast<ggml_guid_t>((void *)guid_str);
 }

-// TODO: remove in the future
-ggml_backend_t ggml_backend_zdnn_init(void) {
-    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_zdnn_reg(), 0);
-
-    ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
-    if (ctx == NULL) {
-        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
-        return NULL;
-    }
-
-    ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(ggml_backend));
-    *backend = (ggml_backend) {
-        /* .guid       = */ ggml_backend_zdnn_guid(),
-        /* .iface      = */ ggml_backend_zdnn_i,
-        /* .device     = */ dev,
-        /* .context    = */ ctx,
-    };
-
-    return backend;
-}
-
 bool ggml_backend_is_zdnn(ggml_backend_t backend) {
    return backend != NULL &&
           ggml_guid_matches(backend->guid, ggml_backend_zdnn_guid());
@@ -634,11 +601,15 @@ static const char * ggml_backend_zdnn_device_get_name(ggml_backend_dev_t dev) {

 static const char * ggml_backend_zdnn_device_get_description(ggml_backend_dev_t dev) {
    return "IBM Z Neural Network Processing Assist (NNPA)";
+
+    GGML_UNUSED(dev);
 }

 static void ggml_backend_zdnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    *free  = 0;
    *total = 0;
+
+    GGML_UNUSED(dev);
 }

 static enum ggml_backend_dev_type ggml_backend_zdnn_device_get_type(ggml_backend_dev_t dev) {
@@ -655,8 +626,8 @@ static void ggml_backend_zdnn_device_get_props(ggml_backend_dev_t dev, ggml_back
    props->caps = (ggml_backend_dev_caps) {
        /* .async                = */ false,
        /* .host_buffer          = */ false,
-        /* .buffer_from_host_ptr = */ true,
-        /* .events               = */ false,
+        /* .buffer_from_host_ptr = */ false,
+        /* .events               = */ false
    };
 }

@@ -672,7 +643,7 @@ static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, cons
        /* .guid       = */ ggml_backend_zdnn_guid(),
        /* .iface      = */ ggml_backend_zdnn_i,
        /* .device     = */ dev,
-        /* .context    = */ ctx,
+        /* .context    = */ ctx
    };

    return backend;
@@ -686,46 +657,6 @@ static ggml_backend_buffer_type_t ggml_backend_zdnn_device_get_buffer_type(ggml_
    GGML_UNUSED(dev);
 }

-static ggml_backend_buffer_t ggml_backend_zdnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
-    ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
-
-    ctx->all_data  = ptr;
-    ctx->all_size  = size;
-    ctx->owned     = false;
-    ctx->n_buffers = 0;
-
-    const size_t size_page = sysconf(_SC_PAGESIZE);
-
-    // page-align the data ptr
-    {
-        const uintptr_t offs = (uintptr_t) ptr % size_page;
-        ptr  = (void *)((char *)ptr - offs);
-        size += offs;
-    }
-
-    size_t size_aligned = size;
-    if ((size_aligned % size_page) != 0) {
-        size_aligned += size_page - (size_aligned % size_page);
-    }
-
-    ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
-
-    GGML_ASSERT(ctx_dev->zdnn_device >= 0);
-    int device = ctx_dev->zdnn_device; GGML_UNUSED(device);
-
-    std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
-    zdnn_buffer->data = ptr;
-    zdnn_buffer->size = size;
-    ctx->buffers.push_back(std::move(zdnn_buffer));
-
-    GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB\n",
-                  __func__, size_aligned / 1024.0 / 1024.0);
-
-    ++ctx->n_buffers;
-
-    return ggml_backend_buffer_init(ggml_backend_zdnn_buffer_from_ptr_type(), ggml_backend_zdnn_buffer_i, ctx, size);
-}
-
 static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
    ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *) dev->context;

@@ -734,8 +665,7 @@ static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const g

 static bool ggml_backend_zdnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    return
-        buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name ||
-        buft->iface.get_name == ggml_backend_zdnn_buffer_from_ptr_type_get_name;
+        buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name;

    GGML_UNUSED(dev);
 }
@@ -749,7 +679,7 @@ static ggml_backend_device_i ggml_backend_zdnn_device_i = {
    /* .init_backend         = */ ggml_backend_zdnn_device_init,
    /* .get_buffer_type      = */ ggml_backend_zdnn_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
-    /* .buffer_from_host_ptr = */ ggml_backend_zdnn_device_buffer_from_ptr,
+    /* .buffer_from_host_ptr = */ NULL,
    /* .supports_op          = */ ggml_backend_zdnn_device_supports_op,
    /* .supports_buft        = */ ggml_backend_zdnn_device_supports_buft,
    /* .offload_op           = */ NULL,
@@ -813,7 +743,7 @@ static ggml_backend_reg_i ggml_backend_zdnn_reg_i = {
    /* .get_name         = */ ggml_backend_zdnn_reg_get_name,
    /* .get_device_count = */ ggml_backend_zdnn_reg_device_count,
    /* .get_device       = */ ggml_backend_zdnn_reg_device_get,
-    /* .get_proc_address = */ ggml_backend_zdnn_get_proc_address,
+    /* .get_proc_address = */ ggml_backend_zdnn_get_proc_address
 };

 static void ggml_zdnn_cleanup(void) {
@@ -831,13 +761,13 @@ ggml_backend_reg_t ggml_backend_zdnn_reg(void) {
        g_ggml_backend_zdnn_reg = (ggml_backend_reg) {
            /* .api_version = */ GGML_ZDNN_VERSION,
            /* .iface       = */ ggml_backend_zdnn_reg_i,
-            /* .context     = */ NULL,
+            /* .context     = */ NULL
        };

        g_ggml_backend_zdnn_device = (ggml_backend_device) {
            /* .iface       = */ ggml_backend_zdnn_device_i,
            /* .reg         = */ &g_ggml_backend_zdnn_reg,
-            /* .context     = */ &g_ggml_ctx_dev_main,
+            /* .context     = */ &g_ggml_ctx_dev_main
        };

        return &g_ggml_backend_zdnn_reg;
@@ -181,7 +181,7 @@ llama_context::llama_context(
        // graph outputs buffer
        {
            // resized during inference when a batch uses more outputs
-            if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+            if (output_reserve(params.n_seq_max) < params.n_seq_max) {
                throw std::runtime_error("failed to reserve initial output buffer");
            }

@@ -384,5 +384,5 @@ These options provide extra functionality and customization when running the LLa
 -   `--verbose-prompt`: Print the prompt before generating text.
 -   `--no-display-prompt`: Don't print prompt at generation.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple devices this option controls how tensors should be split across devices. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each device should get in order. For example, "3,2" will assign 60% of the data to device 0 and 40% to device 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. The list of the devices which are being used is printed on startup and can be different from the device list given by `--list-devices` or e.g. `nvidia-smi`.
 -   `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable  or in an OS-specific local cache.
@@ -406,6 +406,7 @@ struct clip_ctx {
            }
            if (!backend) {
                backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
+                backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
            }
        }

@@ -227,14 +227,8 @@ static ggml_backend_t create_backend(const rpc_server_params & params) {
        }
    }

-    // try to initialize a GPU backend first
    if (!backend) {
-        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
-    }
-
-    // if there aren't GPU backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        backend = ggml_backend_init_best();
    }

    if (backend) {
@@ -2372,7 +2372,7 @@ struct server_context {
            }

            if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
+                SLT_INF(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %.3f (> %.3f thold)\n", lcs_len, similarity, slot_prompt_similarity);
            }
        }

@@ -2394,7 +2394,7 @@ struct server_context {
            }

            if (ret != nullptr) {
-                SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
+                SLT_INF(*ret, "selected slot by LRU, t_last = %" PRId64 "\n", t_last);
            }
        }
Author	SHA1	Message	Date
Ruben Ortlam	261e6a20ff	Vulkan: Clean up mul_mm shader (#15987 ) * vulkan: move mul_mm dequantization steps into a separate file and functions * improve mul_mm vector load code * fix debug mode issues and warnings	2025-09-14 16:56:28 +02:00
lcy	a0e13dcbe5	build: fix the build failures of Windows HIP release job (#15984 ) * build: fix the cache keys for Windows HIP release job Update the cache keys to include the HIP SDK version, preventing the use of outdated ROCm installation caches. * build: sync changes from release.yml to build.yml - Update HIP SDK version to 25.Q3 and ROCm version to 6.4.2 - Update the cache keys to reflect the new versions * build: remove Windows HIP release for gfx1151 since the current stable rocWMMA does not support gfx1151.	2025-09-14 07:20:35 -07:00
Georgi Gerganov	a14bd35014	metal : fix kernel requirements (#15983 ) * metal : fix kernel requirements ggml-ci * cont : fix supports_op * cont : fix supports_op for ARGMAX	2025-09-14 15:33:22 +03:00
Radoslav Gerganov	918b26f197	rpc : fix regression when --device is used (#15981 ) Fix regression introduced with commit `50f4281a6`	2025-09-14 12:28:18 +03:00
Diego Devesa	9ecb884346	releases : update ROCM, add gfx1200, gfx1201, gfx1151 (#15972 ) * releases : update ROCM, add gfx1200, gfx1201, gfx1151 * releases : set target to 13.3 for macos-x64 * add hipblaslt.dll to release * add hipblaslt/library to release	2025-09-14 02:21:59 -07:00
Radoslav Gerganov	d1c6f11f47	doc : update documentation for --tensor-split (#15980 ) * doc : update documentation for --tensor-split * Update tools/main/README.md Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Update tools/main/README.md Co-authored-by: Diego Devesa <slarengh@gmail.com> --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de> Co-authored-by: Diego Devesa <slarengh@gmail.com>	2025-09-14 12:10:07 +03:00
Aaron Teo	6380d6a3e7	ggml-zdnn: rm user mapped buffers (#15965 ) * ggml-zdnn: rm user mapped buffers Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-zdnn: rm dead code Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * ggml-zdnn: attempt to fix missing extra data buffer free Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> --------- Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>	2025-09-14 13:37:03 +08:00
Jeff Bolz	aa0c461efe	vulkan: fix failing dequant shaders (#15862 ) * vulkan: fix failing dequant shaders * add missing const	2025-09-13 17:29:43 +02:00
Jeff Bolz	b9c9c9f789	vulkan: initialize vulkan-hpp to allow using extension function pointers (#15705 ) Use this to query register count for shader compiles on NVIDIA. Currently this is only for performance debug, but it could eventually be used in some heuristics like split_k.	2025-09-13 17:23:30 +02:00
Diego Devesa	50f4281a6f	llama : allow using iGPUs with --device (#15951 ) * llama : allow using iGPUs with --device * mtmd : allow iGPU * rpc-server : allow iGPU	2025-09-13 16:49:49 +02:00
Georgi Gerganov	55758b00ca	metal : refactor kernel loading (#15964 ) * metal : refactor bin kernels loading ggml-ci * metal : refactor rms kernel loading ggml-ci * ci : try to add memory leaks check ggml-ci * ci : try to enable memory leak detection for Mac * cont : seems to be working	2025-09-13 16:24:22 +03:00
Georgi Gerganov	f161463a54	metal : allow ops to run concurrently (#15929 ) * metal : run graphs ops concurrently ggml-ci * cont : add flags for debugging and disabling concurrency ggml-ci * cont : refactor and handle fusing ggml-ci * cont : simplify - no need to use GPU address ggml-ci * cont : prepare mem ranges for reuse + add ggml-metal-common.cpp ggml-ci * cont : avoid redundant keywords in cpp [no ci] * metal : reorder graph for better concurrency ggml-ci * metal : fix race on mem pool buffers ggml-ci * cont : add env GGML_METAL_GRAPH_OPTIMIZE_DISABLE ggml-ci * cont : refactor, optimize, add comments ggml-ci * cont : refactor ggml-metal.m ggml-ci * minor : update logs [no ci]	2025-09-13 13:54:28 +03:00
Georgi Gerganov	84d7b2fca1	metal : fix memory leaks (#15962 ) ggml-ci	2025-09-13 12:45:04 +03:00
Aaron Teo	40be51152d	ggml-zdnn: fix #15414 , activate FP16 and BF16 acceleration and incorrect zTensor free (#15839 )	2025-09-13 02:39:52 +08:00
Eric Curtin	4bf5549269	Add docker protocol support for llama-server model loading (#15790 ) To pull and run models via: llama-server -dr gemma3 Add some validators and sanitizers for Docker Model urls and metadata Signed-off-by: Eric Curtin <eric.curtin@docker.com>	2025-09-12 16:31:50 +01:00
Haiyue Wang	f4e664f838	context : remove redundant explicit casting to the same type (#15948 ) The function 'output_reserve' return type is 'uint32_t', so need to add explicit casting.	2025-09-12 18:16:32 +03:00
Georgi Gerganov	f088b6a84f	server : adjust prompt similarity thold + add logs (#15913 ) ggml-ci	2025-09-12 17:02:55 +03:00
Ruben Ortlam	304ac5693d	Vulkan iGPU device selection overhaul and PCI ID API support (#15947 ) * vulkan: implement ggml igpu device type, implement pci id support * fix compiler warning * prevent printf overflow warning	2025-09-12 13:24:21 +02:00
Mathieu Baudier	6c88ad8fa7	vulkan: Make device memory check more portable (#15939 )	2025-09-12 09:06:20 +02:00
Neo Zhang Jianyu	704d90c987	Revert "sycl: add usage of enqueue_functions extension (#14244 )" (#15910 ) * Revert "sycl: add usage of enqueue_functions extension (#14244)" This reverts commit `8308f98c7f`. * fix missed revert code, format the code	2025-09-12 09:15:12 +08:00