mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 23:57:40 +02:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 3fc4e10527 | |||
| 5d8ccdf9d1 | |||
| 024930c6ad | |||
| 5397c36194 | |||
| e7ea94afcb | |||
| 96183e9820 | |||
| 487a6cc164 | |||
| 5a6a0dd7e1 | |||
| ded1561b42 | |||
| 9df06805ee |
@@ -145,7 +145,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
# ==============================================================================
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
ENTRYPOINT [ "/app/llama-cli" ]
|
||||
|
||||
@@ -156,7 +156,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -115,7 +115,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -113,7 +113,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -124,7 +124,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -141,7 +141,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -153,7 +153,7 @@ FROM base AS server
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/lib/ /app
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -115,7 +115,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -126,7 +126,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
ARG OPENVINO_VERSION_MAJOR=2026.2
|
||||
ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
|
||||
ARG OPENVINO_VERSION_MAJOR=2026.2.1
|
||||
ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
|
||||
ARG IGC_VERSION=v2.34.4
|
||||
ARG IGC_VERSION_FULL=2_2.34.4+21428
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
|
||||
ARG IGC_VERSION=v2.36.3
|
||||
ARG IGC_VERSION_FULL=2_2.36.3+21719
|
||||
ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4
|
||||
ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0
|
||||
ARG IGDGMM_VERSION=22.10.0
|
||||
|
||||
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
|
||||
@@ -214,7 +214,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -225,7 +225,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app/
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app/
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -127,7 +127,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -138,7 +138,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ WORKDIR /llama.cpp/bin
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin
|
||||
|
||||
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
|
||||
|
||||
@@ -138,7 +138,7 @@ WORKDIR /llama.cpp/bin
|
||||
|
||||
# Copy llama.cpp binaries and libraries
|
||||
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
|
||||
COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-server /llama.cpp/bin
|
||||
|
||||
EXPOSE 8080
|
||||
|
||||
|
||||
@@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -118,7 +118,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@ ENTRYPOINT ["/app/tools.sh"]
|
||||
### Light, CLI only
|
||||
FROM base AS light
|
||||
|
||||
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -108,7 +108,7 @@ FROM base AS server
|
||||
|
||||
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||
|
||||
COPY --from=build /app/full/llama-server /app
|
||||
COPY --from=build /app/full/llama /app/full/llama-server /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -68,8 +68,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -96,8 +96,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
@@ -39,8 +39,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -96,8 +96,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
@@ -266,8 +266,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
|
||||
@@ -446,8 +446,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
|
||||
steps:
|
||||
- name: Set OpenVINO version output
|
||||
@@ -506,8 +506,11 @@ jobs:
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DGGML_OPENVINO=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }}
|
||||
cmake --build build/ReleaseOV --config Release -j $(nproc)
|
||||
-DCMAKE_INSTALL_RPATH='$ORIGIN' \
|
||||
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
|
||||
-DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \
|
||||
${{ env.CMAKE_ARGS }}
|
||||
cmake --build build/ReleaseOV --config Release --parallel
|
||||
|
||||
- name: ccache-clear
|
||||
uses: ./.github/actions/ccache-clear
|
||||
@@ -521,8 +524,26 @@ jobs:
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
run: |
|
||||
cp LICENSE ./build/ReleaseOV/bin/
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .
|
||||
dest=./build/ReleaseOV/bin
|
||||
OPENVINO_ROOT=./openvino_toolkit
|
||||
ov_lib="$OPENVINO_ROOT/runtime/lib/intel64"
|
||||
|
||||
# Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN
|
||||
# load these siblings without setupvars.sh / LD_LIBRARY_PATH.
|
||||
cp -P "$ov_lib"/libopenvino.so* \
|
||||
"$ov_lib"/libopenvino_c.so* \
|
||||
"$ov_lib"/libopenvino_*_plugin.so \
|
||||
"$ov_lib"/libopenvino_intel_npu_compiler*.so \
|
||||
"$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \
|
||||
"$dest"
|
||||
cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true
|
||||
cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true
|
||||
|
||||
# OpenVINO licensing
|
||||
cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing
|
||||
|
||||
cp LICENSE "$dest"
|
||||
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" .
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
@@ -538,8 +559,8 @@ jobs:
|
||||
|
||||
env:
|
||||
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
|
||||
OPENVINO_VERSION_MAJOR: "2026.2"
|
||||
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR: "2026.2.1"
|
||||
OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3"
|
||||
|
||||
steps:
|
||||
- name: Set OpenVINO version output
|
||||
@@ -607,7 +628,9 @@ jobs:
|
||||
-A x64 ^
|
||||
-DCMAKE_BUILD_TYPE=Release ^
|
||||
-DGGML_OPENVINO=ON ^
|
||||
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
|
||||
-DLLAMA_BUILD_BORINGSSL=ON ^
|
||||
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^
|
||||
${{ env.CMAKE_ARGS }}
|
||||
|
||||
cmake --build build\ReleaseOV --config Release -- /m
|
||||
|
||||
@@ -624,8 +647,29 @@ jobs:
|
||||
id: pack_artifacts
|
||||
shell: powershell
|
||||
run: |
|
||||
Copy-Item LICENSE .\build\ReleaseOV\bin\
|
||||
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*
|
||||
# Locate the extracted OpenVINO toolkit root (same pattern as the Build step).
|
||||
$OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName
|
||||
if (-not $OPENVINO_ROOT) {
|
||||
Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit"
|
||||
exit 1
|
||||
}
|
||||
|
||||
$dest = ".\build\ReleaseOV\bin\Release"
|
||||
|
||||
$ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release'
|
||||
Copy-Item -Path (Join-Path $ovBin '*.dll') -Destination $dest -Force
|
||||
Copy-Item -Path (Join-Path $ovBin 'cache.json') -Destination $dest -Force
|
||||
|
||||
$tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin'
|
||||
Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force
|
||||
|
||||
# OpenVINO licensing
|
||||
$licensingDest = Join-Path $dest 'openvino-licensing'
|
||||
New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null
|
||||
Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force
|
||||
|
||||
Copy-Item LICENSE $dest
|
||||
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\*
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v6
|
||||
|
||||
+45
-7
@@ -352,6 +352,8 @@ static std::string get_default_local_path(const std::string & url) {
|
||||
|
||||
common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) {
|
||||
common_download_hf_plan plan;
|
||||
common_download_hf_plan plan_spec;
|
||||
common_download_hf_plan plan_voc;
|
||||
common_download_opts opts;
|
||||
|
||||
const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
|
||||
@@ -377,7 +379,15 @@ common_models_handler common_models_handler_init(const common_params & params, l
|
||||
plan = common_download_get_hf_plan(params.model, opts);
|
||||
}
|
||||
|
||||
return common_models_handler{plan, opts};
|
||||
if (!params.speculative.draft.mparams.hf_repo.empty()) {
|
||||
plan_spec = common_download_get_hf_plan(params.speculative.draft.mparams, opts);
|
||||
}
|
||||
|
||||
if (!params.vocoder.model.hf_repo.empty()) {
|
||||
plan_voc = common_download_get_hf_plan(params.vocoder.model, opts);
|
||||
}
|
||||
|
||||
return common_models_handler{plan, plan_spec, plan_voc, opts};
|
||||
}
|
||||
|
||||
bool common_models_handler_is_preset_repo(const common_models_handler & handler) {
|
||||
@@ -425,7 +435,9 @@ static std::vector<common_download_task> build_url_tasks(const common_params_mod
|
||||
void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) {
|
||||
std::vector<common_download_task> tasks;
|
||||
|
||||
auto & plan = handler.plan;
|
||||
auto & plan = handler.plan;
|
||||
auto & plan_spec = handler.plan_spec;
|
||||
auto & plan_voc = handler.plan_voc;
|
||||
|
||||
auto opts = handler.opts; // copy
|
||||
opts.callback = callback;
|
||||
@@ -484,19 +496,22 @@ void common_models_handler_apply(common_models_handler & handler, common_params
|
||||
}
|
||||
|
||||
// handle hf_plan tasks
|
||||
if (!plan.model_files.empty()) {
|
||||
for (size_t i = 0; i < plan.model_files.size(); ++i) {
|
||||
auto & model_file = plan.model_files[i];
|
||||
auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) {
|
||||
for (size_t i = 0; i < model_files.size(); ++i) {
|
||||
auto & model_file = model_files[i];
|
||||
bool is_first = (i == 0);
|
||||
tasks.emplace_back(model_file, opts, [&, is_first]() {
|
||||
if (is_first) {
|
||||
// only use first part as model path
|
||||
params.model.path = hf_cache::finalize_file(model_file);
|
||||
model.path = hf_cache::finalize_file(model_file);
|
||||
} else {
|
||||
hf_cache::finalize_file(model_file);
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
if (!plan.model_files.empty()) {
|
||||
add_tasks(plan.model_files, params.model);
|
||||
}
|
||||
if (!plan.mmproj.local_path.empty()) {
|
||||
tasks.emplace_back(plan.mmproj, opts, [&]() {
|
||||
@@ -522,9 +537,31 @@ void common_models_handler_apply(common_models_handler & handler, common_params
|
||||
});
|
||||
}
|
||||
|
||||
// handle plan_spec (e.g. --spec-draft-hf)
|
||||
if (!plan_spec.model_files.empty()) {
|
||||
add_tasks(plan_spec.model_files, params.speculative.draft.mparams);
|
||||
}
|
||||
|
||||
// handle vocoder plan (e.g. --hf-repo-v)
|
||||
if (!plan_voc.model_files.empty()) {
|
||||
add_tasks(plan_voc.model_files, params.vocoder.model);
|
||||
}
|
||||
|
||||
// run all tasks in parallel
|
||||
if (!params.offline) {
|
||||
common_download_run_tasks(tasks);
|
||||
// if duplicated files are found, only download once (but still call on_done for each task)
|
||||
std::unordered_map<std::string, common_download_task *> unique_tasks;
|
||||
for (auto & task : tasks) {
|
||||
auto it = unique_tasks.find(task.local_path);
|
||||
if (it == unique_tasks.end()) {
|
||||
unique_tasks[task.local_path] = &task;
|
||||
}
|
||||
}
|
||||
std::vector<common_download_task> unique_tasks_vec;
|
||||
for (auto & pair : unique_tasks) {
|
||||
unique_tasks_vec.push_back(*pair.second);
|
||||
}
|
||||
common_download_run_tasks(unique_tasks_vec);
|
||||
}
|
||||
|
||||
// download successful, update params with the downloaded paths
|
||||
@@ -3711,6 +3748,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"draft model for speculative decoding (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.draft.mparams.path = value;
|
||||
params.speculative.draft.mparams.hf_file = value; // will be used if --spec-draft-hf is set
|
||||
}
|
||||
).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
|
||||
add_opt(common_arg(
|
||||
|
||||
@@ -133,6 +133,8 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
|
||||
|
||||
struct common_models_handler {
|
||||
common_download_hf_plan plan;
|
||||
common_download_hf_plan plan_spec;
|
||||
common_download_hf_plan plan_voc;
|
||||
common_download_opts opts;
|
||||
};
|
||||
|
||||
|
||||
@@ -237,8 +237,8 @@ chmod +x ubuntu-llamacpp-ov-install.sh
|
||||
# ============================================
|
||||
set -euo pipefail
|
||||
|
||||
OPENVINO_VERSION_MAJOR="2026.2"
|
||||
OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"
|
||||
OPENVINO_VERSION_MAJOR="2026.2.1"
|
||||
OPENVINO_VERSION_FULL="2026.2.1.21919.ede283a88e3"
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
|
||||
@@ -334,7 +334,7 @@ echo " ./build/ReleaseOV/bin/llama-cli -m model.gguf"
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
|
||||
> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
|
||||
|
||||
</details>
|
||||
|
||||
@@ -364,8 +364,8 @@ REM ============================================
|
||||
REM llama.cpp OpenVINO Build Script (Ninja)
|
||||
REM ============================================
|
||||
|
||||
set "OPENVINO_VERSION_MAJOR=2026.2"
|
||||
set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"
|
||||
set "OPENVINO_VERSION_MAJOR=2026.2.1"
|
||||
set "OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3"
|
||||
|
||||
set "SCRIPT_DIR=%~dp0"
|
||||
set "VCPKG_DIR=C:\vcpkg"
|
||||
@@ -547,7 +547,7 @@ endlocal
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
|
||||
> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
+1
-1
@@ -5,7 +5,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 15)
|
||||
set(GGML_VERSION_PATCH 2)
|
||||
set(GGML_VERSION_PATCH 3)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
|
||||
@@ -1551,6 +1551,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
int split_backend_id = split->backend_id;
|
||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||
|
||||
ggml_backend_synchronize(split_backend);
|
||||
|
||||
// copy the input tensors to the split backend
|
||||
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||
@@ -1561,15 +1563,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
} else if (!split_backend->iface.cpy_tensor_async) {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
ggml_backend_tensor_copy(input, input_cpy);
|
||||
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
|
||||
} else {
|
||||
// wait for the split backend to finish using the input before overwriting it
|
||||
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
|
||||
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
|
||||
} else {
|
||||
} else if (!split_backend->iface.cpy_tensor_async) {
|
||||
ggml_backend_synchronize(split_backend);
|
||||
}
|
||||
|
||||
@@ -1674,6 +1676,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_synchronize(split_backend);
|
||||
|
||||
if (!sched->callback_eval) {
|
||||
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
||||
if (ec != GGML_STATUS_SUCCESS) {
|
||||
|
||||
@@ -3192,11 +3192,24 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
||||
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||
|
||||
if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
|
||||
// Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA
|
||||
// Excluding this path for HIP and MUSA as a precaution.
|
||||
// According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways.
|
||||
// Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior.
|
||||
// e.g. https://github.com/ROCm/rocm-systems/issues/5109
|
||||
// It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA.
|
||||
// This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends.
|
||||
#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
|
||||
const bool copy_from_host = false;
|
||||
#else
|
||||
const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
#endif
|
||||
|
||||
if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) {
|
||||
if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3207,14 +3220,17 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context;
|
||||
ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context;
|
||||
|
||||
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
|
||||
if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) ||
|
||||
!copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
|
||||
#endif // NDEBUG
|
||||
return false;
|
||||
}
|
||||
|
||||
if (backend_src != backend_dst) {
|
||||
if (copy_from_host) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream()));
|
||||
} else if (backend_src != backend_dst) {
|
||||
// copy on src stream
|
||||
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
|
||||
|
||||
@@ -1270,77 +1270,14 @@ void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecode
|
||||
}
|
||||
|
||||
std::string GgmlOvDecoder::compute_op_type(const ggml_tensor * node) {
|
||||
static const std::map<ggml_op, std::string> ops = {
|
||||
{GGML_OP_NONE, "GGML_OP_NONE" },
|
||||
{GGML_OP_ACC, "GGML_OP_ACC" },
|
||||
{GGML_OP_ADD, "GGML_OP_ADD" },
|
||||
{GGML_OP_ADD1, "GGML_OP_ADD1" },
|
||||
{GGML_OP_ADD_ID, "GGML_OP_ADD_ID" },
|
||||
{GGML_OP_CONCAT, "GGML_OP_CONCAT" },
|
||||
{GGML_OP_CONT, "GGML_OP_CONT" },
|
||||
{GGML_OP_DIV, "GGML_OP_DIV" },
|
||||
{GGML_OP_DUP, "GGML_OP_DUP" },
|
||||
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" },
|
||||
{GGML_OP_MUL, "GGML_OP_MUL" },
|
||||
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" },
|
||||
{GGML_OP_MUL_MAT_ID, "GGML_OP_MUL_MAT_ID" },
|
||||
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE" },
|
||||
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE" },
|
||||
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" },
|
||||
{GGML_OP_NORM, "GGML_OP_NORM" },
|
||||
{GGML_OP_ROPE, "GGML_OP_ROPE" },
|
||||
{GGML_OP_SCALE, "GGML_OP_SCALE" },
|
||||
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" },
|
||||
{GGML_OP_SUM_ROWS, "GGML_OP_SUM_ROWS" },
|
||||
{GGML_OP_SUB, "GGML_OP_SUB" },
|
||||
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" },
|
||||
{GGML_OP_VIEW, "GGML_OP_VIEW" },
|
||||
{GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" },
|
||||
{GGML_OP_CPY, "GGML_OP_CPY" },
|
||||
{GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT" },
|
||||
{GGML_OP_L2_NORM, "GGML_OP_L2_NORM" },
|
||||
{GGML_OP_CLAMP, "GGML_OP_CLAMP" },
|
||||
{GGML_OP_PAD, "GGML_OP_PAD" },
|
||||
{GGML_OP_SSM_CONV, "GGML_OP_SSM_CONV" },
|
||||
{GGML_OP_GATED_DELTA_NET, "GGML_OP_GATED_DELTA_NET"},
|
||||
{GGML_OP_ARGSORT, "GGML_OP_ARGSORT" },
|
||||
{GGML_OP_REPEAT, "GGML_OP_REPEAT" },
|
||||
{GGML_OP_IM2COL, "GGML_OP_IM2COL" }
|
||||
};
|
||||
static const std::map<ggml_unary_op, std::string> unary_ops = {
|
||||
{GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" },
|
||||
{GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" },
|
||||
{GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" },
|
||||
{GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" },
|
||||
{GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" },
|
||||
{GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" },
|
||||
{GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" },
|
||||
{GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" },
|
||||
{GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" },
|
||||
{GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" },
|
||||
{GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" },
|
||||
{GGML_UNARY_OP_SOFTPLUS, "GGML_UNARY_OP_SOFTPLUS" },
|
||||
{GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" },
|
||||
{GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"},
|
||||
{GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" },
|
||||
{GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" }
|
||||
};
|
||||
static const std::map<ggml_glu_op, std::string> glu_ops = {
|
||||
{GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"},
|
||||
{GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" },
|
||||
{GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" }
|
||||
};
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_UNARY:
|
||||
return unary_ops.at(ggml_get_unary_op(node));
|
||||
return std::string("GGML_UNARY_OP_") + ggml_unary_op_name(ggml_get_unary_op(node));
|
||||
case GGML_OP_GLU:
|
||||
return glu_ops.at(ggml_get_glu_op(node));
|
||||
return std::string("GGML_GLU_OP_") + ggml_glu_op_name(ggml_get_glu_op(node));
|
||||
default:
|
||||
return ops.at(node->op);
|
||||
return std::string("GGML_OP_") + ggml_op_name(node->op);
|
||||
}
|
||||
static const std::string unknown_op = "UNKNOWN_GGML_OP";
|
||||
return unknown_op;
|
||||
}
|
||||
|
||||
const std::string & GgmlOvDecoder::get_op_type(int node_idx) const {
|
||||
|
||||
@@ -17,6 +17,22 @@ namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
static ov::Output<ov::Node> reshape_add_id_input_to_2d(const ov::Output<ov::Node> & input,
|
||||
const ov::PartialShape & input_shape,
|
||||
const std::vector<int> & dims) {
|
||||
const auto actual_shape = input.get_partial_shape();
|
||||
if (actual_shape.rank().is_static() && actual_shape.rank().get_length() == 2) {
|
||||
return input;
|
||||
}
|
||||
|
||||
if (input_shape.rank().is_static() && input_shape.rank().get_length() == 2) {
|
||||
return input;
|
||||
}
|
||||
|
||||
auto shape = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
|
||||
return std::make_shared<ov::op::v1::Reshape>(input, get_dimensions(shape, dims), false);
|
||||
}
|
||||
|
||||
OutputVector translate_add_id(const NodeContext & context) {
|
||||
num_inputs_check(context, 3, 3);
|
||||
|
||||
@@ -28,11 +44,9 @@ OutputVector translate_add_id(const NodeContext & context) {
|
||||
// input: [1, n_token, n_used, n_embd]
|
||||
// bias: [1, 1, n_expert, n_embd]
|
||||
// ids: [1, 1, n_token, n_used]
|
||||
auto bias_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(bias, ov::element::i64);
|
||||
auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
|
||||
|
||||
bias = std::make_shared<ov::op::v1::Reshape>(bias, get_dimensions(bias_shape_4d, {2, 3}), false);
|
||||
ids = std::make_shared<ov::op::v1::Reshape>(ids, get_dimensions(ids_shape_4d, {2, 3}), false);
|
||||
// Model bias constants may already be stored as [n_expert, n_embd].
|
||||
bias = reshape_add_id_input_to_2d(bias, context.get_input_shape(1), {2, 3});
|
||||
ids = reshape_add_id_input_to_2d(ids, context.get_input_shape(2), {2, 3});
|
||||
|
||||
if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
|
||||
ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
|
||||
|
||||
@@ -3,8 +3,11 @@
|
||||
#include "../utils.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/clamp.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/sigmoid.hpp>
|
||||
@@ -15,7 +18,7 @@ namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
static std::pair<ov::Output<ov::Node>, ov::Output<ov::Node>> get_glu_inputs(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 2);
|
||||
|
||||
ov::Output<ov::Node> src0;
|
||||
@@ -52,6 +55,12 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
std::swap(src0, src1);
|
||||
}
|
||||
|
||||
return {src0, src1};
|
||||
}
|
||||
|
||||
OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
auto [src0, src1] = get_glu_inputs(context);
|
||||
|
||||
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(src0);
|
||||
auto silu = std::make_shared<ov::op::v1::Multiply>(src0, sigmoid);
|
||||
auto res = std::make_shared<ov::op::v1::Multiply>(silu, src1);
|
||||
@@ -59,6 +68,27 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
OutputVector translate_glu_swiglu_oai(const NodeContext & context) {
|
||||
auto [src0, src1] = get_glu_inputs(context);
|
||||
|
||||
const int32_t * params = context.get_output_op_params();
|
||||
const float alpha = reinterpret_cast<const float *>(params)[2];
|
||||
const float limit = reinterpret_cast<const float *>(params)[3];
|
||||
|
||||
auto gate = std::make_shared<ov::op::v0::Clamp>(src0, -std::numeric_limits<float>::infinity(), limit);
|
||||
auto alpha_const = ov::op::v0::Constant::create(ov::element::f32, {}, {alpha});
|
||||
auto scaled_gate = std::make_shared<ov::op::v1::Multiply>(gate, alpha_const);
|
||||
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(scaled_gate);
|
||||
auto out_glu = std::make_shared<ov::op::v1::Multiply>(gate, sigmoid);
|
||||
|
||||
auto up = std::make_shared<ov::op::v0::Clamp>(src1, -limit, limit);
|
||||
auto one = ov::op::v0::Constant::create(ov::element::f32, {}, {1.0f});
|
||||
auto up_plus_one = std::make_shared<ov::op::v1::Add>(up, one);
|
||||
auto res = std::make_shared<ov::op::v1::Multiply>(out_glu, up_plus_one);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
} // namespace op
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
|
||||
@@ -2,23 +2,135 @@
|
||||
#include "../op_table.h"
|
||||
#include "../utils.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <openvino/op/bitwise_and.hpp>
|
||||
#include <openvino/op/bitwise_right_shift.hpp>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/gather.hpp>
|
||||
#include <openvino/op/matmul.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <vector>
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
namespace {
|
||||
|
||||
std::shared_ptr<ov::op::v0::Constant> const_i64(const std::vector<int64_t> & values) {
|
||||
return ov::op::v0::Constant::create(ov::element::i64, ov::Shape{values.size()}, values);
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> slice_axis(const ov::Output<ov::Node> & input, int64_t axis, int64_t begin, int64_t end) {
|
||||
return std::make_shared<ov::op::v8::Slice>(input, const_i64({begin}), const_i64({end}), const_i64({1}),
|
||||
const_i64({axis}));
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> translate_mul_mat_id_mxfp4_packed(const NodeContext & context,
|
||||
ov::Output<ov::Node> expert_weights,
|
||||
ov::Output<ov::Node> activations,
|
||||
ov::Output<ov::Node> ids) {
|
||||
auto packed_shape = expert_weights.get_partial_shape().to_shape();
|
||||
FRONT_END_OP_CONVERSION_CHECK(packed_shape.size() == 5 && packed_shape[4] == 17,
|
||||
"Expected packed MXFP4 expert weights with shape [1, n_expert, m, k_blocks, 17]");
|
||||
|
||||
const int64_t n_expert = static_cast<int64_t>(packed_shape[1]);
|
||||
const int64_t rows = static_cast<int64_t>(packed_shape[2]);
|
||||
const int64_t k_blocks = static_cast<int64_t>(packed_shape[3]);
|
||||
const int64_t qk = 32;
|
||||
const int64_t cols = k_blocks * qk;
|
||||
|
||||
auto packed_shape_4d = const_i64({n_expert, rows, k_blocks, 17});
|
||||
expert_weights = std::make_shared<ov::op::v1::Reshape>(expert_weights, packed_shape_4d, false);
|
||||
|
||||
auto activations_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
|
||||
auto ids_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
|
||||
auto activations_shape_3d = get_dimensions(activations_shape_4d, {1, 2, 3});
|
||||
auto ids_shape_2d = get_dimensions(ids_shape_4d, {2, 3});
|
||||
|
||||
activations = std::make_shared<ov::op::v1::Reshape>(activations, activations_shape_3d, false);
|
||||
ids = std::make_shared<ov::op::v1::Reshape>(ids, ids_shape_2d, false);
|
||||
if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) {
|
||||
ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
|
||||
}
|
||||
|
||||
auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
||||
|
||||
static const std::vector<float> f4e2m1_lut = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f,
|
||||
-0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
|
||||
std::vector<float> e8m0_lut(256);
|
||||
for (size_t i = 0; i < e8m0_lut.size(); ++i) {
|
||||
uint32_t bits = static_cast<uint32_t>(i) << 23;
|
||||
memcpy(&e8m0_lut[i], &bits, sizeof(float));
|
||||
}
|
||||
e8m0_lut[0] = std::numeric_limits<float>::min() / 2.0f;
|
||||
e8m0_lut[255] = std::numeric_limits<float>::quiet_NaN();
|
||||
|
||||
auto f4_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{f4e2m1_lut.size()}, f4e2m1_lut);
|
||||
auto scale_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{e8m0_lut.size()}, e8m0_lut);
|
||||
|
||||
auto selected_packed_weights = std::make_shared<ov::op::v8::Gather>(expert_weights, ids, gather_axis);
|
||||
auto scale_byte = slice_axis(selected_packed_weights, 4, 0, 1);
|
||||
auto qs = slice_axis(selected_packed_weights, 4, 1, 17);
|
||||
auto low = std::make_shared<ov::op::v13::BitwiseAnd>(
|
||||
qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {0x0F}), ov::op::AutoBroadcastType::NUMPY);
|
||||
auto high_shift = std::make_shared<ov::op::v15::BitwiseRightShift>(
|
||||
qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {4}), ov::op::AutoBroadcastType::NUMPY);
|
||||
auto nibbles = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{low, high_shift}, 4);
|
||||
auto nibble_indices = std::make_shared<ov::op::v0::Convert>(nibbles, ov::element::i32);
|
||||
auto weights_f32 = std::make_shared<ov::op::v8::Gather>(f4_lut, nibble_indices, gather_axis);
|
||||
|
||||
auto scale_indices = std::make_shared<ov::op::v0::Convert>(scale_byte, ov::element::i32);
|
||||
auto scales_f32 = std::make_shared<ov::op::v8::Gather>(scale_lut, scale_indices, gather_axis);
|
||||
ov::Output<ov::Node> selected_weights = std::make_shared<ov::op::v1::Multiply>(weights_f32, scales_f32,
|
||||
ov::op::AutoBroadcastType::NUMPY);
|
||||
|
||||
auto ids_shape = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
|
||||
auto selected_weights_target_dims = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{get_dimensions(ids_shape, {0, 1}), const_i64({rows, cols})}, 0);
|
||||
selected_weights = std::make_shared<ov::op::v1::Reshape>(selected_weights, selected_weights_target_dims, false);
|
||||
|
||||
auto activations_shape = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
|
||||
ov::Output<ov::Node> acts_target_dims = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{
|
||||
get_dimensions(activations_shape, {0}),
|
||||
get_dimensions(ids_shape, {1}),
|
||||
get_dimensions(activations_shape, {2}),
|
||||
},
|
||||
0);
|
||||
ov::Output<ov::Node> acts_broadcasted =
|
||||
std::make_shared<ov::op::v3::Broadcast>(activations, acts_target_dims, ov::op::BroadcastType::BIDIRECTIONAL);
|
||||
|
||||
auto activations_expanded = std::make_shared<ov::op::v0::Unsqueeze>(acts_broadcasted, const_i64({2}));
|
||||
ov::Output<ov::Node> result =
|
||||
std::make_shared<ov::op::v0::MatMul>(activations_expanded, selected_weights, false, true);
|
||||
|
||||
auto batch_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto row_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {rows});
|
||||
auto result_target_dims = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{batch_dim, get_dimensions(ids_shape, {0, 1}), row_dim}, 0);
|
||||
result = std::make_shared<ov::op::v1::Reshape>(result, result_target_dims, false);
|
||||
|
||||
const auto output_type = context.get_output_type();
|
||||
if (result.get_element_type() != output_type) {
|
||||
result = std::make_shared<ov::op::v0::Convert>(result, output_type);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
OutputVector translate_mul_mat_id(const NodeContext & context) {
|
||||
num_inputs_check(context, 3, 3);
|
||||
|
||||
@@ -26,6 +138,12 @@ OutputVector translate_mul_mat_id(const NodeContext & context) {
|
||||
auto activations = process_view_input_new(context, 1);
|
||||
auto ids = process_view_input_new(context, 2);
|
||||
|
||||
if (expert_weights.get_element_type() == ov::element::u8 && expert_weights.get_partial_shape().rank().is_static() &&
|
||||
expert_weights.get_partial_shape().rank().get_length() == 5) {
|
||||
return rename_outputs_with_suffix({translate_mul_mat_id_mxfp4_packed(context, expert_weights, activations, ids)},
|
||||
context.get_name());
|
||||
}
|
||||
|
||||
// OpenVINO sees GGML tensors in reversed dimension order:
|
||||
// weights: [1, n_expert, m, k]
|
||||
// activations: [1, n_tokens, n_used_or_1, k]
|
||||
|
||||
@@ -6,12 +6,16 @@
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <openvino/op/broadcast.hpp>
|
||||
#include <openvino/frontend/exception.hpp>
|
||||
#include <openvino/op/add.hpp>
|
||||
#include <openvino/op/concat.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/multiply.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/shape_of.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/softmax.hpp>
|
||||
#include <vector>
|
||||
|
||||
@@ -20,12 +24,31 @@ namespace frontend {
|
||||
namespace ggml {
|
||||
namespace op {
|
||||
|
||||
static bool is_static_one(const ov::Dimension & dim) {
|
||||
return dim.is_static() && dim.get_length() == 1;
|
||||
}
|
||||
|
||||
static bool same_static_dim(const ov::Dimension & lhs, const ov::Dimension & rhs) {
|
||||
return lhs.is_static() && rhs.is_static() && lhs.get_length() == rhs.get_length();
|
||||
}
|
||||
|
||||
static bool is_attention_sinks_input_shape(const ov::PartialShape & candidate, const ov::PartialShape & logits_shape) {
|
||||
if (candidate.rank().is_dynamic() || logits_shape.rank().is_dynamic() || candidate.rank().get_length() != 4 ||
|
||||
logits_shape.rank().get_length() != 4) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return is_static_one(candidate[0]) && is_static_one(candidate[1]) && is_static_one(candidate[2]) &&
|
||||
same_static_dim(candidate[3], logits_shape[1]);
|
||||
}
|
||||
|
||||
// Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend:
|
||||
// 1) logits = src0 * scale
|
||||
// 2) logits += mask (if provided)
|
||||
// 3) softmax over the last dimension
|
||||
// 3) append attention sinks as hidden logits (if provided)
|
||||
// 4) softmax over the last dimension and remove the hidden sink column
|
||||
OutputVector translate_soft_max(const NodeContext & context) {
|
||||
num_inputs_check(context, 1, 2);
|
||||
num_inputs_check(context, 1, 3);
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
@@ -33,6 +56,11 @@ OutputVector translate_soft_max(const NodeContext & context) {
|
||||
memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float));
|
||||
|
||||
ov::Output<ov::Node> logits = context.get_input(0);
|
||||
const bool second_input_is_sinks =
|
||||
context.get_input_size() == 2 && is_attention_sinks_input_shape(context.get_input_shape(1), context.get_output_shape());
|
||||
const bool has_mask = context.get_input_size() > 1 && !second_input_is_sinks;
|
||||
const bool has_sinks = second_input_is_sinks || context.get_input_size() > 2;
|
||||
const size_t sinks_input_idx = second_input_is_sinks ? 1 : 2;
|
||||
|
||||
// Apply scale first: logits = src0 * scale
|
||||
if (scale != 1.0f) {
|
||||
@@ -41,12 +69,12 @@ OutputVector translate_soft_max(const NodeContext & context) {
|
||||
logits = std::make_shared<ov::op::v1::Multiply>(logits, scale_const);
|
||||
}
|
||||
|
||||
FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2),
|
||||
FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && !has_mask),
|
||||
"OpenVINO softmax ALiBi path requires mask input");
|
||||
|
||||
// Optional mask add: logits += mask
|
||||
// For max_bias > 0 (ALiBi), apply per-head slope to mask before adding.
|
||||
if (context.get_input_size() > 1) {
|
||||
if (has_mask) {
|
||||
ov::Output<ov::Node> mask = context.get_input(1);
|
||||
|
||||
// For stateful
|
||||
@@ -94,8 +122,40 @@ OutputVector translate_soft_max(const NodeContext & context) {
|
||||
logits = std::make_shared<ov::op::v1::Add>(logits, mask);
|
||||
}
|
||||
|
||||
ov::Output<ov::Node> softmax_input = logits;
|
||||
if (has_sinks) {
|
||||
ov::Output<ov::Node> sinks = context.get_input(sinks_input_idx);
|
||||
if (sinks.get_element_type() != logits.get_element_type()) {
|
||||
sinks = std::make_shared<ov::op::v0::Convert>(sinks, logits.get_element_type());
|
||||
}
|
||||
|
||||
auto sink_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, {1, -1, 1, 1});
|
||||
auto sinks_4d = std::make_shared<ov::op::v1::Reshape>(sinks, sink_shape, false);
|
||||
|
||||
auto logits_shape = std::make_shared<ov::op::v3::ShapeOf>(logits, ov::element::i64);
|
||||
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
|
||||
auto four = ov::op::v0::Constant::create(ov::element::i64, {1}, {4});
|
||||
auto shape_axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
|
||||
|
||||
auto sink_prefix_shape = std::make_shared<ov::op::v8::Slice>(logits_shape, zero, three, one, shape_axis);
|
||||
auto sink_last_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
|
||||
auto sink_broadcast_shape = std::make_shared<ov::op::v0::Concat>(
|
||||
ov::OutputVector{sink_prefix_shape, sink_last_dim}, 0);
|
||||
auto sink_column = std::make_shared<ov::op::v3::Broadcast>(sinks_4d, sink_broadcast_shape,
|
||||
ov::op::BroadcastType::BIDIRECTIONAL);
|
||||
softmax_input = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{logits, sink_column}, 3);
|
||||
|
||||
auto softmax_with_sink = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);
|
||||
auto original_last_dim = std::make_shared<ov::op::v8::Slice>(logits_shape, three, four, one, shape_axis);
|
||||
auto res = std::make_shared<ov::op::v8::Slice>(softmax_with_sink, zero, original_last_dim, one, three);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
// Softmax along last dimension (equivalent to ggml softmax over ne[0]).
|
||||
auto res = std::make_shared<ov::op::v8::Softmax>(logits, -1);
|
||||
auto res = std::make_shared<ov::op::v8::Softmax>(softmax_input, -1);
|
||||
|
||||
return rename_outputs_with_suffix({res}, context.get_name());
|
||||
}
|
||||
|
||||
@@ -47,6 +47,7 @@ std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
|
||||
{"GGML_UNARY_OP_TANH", op::translate_1to1_match_1_input<v0::Tanh> },
|
||||
{"GGML_OP_VIEW", op::translate_view },
|
||||
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
|
||||
{"GGML_GLU_OP_SWIGLU_OAI", op::translate_glu_swiglu_oai },
|
||||
{"GGML_GLU_OP_GEGLU", op::translate_glu_geglu },
|
||||
{"GGML_OP_SET_ROWS", op::translate_set_rows },
|
||||
{"GGML_OP_CPY", op::translate_cpy },
|
||||
|
||||
@@ -32,6 +32,7 @@ GGML_OP_CONVERTER(translate_soft_max);
|
||||
GGML_OP_CONVERTER(translate_transpose);
|
||||
GGML_OP_CONVERTER(translate_view);
|
||||
GGML_OP_CONVERTER(translate_glu_swiglu);
|
||||
GGML_OP_CONVERTER(translate_glu_swiglu_oai);
|
||||
GGML_OP_CONVERTER(translate_glu_geglu);
|
||||
GGML_OP_CONVERTER(translate_set_rows);
|
||||
GGML_OP_CONVERTER(translate_cpy);
|
||||
|
||||
@@ -308,6 +308,7 @@ enum vk_device_architecture {
|
||||
AMD_RDNA1,
|
||||
AMD_RDNA2,
|
||||
AMD_RDNA3,
|
||||
INTEL_XE1,
|
||||
INTEL_XE2,
|
||||
NVIDIA_PRE_TURING,
|
||||
NVIDIA_TURING,
|
||||
@@ -365,21 +366,26 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||
|
||||
bool subgroup_size_control = false;
|
||||
bool integer_dot_product = false;
|
||||
|
||||
for (const auto& properties : ext_props) {
|
||||
if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
||||
subgroup_size_control = true;
|
||||
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
|
||||
integer_dot_product = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!subgroup_size_control) {
|
||||
if (!subgroup_size_control || !integer_dot_product) {
|
||||
return vk_device_architecture::OTHER;
|
||||
}
|
||||
|
||||
vk::PhysicalDeviceProperties2 props2;
|
||||
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
||||
vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
|
||||
|
||||
props2.pNext = &subgroup_size_control_props;
|
||||
subgroup_size_control_props.pNext = &integer_dot_props;
|
||||
device.getProperties2(&props2);
|
||||
|
||||
if (subgroup_size_control_props.minSubgroupSize == 16) {
|
||||
@@ -388,6 +394,9 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
||||
// https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
|
||||
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
||||
return vk_device_architecture::INTEL_XE2;
|
||||
} else if (subgroup_size_control_props.minSubgroupSize == 8 &&
|
||||
integer_dot_product && integer_dot_props.integerDotProduct4x8BitPackedSignedAccelerated) {
|
||||
return vk_device_architecture::INTEL_XE1;
|
||||
}
|
||||
} else if (props.vendorID == VK_VENDOR_ID_NVIDIA) {
|
||||
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
||||
@@ -3837,7 +3846,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 };
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) {
|
||||
} else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support) {
|
||||
// Xe2/Xe3 with coopmat enabled - warptile performance tuning
|
||||
l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 };
|
||||
@@ -4710,7 +4719,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
|
||||
}
|
||||
uint32_t rm_iq = 2 * rm_kq;
|
||||
|
||||
const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN;
|
||||
const bool use_subgroups = device->subgroup_arithmetic;
|
||||
// Ensure a subgroup size >= 16 is available
|
||||
const bool use_subgroups16 = use_subgroups && subgroup_min_size_16;
|
||||
|
||||
@@ -6361,9 +6370,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||
break;
|
||||
case VK_VENDOR_ID_INTEL: {
|
||||
// Current Windows driver does not expose BF16 support.
|
||||
// We only want to use l_warptile if coopmat is available and is Xe2+
|
||||
const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2;
|
||||
const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat;
|
||||
// We only want to use l_warptile if coopmat is available
|
||||
const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && device->coopmat_support) : device->coopmat_support;
|
||||
device->mul_mat_l[i] = use_l_warptile;
|
||||
device->mul_mat_id_l[i] = use_l_warptile;
|
||||
device->mul_mat_m[i] = true;
|
||||
@@ -17890,9 +17898,9 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) {
|
||||
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
|
||||
switch (props.vendorID) {
|
||||
case VK_VENDOR_ID_INTEL:
|
||||
// Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
|
||||
// while some older hardware (ex. Arc A770) has performance regressions
|
||||
return arch == vk_device_architecture::INTEL_XE2;
|
||||
// Only allowing Xe2/Xe3 GPU and integrated Xe GPUs at the moment since older hardware (ex. Arc A770) has performance regressions.
|
||||
return (arch == vk_device_architecture::INTEL_XE2) ||
|
||||
(arch == vk_device_architecture::INTEL_XE1 && props.deviceType == vk::PhysicalDeviceType::eIntegratedGpu && driver_props.driverID == vk::DriverId::eIntelProprietaryWindows);
|
||||
case VK_VENDOR_ID_AMD:
|
||||
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
|
||||
// Workaround for AMD proprietary driver reporting support on all GPUs
|
||||
@@ -17940,6 +17948,8 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev)
|
||||
case 0xE20B: // B580
|
||||
case 0xE211: // Pro B60
|
||||
return 20;
|
||||
case 0xB080: // PTL Xe3 LPG 2x6 (12 subslices)
|
||||
return 12;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -158,7 +158,7 @@ const uint32_t Csh_stride = BS_NPQ;
|
||||
#ifdef COOPMAT
|
||||
const uint32_t Csh_len = BS_K * Csh_stride;
|
||||
#else
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug
|
||||
#endif
|
||||
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
|
||||
#endif
|
||||
|
||||
@@ -144,7 +144,7 @@ const uint32_t Csh_stride = BS_NPQ;
|
||||
#ifdef COOPMAT
|
||||
const uint32_t Csh_len = BS_K * Csh_stride;
|
||||
#else
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1;
|
||||
const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug
|
||||
#endif
|
||||
shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ
|
||||
#endif
|
||||
|
||||
@@ -28,13 +28,10 @@ vec2 cache_b_ds;
|
||||
|
||||
#include "mul_mat_vecq_funcs.glsl"
|
||||
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i) {
|
||||
void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint col, const uint b_qs_idx) {
|
||||
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
|
||||
const uint col = i*BLOCK_SIZE + tid*K_PER_ITER;
|
||||
|
||||
// Preload data_b block
|
||||
const uint b_block_idx = (j*p.batch_stride_b + col) / QUANT_K_Q8_1 + b_offset;
|
||||
const uint b_qs_idx = tid % (32 / K_PER_ITER);
|
||||
const uint b_block_idx_outer = b_block_idx / 4;
|
||||
const uint b_block_idx_inner = b_block_idx % 4;
|
||||
cache_b_ds = vec2(data_b[b_block_idx_outer].ds[b_block_idx_inner]);
|
||||
@@ -91,35 +88,35 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
|
||||
}
|
||||
}
|
||||
|
||||
uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE);
|
||||
if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) {
|
||||
const uint col_stride = K_PER_ITER * BLOCK_SIZE;
|
||||
uint num_iters = p.ncols / col_stride;
|
||||
if (num_iters * col_stride + K_PER_ITER * tid < p.ncols) {
|
||||
num_iters++;
|
||||
}
|
||||
int unroll_count = 4;
|
||||
uint unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
uint i = 0;
|
||||
while (i < unrolled_iters) {
|
||||
const uint b_qs_idx = tid % (32 / K_PER_ITER);
|
||||
uint col = tid * K_PER_ITER;
|
||||
while (num_iters >= 4) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
[[unroll]] for (uint k = 0; k < 4; ++k) {
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
col += col_stride;
|
||||
}
|
||||
|
||||
num_iters -= 4;
|
||||
}
|
||||
|
||||
unroll_count = 2;
|
||||
unrolled_iters = num_iters & ~(unroll_count - 1);
|
||||
|
||||
while (i < unrolled_iters) {
|
||||
if (num_iters >= 2) {
|
||||
// Manually partially unroll the loop
|
||||
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
}
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
col += col_stride;
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
col += col_stride;
|
||||
num_iters -= 2;
|
||||
}
|
||||
while (i < num_iters) {
|
||||
iter(temp, first_row, num_rows, tid, i*K_PER_ITER);
|
||||
i++;
|
||||
|
||||
if (num_iters > 0) {
|
||||
iter(temp, first_row, num_rows, col, b_qs_idx);
|
||||
}
|
||||
|
||||
reduce_result(temp, d_offset, first_row, num_rows, tid);
|
||||
|
||||
@@ -1 +1 @@
|
||||
707321c4cf6d21cb4bc831aa8b687dbf01a521ce
|
||||
eced84c86f8b012c752c016f7fe789adea168e1e
|
||||
|
||||
@@ -7973,6 +7973,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
||||
test_cases.emplace_back(new test_conv_2d({ 256, 256, 192, 1 }, { 3, 3, 192, 96 }, kernel_type, 1, 1, 1, 1, 1, 1, false));
|
||||
}
|
||||
|
||||
// sycl backend will limit task global_range < MAX_INT
|
||||
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
||||
|
||||
+1
-1
@@ -33,7 +33,7 @@
|
||||
|
||||
{#if !readonly && onRemove}
|
||||
<div
|
||||
class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-hover:opacity-100"
|
||||
class="absolute top-10 right-2 flex items-center justify-center opacity-0 transition-opacity group-focus-within:opacity-100 group-hover:opacity-100"
|
||||
>
|
||||
<ActionIcon icon={X} tooltip="Remove" stopPropagationOnClick onclick={() => onRemove?.()} />
|
||||
</div>
|
||||
|
||||
+1
-1
@@ -56,7 +56,7 @@
|
||||
<div class="relative flex h-6 items-center justify-between">
|
||||
<div class="right-0 flex items-center gap-2 opacity-100 transition-opacity">
|
||||
<div
|
||||
class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-hover:opacity-100"
|
||||
class="pointer-events-auto inset-0 flex items-center gap-1 opacity-0 transition-all duration-150 group-focus-within:opacity-100 group-hover:opacity-100"
|
||||
>
|
||||
<ActionIcon icon={Edit} tooltip="Edit" onclick={editCtx.handleEdit} />
|
||||
<ActionIcon icon={Trash2} tooltip="Delete" onclick={onDelete} />
|
||||
|
||||
+56
-81
@@ -39,7 +39,6 @@
|
||||
depth = 0
|
||||
}: Props = $props();
|
||||
|
||||
let renderActionsDropdown = $state(false);
|
||||
let dropdownOpen = $state(false);
|
||||
|
||||
let isLoading = $derived(getAllLoadingChats().includes(conversation.id));
|
||||
@@ -71,26 +70,10 @@
|
||||
}
|
||||
}
|
||||
|
||||
function handleMouseLeave() {
|
||||
if (!dropdownOpen) {
|
||||
renderActionsDropdown = false;
|
||||
}
|
||||
}
|
||||
|
||||
function handleMouseOver() {
|
||||
renderActionsDropdown = true;
|
||||
}
|
||||
|
||||
function handleSelect() {
|
||||
onSelect?.(conversation.id);
|
||||
}
|
||||
|
||||
$effect(() => {
|
||||
if (!dropdownOpen) {
|
||||
renderActionsDropdown = false;
|
||||
}
|
||||
});
|
||||
|
||||
onMount(() => {
|
||||
document.addEventListener('edit-active-conversation', handleGlobalEditEvent as EventListener);
|
||||
|
||||
@@ -103,23 +86,19 @@
|
||||
});
|
||||
</script>
|
||||
|
||||
<!-- svelte-ignore a11y_mouse_events_have_key_events -->
|
||||
<button
|
||||
class="group flex min-h-9 w-full cursor-pointer items-center justify-between space-x-3 rounded-lg py-1.5 text-left transition-colors hover:bg-foreground/10 {isActive
|
||||
<div
|
||||
class="conversation-item group relative flex min-h-9 w-full items-center justify-between space-x-3 rounded-lg py-1.5 transition-colors hover:bg-foreground/10 {isActive
|
||||
? 'bg-foreground/5 text-accent-foreground'
|
||||
: ''} px-3"
|
||||
onclick={handleSelect}
|
||||
onmouseover={handleMouseOver}
|
||||
onmouseleave={handleMouseLeave}
|
||||
onfocusin={handleMouseOver}
|
||||
onfocusout={(e) => {
|
||||
if (!e.currentTarget.contains(e.relatedTarget as Node | null)) {
|
||||
handleMouseLeave();
|
||||
}
|
||||
}}
|
||||
>
|
||||
<button
|
||||
class="absolute inset-0 z-0 cursor-pointer rounded-lg focus:outline-none focus-visible:ring-2 focus-visible:ring-ring"
|
||||
onclick={handleSelect}
|
||||
aria-label={conversation.name}
|
||||
>
|
||||
</button>
|
||||
<div
|
||||
class="flex min-w-0 flex-1 items-center gap-2"
|
||||
class="pointer-events-none relative z-10 flex min-w-0 flex-1 items-center gap-2"
|
||||
style:padding-left="{depth * FORK_TREE_DEPTH_PADDING}px"
|
||||
>
|
||||
{#if depth > 0}
|
||||
@@ -130,7 +109,7 @@
|
||||
<a
|
||||
{...props}
|
||||
href={RouterService.chat(conversation.forkedFromConversationId)}
|
||||
class="flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
|
||||
class="pointer-events-auto flex shrink-0 items-center text-muted-foreground transition-colors hover:text-foreground"
|
||||
>
|
||||
<GitBranch class="h-3.5 w-3.5" />
|
||||
</a>
|
||||
@@ -146,18 +125,15 @@
|
||||
{#if isLoading}
|
||||
<Tooltip.Root>
|
||||
<Tooltip.Trigger>
|
||||
<div
|
||||
class="stop-button flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
|
||||
<button
|
||||
class="stop-button pointer-events-auto flex h-4 w-4 shrink-0 cursor-pointer items-center justify-center rounded text-muted-foreground transition-colors hover:text-foreground"
|
||||
onclick={handleStop}
|
||||
onkeydown={(e) => e.key === 'Enter' && handleStop(e)}
|
||||
role="button"
|
||||
tabindex="0"
|
||||
aria-label="Stop generation"
|
||||
>
|
||||
<Loader2 class="loading-icon h-3.5 w-3.5 animate-spin" />
|
||||
|
||||
<Square class="stop-icon hidden h-3 w-3 fill-current text-destructive" />
|
||||
</div>
|
||||
</button>
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content>
|
||||
@@ -169,52 +145,50 @@
|
||||
<TruncatedText text={conversation.name} class="text-sm font-medium" showTooltip={false} />
|
||||
</div>
|
||||
|
||||
{#if renderActionsDropdown}
|
||||
<div class="actions flex items-center">
|
||||
<DropdownMenuActions
|
||||
triggerIcon={MoreHorizontal}
|
||||
triggerTooltip="More actions"
|
||||
bind:open={dropdownOpen}
|
||||
actions={[
|
||||
{
|
||||
icon: conversation.pinned ? PinOff : Pin,
|
||||
label: conversation.pinned ? 'Unpin' : 'Pin',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
handleTogglePin();
|
||||
}
|
||||
},
|
||||
{
|
||||
icon: Pencil,
|
||||
label: 'Edit',
|
||||
onclick: handleEdit,
|
||||
shortcut: ['shift', 'cmd', 'e']
|
||||
},
|
||||
{
|
||||
icon: Download,
|
||||
label: 'Export',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
conversationsStore.downloadConversation(conversation.id);
|
||||
},
|
||||
shortcut: ['shift', 'cmd', 's']
|
||||
},
|
||||
{
|
||||
icon: Trash2,
|
||||
label: 'Delete',
|
||||
onclick: handleDelete,
|
||||
variant: 'destructive',
|
||||
shortcut: ['shift', 'cmd', 'd'],
|
||||
separator: true
|
||||
<div class="actions pointer-events-auto relative z-20 flex items-center">
|
||||
<DropdownMenuActions
|
||||
triggerIcon={MoreHorizontal}
|
||||
triggerTooltip="More actions"
|
||||
bind:open={dropdownOpen}
|
||||
actions={[
|
||||
{
|
||||
icon: conversation.pinned ? PinOff : Pin,
|
||||
label: conversation.pinned ? 'Unpin' : 'Pin',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
handleTogglePin();
|
||||
}
|
||||
]}
|
||||
/>
|
||||
</div>
|
||||
{/if}
|
||||
</button>
|
||||
},
|
||||
{
|
||||
icon: Pencil,
|
||||
label: 'Edit',
|
||||
onclick: handleEdit,
|
||||
shortcut: ['shift', 'cmd', 'e']
|
||||
},
|
||||
{
|
||||
icon: Download,
|
||||
label: 'Export',
|
||||
onclick: (e: Event) => {
|
||||
e.stopPropagation();
|
||||
conversationsStore.downloadConversation(conversation.id);
|
||||
},
|
||||
shortcut: ['shift', 'cmd', 's']
|
||||
},
|
||||
{
|
||||
icon: Trash2,
|
||||
label: 'Delete',
|
||||
onclick: handleDelete,
|
||||
variant: 'destructive',
|
||||
shortcut: ['shift', 'cmd', 'd'],
|
||||
separator: true
|
||||
}
|
||||
]}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
button {
|
||||
.conversation-item {
|
||||
:global([data-slot='dropdown-menu-trigger']:not([data-state='open'])) {
|
||||
opacity: 0;
|
||||
}
|
||||
@@ -239,7 +213,8 @@
|
||||
}
|
||||
}
|
||||
|
||||
&:is(:hover) .stop-button {
|
||||
&:is(:hover) .stop-button,
|
||||
&:focus-within .stop-button {
|
||||
:global(.stop-icon) {
|
||||
display: block;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user