Compare commits

...

25 Commits

Author SHA1 Message Date
Neo Zhang d1759e4156 [SYCL] Add conv_3d (#24691)
* add conv_3d

* optimize

* update ops.md

* restore test script

* rm unused code

* rm copyright notes
2026-06-17 17:20:01 +03:00
Julien Chaumond 8086439a4c webui: export conversations as jsonl (#24688)
* webui: export conversations as jsonl

each session is one jsonl file, a session header line followed by one line per message
exporting multiple conversations bundles them into a zip, one jsonl file each

* webui: import jsonl and zip conversation exports

parse the new jsonl session format and zip archives on import
keep supporting the legacy json format
2026-06-17 13:25:47 +02:00
Winston Ma 558e221b70 vulkan: record actual memory properties during buffer creation (#24326) 2026-06-17 11:14:48 +02:00
Ruben Ortlam ea21e03955 Revert "cuda: reset cuda context after reading memory size (#23935)" (#24715)
This reverts commit 0f7fada56b.
2026-06-17 10:59:35 +02:00
kononnable d5376cf5d7 ci: fix vulkan docker images (#24595)
* Update vulkan-shaders-gen.cpp

* Update vulkan-shaders-gen.cpp

add comment describing code change intention

* Update vulkan-shaders-gen.cpp

fix potential UB
2026-06-17 09:43:45 +02:00
Harapan Rachman bae36efa30 UI : fix SSE transport detection and routing through CORS proxy. Assi… (#24500)
* UI : fix SSE transport detection and routing through CORS proxy. Assisted-by: Antigravity

* ui : replace magic strings with constants in MCP transport handling
2026-06-17 08:26:30 +02:00
lhez 51571722aa opencl: optimize mul_mat_f16_f32_l4 for decode (#24504) 2026-06-16 23:21:26 -07:00
Max Krasnyansky cda63856b8 common: update logging to enforce max_capacity and optimize queue resizing (#24490)
* common: update logging to enforce max_capacity and optimize queue resizing logic

* common/log: remove queue expansion logic
2026-06-17 09:19:11 +03:00
Zijun Yu 890f1a27ed openvino: OV 2026.2, context-shift, Q5_1 support, gemma4 dense/embedding, and -fa off (#24503)
* Add interface is_model_splitted() to check whether the c-graph is split or not

* Infer and propagate dynamic-dimension indices for all tensors in the GGML graph in api compute_model_outputs()

* Only do this for fallback sub graph

* Move dynamic dims compute in graph mismatch

* ggml-openvino: fix tensor data handling for PERMUTE/VIEW ops in split models

* ggml-openvino:add comments

* ggml-openvino: override VIEW op_case to 0 for split model inputs

* openvino backend: Handle unsupported VIEW shape-mismatch in OpenVINO backend

* Enable additional mul_mat tests and add tensor data saving function (#81)

* ggml-openvino: fix CONT/TRANSPOSE mapping and improve dynamic-dimension handling

* OpenVINO: add NORM/TANH support and rework SOFT_MAX translation

* ggml-openvino: extend VIEW handling

* Enable -fa off (#118)

* Enable --context-shift

* Fix llm param compute error for normal softmax not the softmax in attention

* OpenVINO backend: fix error for attention size compute in llm param

* use tensor->extra in infer_request i/o

* OpenVINO backend: refactor the compute_llm_params() func, add get_attention_pattern_case to make it easy to extend

* OpenVINO backend: clean unused code

* 1to1 match op update (#146)

* added translate_1to1_match_1_input function and updated gelu and tanh translations

* Remove unused translation function calls

---------

Co-authored-by: Mustafa Cavus <mustafacavus@intel.com>

* initial gemma4 support

* removed hardcoded names for kv cache slicing

* OpenVINO backend: Add new attention pattern for llm parameters compute

* flash attn Q shape static conversion

* Remove slice in permute translation when n_seq is 1

* return optional in extract_layer_from_name

* OpenVINO backend: refactor VIEW related operation (#148)

* OpenVINO backend: refactor VIEW related operation

* Enable VIEW handling in following ops

* OpenVINO backend does not support GGML_OP_NORM & GGML_OP_L2_NORM with VIEW input accuracy issue from OpenVINO

* OpenVINO backend: Add ops l2_norm & pad

* OpenVINO backend does not support CPY with non-contiguous data or mismatched types

* add op SSM_CONV GATED_DELTA_NET

* OpenVINO backend: fix error for bf16 in OV gpu plugin

* reverted static Q input shape for attention layer

* OpenVINO backend: remove hardcoded name inp_tokens, which ignores some leaf cases

* Disable remote tensor due to bug in ov gpu

* Disable n_token > 1 GATED_DELTA_NET on gpu

* OpenVINO backend: fix the view op dynamic handling issue in gemma4 & enable view + get_row

* OpenVINO backend: clean code

* OpenVINO backend: enable view + norm/rms_norm

* OpenVINO backend: concat op

* OpenVINO backend: argsort op

* OpenVINO backend: enable unary + view & GGML_UNARY_OP_SOFTPLUS

* Fix issue for test-backend-ops in TOPK_MOE, which compare VIEW ops result, VIEW node in OpenVINO no need compare, the whole graph result is correct

* OpenVINO backend: enable sum_rows

* OpenVINO backend: enable clamp

* OpenVINO backend: enable DIV

* OpenVINO backend: enable GGML_OP_MUL_MAT_ID

* OpenVINO backend: disable MUL_MAT_ID_FUSION case with large mem needed

* OpenVINO backend: Disable GGML_OP_ARGSORT, because test-backend-ops failed

* OpenVINO backend: fix issue in mul_mat_id

* OpenVINO backend: Disable DIV with broadcast on GPU

* OpenVINO backend: update DIV

* use ov internal op GatedDeltaNet

* OpenVINO backend: enable llama arch test for qwen3next

* OpenVINO backend: enable RMS_NORM + VIEW & remove op_case 2 for rope

* OpenVINO backend: fix error

* suggested changes, need review

* suggested changes, need review

* OpenVINO backend: clean unused code & fix build warning

* OpenVINO backend: enable minicpm3 for arch test

* Disable GDN op (#177)

* disable gated_delta_net

* update stateful_kv_size correctly in mismatch case

* OpenVINO backend: enable arch test for qwen3vl

* OpenVINO backend: enable cohere2 for arch test

* OpenVINO backend: enable t5 for arch test

* OpenVINO backend: enable jamba for arch test

* OpenVINO backend: remove warning for tmp

* OpenVINO backend: enable kimi-linear for arch test

* Remove unused

* Fix gpt-oss accuracy issue

* OpenVINO backend: enable arctic for arch test

* OpenVINO backend: enable grok for arch test

* Gemma4 initial npu support (#179)

* Initial gemma4 npu support

* temp. fix for gemma4 accuracy bug on npu

* Remove hardcoded names for npu-fold handling

* revert static n tokens for cont translation as it is not needed

* removed unused variable

* ggml-openvino: add GGML_OPENVINO_ENABLE_CACHE env var to control decoder cache. Add environment variable GGML_OPENVINO_ENABLE_CACHE (default: YES). When set to NO, the decoder_cache is bypassed and models are rebuilt from the cgraph on every inference call in both dynamic and static compute paths. This is useful for debugging and verifying correctness without caching interference.

* Revert "Gemma4 initial npu support (#179)"

This reverts commit 0d29a9c4a52dc2c8aa52990f1a3854cfb01768ad.

* OpenVINO backend: disable debug log print

* Update TBB discovery. Delegated to OpenVINOs own config.

* OpenVINO backend: GGML_OPENVINO_ENABLE_CACHE YES -> 1

* OpenVINO backend: fallback FLASH_ATTN_EXT in gemma3n to CPU backend

* Add raw ov infer profiling metric

* Add OV raw infer time metric to static compute path

Co-authored-by: virajwad <84867530+virajwad@users.noreply.github.com>

* Modify precision of static profiling

* update to OV 2026.2, add OV windows CI

* fix editorconfig-checks

* Initial gemma4 npu support

* temp. fix for gemma4 accuracy bug on npu

* Remove hardcoded names for npu-fold handling

* revert static n tokens for cont translation as it is not needed

* removed unused variable

* test-llama-archs fix

* Fix gemma4 flash_attn fallback

* support im2col

* fix code style

* disable add_rope_sin_cos optimization

* stateless broadcast and rope optimizations

* Enable manual gqa attn by default for stateless gpu

* manual gqa: fixed static batch

* gemma4 llama-bench ctx update fix

* Update OV win CI

* stateful rope fusion temp. fix

* OpenVINO backend: Consolidate supported ops

* Exclude unsupported GGML_OP_SUB cases

* Exclude unsupported TOPK_MOE cases

* OpenVINO Backend: MUL_MAT enhancements

* Update OV CI

* support f16 mask input for npu

* Make GGML_OPENVINO_* env vars usage uniform

Standardize all GGML_OPENVINO_* env flags:
positive integers >0 to enable. Unset, empty, =0, or non-numeric values to disable.
This fixes cases where text values or empty strings enabled features.

* OpenVINO backend: Enhance envvar handling

* more cleanup

* move ggml_openvino_env_flag to appropriate place

* OpenVINO backend: add REPEAT translator, Q5_1 weights, and GLU view-input fix

* ggml-openvino: fix -Werror=cast-qual in extract_q5_1_data

* Update openvino.Dockerfile

Use BuildKit cache mounts for faster Docker rebuilds.
Use apt instead of dpkg, remove unused .ddeb downloads, add DLLAMA_BUILD_TESTS=OFF.

* ggml-openvino: centralize env var access via *getenv_str/getenv_int helpers

Replace getenv and legacy flags with _str and _int helpers. Minor cleanup, doc updates.

* OpenVINO backend: Enable GGML_OP_ADD_ID

* Update openvino backend clang-format

* clang-format

* Update OPENVINO.md (#211)

* OpenVINO backend: fix accuracy issue for op CONCAT with i64 precision

* Remove strict concurrency for gpu-openvino-low-perf

* Update openvino CI keynames; add ccache-clear

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <1629204+CISC@users.noreply.github.com>

* Fix formatting

---------

Co-authored-by: Xuejun Zhai <Xuejun.Zhai@intel.com>
Co-authored-by: Mustafa Cavus <mustafa.cavus@intel.com>
Co-authored-by: Mustafa Cavus <mustafacavus@intel.com>
Co-authored-by: Xuejun <XuejunZhai@intel.com>
Co-authored-by: Wang Yang <yang4.wang@intel.com>
Co-authored-by: Ravi Panchumarthy <ravi.panchumarthy@intel.com>
Co-authored-by: virajwad <84867530+virajwad@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Mostafa Faheem <mostafaaafaheem@gmail.com>
Co-authored-by: Sigbjørn Skjæret <1629204+CISC@users.noreply.github.com>
2026-06-17 09:11:21 +03:00
Neo Zhang 58728bdbf0 sycl : Enable to support fp16 by OPs: SQR, SQRT, LOG, SIN, COS, CLAMP (#24692) 2026-06-17 08:58:03 +03:00
Alexey Kopytko ebbc1e51c1 SYCL: fix use-after-free bug with async memcpy in MoE prefill (#24676)
* SYCL: fix a bug with async memcpy

* make mmid_row_mapping_host persistent

* comment on stream->wait

* Apply suggestion from @sanmai

* Apply suggestion from @sanmai

* Apply suggestion from @sanmai
2026-06-17 08:57:29 +03:00
Francois Dugast 9b260fc9ef sycl: Add optional USM system allocations (#22526)
This introduces an optional feature to allocate large GPU buffers (≥ 1GB)
using USM system allocations if supported by the device. It allows using
buffers from the system allocator then letting the system manage memory
migrations between host and device as necessary.

This feature is disabled by default and requires the GGML_SYCL_USM_SYSTEM
environment variable to enable. If USM system allocations are not supported
by the device or the system, we fall back to regular allocations.

This feature can allow VRAM overcommit. For example, the test below fails
on B580 due to lack of memory for allocation, but it passes when enabling
USM system allocations:

  ./examples/sycl/test.sh -m Qwen3.5-27B-Q3_K_M.gguf -lv 4

Signed-off-by: Francois Dugast <francois.dugast@intel.com>
2026-06-17 08:54:21 +03:00
Alessandro de Oliveira Faria (A.K.A.CABELO) 74ade52741 vendor : update BoringSSL to 0.20260616.0 (#24693) 2026-06-16 20:24:28 +02:00
Pascal c1304d7b28 ui: add source toggle to mermaid and svg blocks (#24652)
* ui: add source toggle to mermaid and svg blocks

Add a toggle button next to copy and preview that switches a rendered
mermaid or svg block to its source code and back. The button is shared by
both block types and the rendered view stays the default.

The source view reuses the code block scroll container and the highlighted
code element captured at transform time, so it matches the app code blocks
without highlighting again.

Make tall diagrams scroll like text code blocks: safe centering keeps the
diagram centered when it fits and falls back to start alignment when it
overflows, so the top stays reachable instead of clipping above.

Keep the block header opaque and layered above the scrolled diagram, and
ignore header clicks in the zoom handler, so a button click never falls
through to the zoom dialog.

* ui: transparent diagram block header, address review from @allozaur
2026-06-16 14:14:22 +02:00
Oliver Simons 02810c7aa8 Fix and restrict NVFP4 edge-cases in llama-graph (#24331)
* Move post-GEMM MUL required for dequant b4 lora and bias add

see https://github.com/ggml-org/llama.cpp/pull/23484 :
1. For lora, I would presume we want fully dequantized values before
   doing the residuals, but this depends on how the LORAs were
generated. Literature tells me LORA happens post-mul but pre-bias add https://github.com/ggml-org/llama.cpp/pull/8332
2. For ModelOPT, bias-add should happen on [fully-dequantized
   values](https://github.com/NVIDIA/Model-Optimizer/blob/b49f9b9e2d747af992d78a3aa7f10efe5a8847e1/modelopt/torch/quantization/backends/nvfp4_gemm.py#L59-L64)

* Restrict build_ffn for NVFP4 to supported combinations
2026-06-16 11:52:38 +02:00
Ruixiang Wang a1824902b5 spec: add backend sampling support for eagle3 (#24655) 2026-06-16 12:05:52 +03:00
Winston Ma 32120c10e3 vulkan: prefer host-visible memory buffers on UMA devices (#22930)
* implement UMA host-visible memory

* update based on 0cc4m's suggestion
2026-06-16 09:36:52 +02:00
Jeff Bolz d5fb104293 vulkan: Support gated_delta_net with S_v=16 (#24581) 2026-06-16 09:26:57 +02:00
Ruixiang Wang 635b65ad7a spec: add spec metrics mean acceptance length and acceptance rate per position (#24536)
* spec: add spec metrics mean acceptance length and acceptance per pos

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestion

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix as suggestions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2026-06-16 10:23:09 +03:00
Adrien Gallouët e3a74b2990 bench : add --offline (#24511)
* bench : add --offline

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* Add default

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-06-16 08:26:05 +02:00
Frosty40 ac79caa7ce sycl: support reordered Q4_K/Q5_K/Q6_K MoE MUL_MAT_ID (#24452)
* sycl: support reordered Q4_K and Q5_K MoE MUL_MAT_ID

Extend reordered-weight handling to fused MoE MUL_MAT_ID for Q4_K and Q5_K expert tensors and add Q5_K reordered DMMV coverage. Unsupported 3D reorder cases now fall back instead of aborting.

* sycl: extend MoE reorder to Q6_K mul_mat_id
2026-06-16 08:35:00 +03:00
Neo Zhang fdd109883d [SYCL] Support OP EXPM1, support all UT cases of FLOOR, TRUNC, ROUND (#24363)
* support OP EXPM1, support all UT cases of FLOOR, TRUNC, ROUND

* fix conflict

* rebase, support new UT case of repeat, concat
2026-06-16 08:34:29 +03:00
Todd Malsbary 4196b477da sycl : Make GGML_SYCL_F16=ON the default (#23996)
* Add -cl-fp32-correctly-rounded-divide-sqrt to F16=ON builds

Signed-off-by: Todd Malsbary <todd.malsbary@intel.com>

* Make GGML_SYCL_F16=ON the default

Signed-off-by: Todd Malsbary <todd.malsbary@intel.com>

* Leave F32 the default

F16 remains explicitly set for example and Dockerfile builds.

Signed-off-by: Todd Malsbary <todd.malsbary@intel.com>

* Revert changes to examples/sycl/build scripts

Signed-off-by: Todd Malsbary <todd.malsbary@intel.com>

---------

Signed-off-by: Todd Malsbary <todd.malsbary@intel.com>
2026-06-16 08:34:02 +03:00
Pascal ad39ccaa19 vulkan: add col2im_1d op (#24425)
* vulkan: add GGML_OP_COL2IM_1D, follow-up to the CPU op

* vulkan: col2im_1d bounded gather loop instead of full-K scan with modulo

* vulkan: col2im_1d address review from @jeffbolznv

* vulkan: col2im_1d return nullptr for unsupported types, address review from @0cc4m
2026-06-16 06:34:43 +02:00
Tarek Dakhran 7dad2f1a17 chat : fix LFM2 tool-call parsing double-escaping (#24667)
* Add escape test cases

* chat : fix LFM2 tool-call parsing double-escaping
2026-06-15 22:10:09 +02:00
111 changed files with 7732 additions and 1643 deletions
+3 -2
View File
@@ -7,7 +7,7 @@ ARG APP_REVISION=N/A
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
ARG GGML_SYCL_F16=ON
ARG LEVEL_ZERO_VERSION=1.28.2
ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
RUN apt-get update && \
@@ -24,7 +24,8 @@ COPY . .
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
&& export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \
+63 -46
View File
@@ -1,17 +1,17 @@
ARG OPENVINO_VERSION_MAJOR=2026.0
ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
ARG OPENVINO_VERSION_MAJOR=2026.2
ARG OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857
ARG UBUNTU_VERSION=24.04
# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
ARG IGC_VERSION=v2.34.4
ARG IGC_VERSION_FULL=2_2.34.4+21428
ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
ARG IGDGMM_VERSION=22.10.0
# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
ARG NPU_DRIVER_VERSION=v1.32.0
ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
ARG NPU_DRIVER_VERSION=v1.33.0
ARG NPU_DRIVER_FULL=v1.33.0.20260529-26625960453
ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
# Optional proxy build arguments
@@ -46,13 +46,18 @@ RUN apt-get update && \
intel-opencl-icd && \
rm -rf /var/lib/apt/lists/*
# Install OpenVINO for Ubuntu 24.04
# OpenVINO toolkit and GPU/NPU drivers are cached via BuildKit cache mounts to avoid re-downloading on rebuilds.
# Install OpenVINO for Ubuntu 24.04.
ARG OPENVINO_VERSION_MAJOR
ARG OPENVINO_VERSION_FULL
RUN mkdir -p /opt/intel && \
wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
RUN --mount=type=cache,target=/var/cache/openvino,sharing=locked \
mkdir -p /opt/intel && \
TGZ=/var/cache/openvino/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
if [ ! -f "$TGZ" ]; then \
wget -O "$TGZ" https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz; \
fi && \
tar -xf "$TGZ" -C /opt/intel/ && \
mv /opt/intel/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
cd - && \
@@ -68,14 +73,14 @@ COPY . .
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_OPENVINO=ON && \
cmake --build build/ReleaseOV -j$(nproc)"
cmake --build build/ReleaseOV --parallel "
# Copy all necessary libraries
# Copy all necessary libraries (build outputs + OpenVINO runtime libs)
RUN mkdir -p /app/lib && \
find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
find build/ReleaseOV -name '*.so*' -exec cp -P {} /app/lib \; && \
find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \;
# Create runtime directories and copy binaries
RUN mkdir -p /app/full \
@@ -120,33 +125,41 @@ ARG IGC_VERSION_FULL
ARG COMPUTE_RUNTIME_VERSION
ARG COMPUTE_RUNTIME_VERSION_FULL
ARG IGDGMM_VERSION
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
&& wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
&& dpkg --install *.deb \
&& rm -rf /tmp/neo/
RUN --mount=type=cache,target=/var/cache/intel-gpu,sharing=locked \
set -eux; \
cd /var/cache/intel-gpu; \
for url in \
https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb ; do \
f=$(basename "$url"); \
[ -f "$f" ] || wget -q -O "$f" "$url"; \
done; \
apt-get update; \
apt-get install -y --no-install-recommends ./*.deb; \
rm -rf /var/lib/apt/lists/*
# Install NPU drivers
ARG NPU_DRIVER_VERSION
ARG NPU_DRIVER_FULL
ARG LIBZE1_VERSION
RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
&& wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
&& dpkg --install *.deb \
&& rm -rf /tmp/npu/
RUN cd /tmp \
&& wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
&& dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
&& rm libze1_${LIBZE1_VERSION}_amd64.deb
RUN --mount=type=cache,target=/var/cache/intel-npu,sharing=locked \
set -eux; \
TGZ=/var/cache/intel-npu/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
if [ ! -f "$TGZ" ]; then \
wget -q -O "$TGZ" https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \
fi; \
DEB=/var/cache/intel-npu/libze1_${LIBZE1_VERSION}_amd64.deb; \
if [ ! -f "$DEB" ]; then \
wget -q -O "$DEB" https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb; \
fi; \
mkdir /tmp/npu/ && cd /tmp/npu/ && tar -xf "$TGZ" && cp "$DEB" .; \
apt-get update; \
apt-get install -y --no-install-recommends ./*.deb; \
rm -rf /tmp/npu/ /var/lib/apt/lists/*
COPY --from=build /app/lib/ /app/
@@ -166,22 +179,26 @@ RUN apt-get update && \
python3 \
python3-venv \
python3-pip && \
python3 -m venv /ov-venv && \
/ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
/ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
python3 -m venv /openvino-venv && \
/openvino-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
/openvino-venv/bin/pip install --no-cache-dir -r requirements.txt && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
# Activate the venv
ENV VIRTUAL_ENV=/openvino-venv \
PATH=/openvino-venv/bin:$PATH
ENTRYPOINT ["/app/tools.sh"]
### Light, CLI only
FROM base AS light
COPY --from=build /app/full/llama-cli /app/
COPY --from=build /app/full/llama-cli /app/full/llama-completion /app/
WORKDIR /app
@@ -0,0 +1,24 @@
name: "Windows - Setup OpenVINO Toolkit"
description: "Setup OpenVINO Toolkit for Windows"
inputs:
path:
description: "Installation path"
required: true
version_major:
description: "OpenVINO major version (e.g., 2026.2)"
required: true
version_full:
description: "OpenVINO full version"
required: true
runs:
using: "composite"
steps:
- name: Download and extract OpenVINO Runtime
shell: powershell
run: |
$url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/windows/openvino_toolkit_windows_${{ inputs.version_full }}_x86_64.zip"
$out = "openvino.zip"
Invoke-WebRequest -Uri $url -OutFile $out
Expand-Archive -Path $out -DestinationPath ${{ inputs.path }} -Force
Remove-Item $out
+30 -2
View File
@@ -68,8 +68,8 @@ jobs:
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
@@ -91,6 +91,34 @@ jobs:
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
windows-2022-openvino-cache:
runs-on: windows-2022
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: Setup Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
windows-2022-rocm-cache:
runs-on: windows-2022
+81 -8
View File
@@ -37,14 +37,10 @@ jobs:
ubuntu-24-openvino:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
@@ -78,7 +74,7 @@ jobs:
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
time cmake --build build/ReleaseOV --config Release -j $(nproc)
time cmake --build build/ReleaseOV --config Release --parallel
- name: Test (CPU)
id: cmake_test_cpu
@@ -93,4 +89,81 @@ jobs:
run: |
cd ${{ github.workspace }}
export GGML_OPENVINO_DEVICE=GPU
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 3000
openvino-windows-2022:
runs-on: windows-2022
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: openvino-windows-2022
variant: ccache
evict-old-files: 1d
save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
- name: Setup Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenCL using vcpkg
shell: powershell
run: |
git clone https://github.com/microsoft/vcpkg C:\vcpkg
C:\vcpkg\bootstrap-vcpkg.bat
C:\vcpkg\vcpkg install opencl
- name: Build
id: cmake_build
shell: cmd
run: |
REM Find extracted OpenVINO folder dynamically
for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
echo ERROR: OpenVINOConfig.cmake not found
exit /b 1
)
call "%OPENVINO_ROOT%\setupvars.bat"
cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
-A x64 ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_OPENVINO=ON ^
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --config Release -- /m
- name: Test (CPU)
id: cmake_test_cpu
shell: cmd
# TODO: fix and re-enable the `test-llama-archs` test below
run: |
REM Find extracted OpenVINO folder dynamically
for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
call "%OPENVINO_ROOT%\setupvars.bat"
cd build
ctest --test-dir ReleaseOV -L main -E "test-llama-archs" -C Release --verbose --timeout 3000
+2 -6
View File
@@ -264,14 +264,10 @@ jobs:
gpu-openvino-low-perf:
runs-on: [self-hosted, Linux, Intel, OpenVINO]
concurrency:
group: openvino-gpu-${{ github.head_ref || github.ref }}
cancel-in-progress: false
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Clone
+107 -3
View File
@@ -443,9 +443,9 @@ jobs:
openvino_version: ${{ steps.openvino_version.outputs.value }}
env:
# Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.0"
OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Set OpenVINO version output
@@ -528,6 +528,108 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
windows-openvino:
runs-on: windows-2022
outputs:
openvino_version: ${{ steps.openvino_version.outputs.value }}
env:
# Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
OPENVINO_VERSION_MAJOR: "2026.2"
OPENVINO_VERSION_FULL: "2026.2.0.21903.52ddc073857"
steps:
- name: Set OpenVINO version output
  id: openvino_version
  # This job runs on a Windows runner, where the default `run` shell is PowerShell.
  # In PowerShell, `$GITHUB_OUTPUT` is an (undefined) PowerShell variable rather than
  # the GITHUB_OUTPUT environment variable, so the step output would never be written.
  # Run the step under bash so the redirect targets the real output file.
  shell: bash
  run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: "24"
cache: "npm"
cache-dependency-path: "tools/ui/package-lock.json"
- name: ccache
uses: ggml-org/ccache-action@v1.2.21
with:
key: release-windows-2022-openvino
variant: ccache
evict-old-files: 1d
- name: Setup Cache
uses: actions/cache@v5
id: cache-openvino
with:
path: ./openvino_toolkit
key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
- name: Setup OpenVINO Toolkit
if: steps.cache-openvino.outputs.cache-hit != 'true'
uses: ./.github/actions/windows-setup-openvino
with:
path: ./openvino_toolkit
version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
version_full: ${{ env.OPENVINO_VERSION_FULL }}
- name: Install OpenCL using vcpkg
shell: powershell
run: |
git clone https://github.com/microsoft/vcpkg C:\vcpkg
C:\vcpkg\bootstrap-vcpkg.bat
C:\vcpkg\vcpkg install opencl
- name: Build
id: cmake_build
shell: cmd
run: |
REM Find extracted OpenVINO folder dynamically
for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i
if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" (
echo ERROR: OpenVINOConfig.cmake not found
exit /b 1
)
call "%OPENVINO_ROOT%\setupvars.bat"
cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^
-A x64 ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_OPENVINO=ON ^
-DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --config Release -- /m
- name: ccache-clear
uses: ./.github/actions/ccache-clear
with:
key: release-windows-2022-openvino
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
id: pack_artifacts
shell: powershell
run: |
Copy-Item LICENSE .\build\ReleaseOV\bin\
7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip .\build\ReleaseOV\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v6
with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
name: llama-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip
windows-cpu:
needs: [check-release]
if: ${{ needs.check-release.outputs.should_release == 'true' }}
@@ -1403,6 +1505,7 @@ jobs:
- windows-cuda
#- windows-sycl
- windows-hip
- windows-openvino
- ubuntu-22-rocm
- ubuntu-cpu
- ubuntu-vulkan
@@ -1524,6 +1627,7 @@ jobs:
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
- [Windows x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ needs.windows-openvino.outputs.openvino_version }}-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
+5 -4
View File
@@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls(
auto arg_name_parser = literal(prop_name);
common_peg_parser arg_value_parser = eps();
auto string_value_parser = choice({
literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
});
// Quoted literal as a value: normalize_quotes_to_json preserves escapes.
auto string_value_parser = tool_arg_value(choice({
literal("\"") + string_content('"') + literal("\""),
literal("'") + string_content('\'') + literal("'")
}));
if (is_string_type) {
arg_value_parser = string_value_parser;
+82 -76
View File
@@ -11,8 +11,13 @@
#include <sstream>
#include <thread>
#include <vector>
#include <algorithm>
#if defined(_WIN32)
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <io.h>
# include <windows.h>
# define isatty _isatty
@@ -62,16 +67,15 @@ static const char* g_col[] = {
};
struct common_log_entry {
enum ggml_log_level level;
bool prefix;
int64_t timestamp;
enum ggml_log_level level {GGML_LOG_LEVEL_INFO};
std::vector<char> msg;
// signals the worker thread to stop
bool is_end;
int64_t timestamp { 0 };
bool is_end { false }; // signals the worker thread to stop
bool prefix { false };
common_log_entry(size_t size = 256) : msg(size) { }
void print(FILE * file = nullptr) const {
FILE * fcur = file;
@@ -122,22 +126,15 @@ struct common_log_entry {
};
struct common_log {
// default capacity - will be expanded if needed
common_log() : common_log(256) {}
common_log(size_t capacity) {
file = nullptr;
prefix = false;
// default capacity
common_log(size_t capacity = 512) {
file = nullptr;
prefix = false;
timestamps = false;
running = false;
t_start = t_us();
// initial message size - will be expanded if longer messages arrive
entries.resize(capacity);
for (auto & entry : entries) {
entry.msg.resize(256);
}
running = false;
t_start = t_us();
queue.resize(capacity, common_log_entry(256));
head = 0;
tail = 0;
@@ -152,9 +149,10 @@ struct common_log {
}
private:
std::mutex mtx;
std::thread thrd;
std::condition_variable cv;
std::mutex mtx;
std::thread thrd;
std::condition_variable cv_new; // new entry
std::condition_variable cv_full; // wait on full
FILE * file;
@@ -164,24 +162,53 @@ private:
int64_t t_start;
// ring buffer of entries
std::vector<common_log_entry> entries;
// queue of entries
std::vector<common_log_entry> queue;
size_t head;
size_t tail;
// worker thread copies into this
common_log_entry cur;
// Print a single queued entry.
// A regular entry is printed once with no explicit FILE (the entry's default
// destination) and, when a log file is configured, a second time into that file.
// Returns true if the entry is the end marker (nothing is printed for it),
// false for a regular entry.
bool print_entry(const common_log_entry & e) const {
    if (!e.is_end) {
        e.print();
        if (file != nullptr) {
            e.print(file);
        }
        return false;
    }
    return true;
}
// Print the queue entries in [start_head, end_tail), wrapping around the end
// of the queue, and stop early once the end marker has been consumed.
//   start_head - index of the first entry to print
//   end_tail   - index one past the last entry to print
//   out_head   - receives the index following the last entry that was processed
// Returns true if the end marker was encountered, false otherwise.
bool flush_queue(size_t start_head, size_t end_tail, size_t & out_head) const {
    size_t idx = start_head;
    while (idx != end_tail) {
        const bool reached_end = print_entry(queue[idx]);
        // advance past the entry that was just handled, even if it was the end marker
        idx = (idx + 1) % queue.size();
        if (reached_end) {
            out_head = idx;
            return true;
        }
    }
    out_head = idx;
    return false;
}
public:
// True when advancing tail by one slot (with wrap-around) would land on head.
bool is_full() const {
    const size_t next_tail = (tail + 1) % queue.size();
    return next_tail == head;
}
// True when head and tail refer to the same slot, i.e. no entries are pending.
bool is_empty() const {
    return tail == head;
}
void add(enum ggml_log_level level, const char * fmt, va_list args) {
std::lock_guard<std::mutex> lock(mtx);
std::unique_lock<std::mutex> lock(mtx);
// block if the queue is full
cv_full.wait(lock, [this]() { return !running || !is_full(); });
if (!running) {
// discard messages while the worker thread is paused
return;
}
auto & entry = entries[tail];
auto & entry = queue[tail];
{
// cannot use args twice, so make a copy in case we need to expand the buffer
@@ -216,38 +243,16 @@ public:
va_end(args_copy);
}
entry.level = level;
entry.prefix = prefix;
entry.is_end = false;
entry.level = level;
entry.prefix = prefix;
entry.timestamp = 0;
if (timestamps) {
entry.timestamp = t_us() - t_start;
}
entry.is_end = false;
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
std::vector<common_log_entry> new_entries(2*entries.size());
size_t new_tail = 0;
do {
new_entries[new_tail] = std::move(entries[head]);
head = (head + 1) % entries.size();
new_tail = (new_tail + 1);
} while (head != tail);
head = 0;
tail = new_tail;
for (size_t i = tail; i < new_entries.size(); i++) {
new_entries[i].msg.resize(256);
}
entries = std::move(new_entries);
}
cv.notify_one();
tail = (tail + 1) % queue.size();
cv_new.notify_one();
}
void resume() {
@@ -261,23 +266,24 @@ public:
thrd = std::thread([this]() {
while (true) {
{
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this]() { return head != tail; });
cur = entries[head];
std::unique_lock<std::mutex> lock(mtx);
cv_new.wait(lock, [this]() { return !is_empty(); });
head = (head + 1) % entries.size();
}
size_t cached_head = head;
size_t cached_tail = tail;
if (cur.is_end) {
lock.unlock(); // drop the lock during flush
size_t next_head;
bool stop = flush_queue(cached_head, cached_tail, next_head);
lock.lock();
head = next_head;
cv_full.notify_all();
if (stop) {
break;
}
cur.print(); // stdout and stderr
if (file) {
cur.print(file);
}
}
});
}
@@ -293,13 +299,13 @@ public:
running = false;
// push an entry to signal the worker thread to stop
{
auto & entry = entries[tail];
entry.is_end = true;
auto & entry = queue[tail];
entry.is_end = true;
tail = (tail + 1) % queue.size();
tail = (tail + 1) % entries.size();
}
cv.notify_one();
// wakeup everyone
cv_new.notify_one();
cv_full.notify_all();
}
thrd.join();
+62 -2
View File
@@ -140,6 +140,8 @@ struct common_speculative_impl {
size_t n_gen_tokens = 0; // number of tokens generated by this implementation.
size_t n_acc_tokens = 0; // number of tokens accepted by the target model.
std::vector<size_t> n_acc_tokens_per_pos; // number of tokens accepted per draft position.
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
@@ -416,6 +418,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
std::vector<common_sampler_ptr> smpls;
// backend sampler chain per seq, attached to ctx_dft
std::vector<llama_sampler *> backend_chains;
int32_t n_embd_dec = 0; // draft hidden size
int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size
int32_t n_embd_tgt = 0; // target model hidden size
@@ -441,7 +446,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
, params(params.draft)
{
LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f, backend_sampling=%d\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min, (int) params.draft.backend_sampling);
auto * ctx_tgt = this->params.ctx_tgt;
auto * ctx_dft = this->params.ctx_dft;
@@ -476,6 +481,22 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
}
// offload draft sampling to the backend
backend_chains.assign(n_seq, nullptr);
if (this->params.backend_sampling) {
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
llama_sampler_free(chain);
chain = nullptr;
}
backend_chains[seq_id] = chain;
}
}
// turn on extraction of the target layers' input embeddings
for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
@@ -494,6 +515,18 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
}
~common_speculative_impl_draft_eagle3() override {
auto * ctx_dft = this->params.ctx_dft;
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
if (backend_chains[seq_id] == nullptr) {
continue;
}
if (ctx_dft) {
llama_set_sampler(ctx_dft, seq_id, nullptr);
}
llama_sampler_free(backend_chains[seq_id]);
}
backend_chains.clear();
if (batch.token != nullptr) {
free(batch.token);
batch.token = nullptr;
@@ -2059,6 +2092,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (impl->n_acc_tokens_per_pos.size() < n_accepted) {
impl->n_acc_tokens_per_pos.resize(n_accepted, 0);
}
for (size_t i = 0; i < n_accepted; ++i) {
impl->n_acc_tokens_per_pos[i]++;
}
if (n_accepted > 0) {
impl->n_acc_drafts++;
impl->n_acc_tokens += n_accepted;
@@ -2093,13 +2135,31 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
std::string str_stats;
if (impl->n_call_accept > 0) {
const double mean =
1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept;
std::ostringstream tmp;
tmp << std::fixed << std::setprecision(3);
for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) {
if (i > 0) {
tmp << ", ";
}
tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept;
}
std::ostringstream oss;
oss << std::fixed << std::setprecision(2) << mean;
str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")";
}
LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
impl->n_gen_drafts,
impl->n_acc_drafts,
impl->n_gen_tokens,
impl->n_acc_tokens,
str_stats.c_str(),
str_perf.c_str());
}
}
+535 -132
View File
@@ -12,6 +12,25 @@ The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a t
- Compiles and caches the model for the target device.
- Binds GGML tensor memory to OpenVINO inference tensors and runs inference.
## Contents
- [Supported Devices](#supported-devices)
- [Supported Model Precisions](#supported-model-precisions)
- [Supported Llama.cpp Tools](#supported-llamacpp-tools)
- [Validated Models](#validated-models)
- [Build Instructions](#build-instructions)
- [0. Prerequisites](#0-prerequisites)
- [1. Install OpenVINO Runtime](#1-install-openvino-runtime)
- [2. Build llama.cpp with OpenVINO Backend](#2-build-llamacpp-with-openvino-backend)
- [Automated Ubuntu Build Script](#automated-ubuntu-build-script)
- [Automated Windows Build Script](#automated-windows-build-script)
- [3. Download Sample Model](#3-download-sample-model)
- [4. Run Inference with OpenVINO Backend](#4-run-inference-with-openvino-backend)
- [5. Docker Build](#5-docker-build)
- [GGML OpenVINO Backend Runtime Configurations](#ggml-openvino-backend-runtime-configurations)
- [Known Limitations](#known-limitations)
- [Work in Progress](#work-in-progress)
## Supported Devices
OpenVINO backend supports the following hardware:
@@ -31,55 +50,102 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin
- `Q4_1`
- `Q4_K`
- `Q4_K_M`
- `Q5_K` (converted to Q8_0_C at runtime)
- `Q6_K` (converted to Q8_0_C at runtime)
- `Q5_K` (converted to `Q8_0_C` at runtime)
- `Q6_K` (converted to `Q8_0_C` at runtime)
> [!NOTE]
> Accuracy validation and performance optimizations for quantized models are a work in progress.
## Quantization Support Details
### CPU and GPU
- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
**CPU and GPU Quantization Details:**
- `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`
### NPU
- **Primary supported quantization scheme is `Q4_0`**
**NPU Quantization Details:**
- Primary supported quantization scheme is `Q4_0`
- `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16
### Additional Notes
**Additional Notes:**
- Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor)
- `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize`
- `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3)
- `Q5_1` tensors are dequantized natively (weights, scales, and zero-points extracted directly)
## Supported Llama.cpp Tools
The OpenVINO backend integrates with the standard llama.cpp tools listed below.
However, tool coverage is not uniform across all devices, and exhaustive validation is a work in progress.
- llama-bench
- llama-cli
- llama-completion
- llama-embedding
- llama-perplexity
- llama-run
- llama-server
- llama-simple
## Validated Models
The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html).
- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- Additional model support, quantization formats and validations are work in progress.
Although the validated models below were tested with `llama-cli` using the `Q4_K_M` quantization format on Intel® Core™ Ultra Series 2 (Lunar Lake), the OpenVINO backend is expected to work across a broader range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), [supported model precisions](#supported-model-precisions), [supported llama.cpp tools](#supported-llamacpp-tools), and additional model architectures.
| Model | Validated | Known Issues |
| :------| :---------- | :-------------|
| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail |
| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) |
| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations |
| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — |
| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — |
| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported |
| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` |
> [!NOTE]
> Extensive accuracy validation, performance optimizations, and broader architecture coverage are work in progress.
**Legend & Test Configuration:**
- **Status:** ✓ = Passed | ✗ = Failed or Unsupported
- **Execution Modes:**
- **SL** = Stateless (`GGML_OPENVINO_STATEFUL_EXECUTION=0`)
- **SF** = Stateful (`GGML_OPENVINO_STATEFUL_EXECUTION=1`)
- Note: The NPU operates in stateless mode only.
- **Validation system:** Intel® Core™ Ultra 5 238V (Lunar Lake) | 32 GB RAM | Ubuntu 24.04 | Intel OpenCL GPU Driver 26.18.38308.1 | Intel NPU Driver 1.33.0.
- See [Known Limitations](#known-limitations) for context on observed failures.
| Model | CPU (SL / SF) | GPU (SL / SF) | NPU (SL) |
| :--- | :---: | :---: | :---: |
| [bartowski/Llama-3.2-1B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| [bartowski/Llama-3.2-3B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| [bartowski/Meta-Llama-3.1-8B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| | | | |
| [Qwen/qwen2.5-1.5b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [Qwen/qwen2.5-coder-7b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [bartowski/Qwen_Qwen3-0.6B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-0.6B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [bartowski/Qwen_Qwen3-1.7B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-1.7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [Qwen/Qwen3-4B-Q4_K_M](https://huggingface.co/Qwen/Qwen3-4B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [lm-kit/Qwen3-8B-Q4_K_M](https://huggingface.co/lm-kit/qwen-3-8b-instruct-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ |
| | | | |
| [unsloth/gemma-3-4b-it-Q4_K_M](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [bartowski/google_gemma-4-E2B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ |
| [bartowski/google_gemma-4-E4B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E4B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ |
| [bartowski/gemma-4-12B-it-Q4_K_M](https://huggingface.co/bartowski/gemma-4-12B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✗ |
| | | | |
| [bartowski/Phi-3-mini-4k-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3-mini-4k-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [bartowski/Phi-3.5-mini-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3.5-mini-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| | | | |
| [bartowski/Mistral-7B-Instruct-v0.3-Q4_K_M](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| [QuantFactory/Ministral-3b-instruct.Q4_K_M](https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| [bartowski/Ministral-8B-Instruct-2410-Q4_K_M](https://huggingface.co/bartowski/Ministral-8B-Instruct-2410-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| | | | |
| [bartowski/DeepSeek-R1-Distill-Llama-8B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| [bartowski/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| | | | |
| [ibm-granite/granite-4.0-350m-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-350m-GGUF) | ✓ / ✓ | ✗ / ✗ | ✓ |
| [ibm-granite/granite-4.0-micro-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-micro-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| [ibm-granite/granite-4.0-1b-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-1b-GGUF) | ✓ / ✓ | ✗ / ✗ | ✗ |
| [ibm-research/granite-3.2-8b-instruct-Q4_K_M](https://huggingface.co/ibm-research/granite-3.2-8b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| | | | |
| [HuggingFaceTB/smollm2-1.7b-instruct-q4_k_m](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ |
| [openbmb/MiniCPM-V-2_6-Q4_K_M](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [bartowski/tencent_Hunyuan-7B-Instruct-Q4_K_M](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-Q4_K_M](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| [bartowski/prism-ml_Bonsai-8B-unpacked-Q4_K_M](https://huggingface.co/bartowski/prism-ml_Bonsai-8B-unpacked-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ |
| | | | |
| [gpustack/bge-m3-Q4_K_M.gguf](https://huggingface.co/gpustack/bge-m3-GGUF) | ✓ | ✗ | ✗ |
## Build Instructions
### Prerequisites
### 0. Prerequisites
- Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2026/get-started/install-openvino/configurations.html).
- **Linux:**
- Git, CMake, and Ninja software tools are needed for building.
@@ -119,28 +185,14 @@ The following models were validated on Intel® Core™ Ultra Series 2. While our
- Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html)
- **Linux:**
<details>
<summary>📦 Click to expand OpenVINO installation from an archive file on Ubuntu</summary>
<br>
```bash
wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
chmod +x install-openvino-from-archive.sh
./install-openvino-from-archive.sh
```
Verify OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
```
</details>
- Verify OpenVINO is initialized properly:
```bash
echo $OpenVINO_DIR
```
### 2. Build llama.cpp with OpenVINO Backend
Clone the OpenVINO-enabled llama.cpp fork and build it:
Clone the llama.cpp repository and build it:
```bash
git clone https://github.com/ggml-org/llama.cpp
@@ -148,39 +200,375 @@ cd llama.cpp
```
- **Linux:**
```bash
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --parallel
```
```bash
source /opt/intel/openvino/setupvars.sh
cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --parallel
```
- **Windows:** Open a **Developer Command Prompt for VS 2022** (so the MSVC toolchain is on `PATH`), then run:
```cmd
C:\Intel\openvino\setupvars.bat
cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --parallel
```
- **Windows:**
```cmd
# x64 Native Tools Command Prompt for VS 2022
"C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat"
cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
cmake --build build\ReleaseOV --parallel
```
> [!NOTE]
> Use `x64 Native Tools Command Prompt` for Windows build. After building, you could use either `cmd` or `PowerShell` to run the OpenVINO backend.
> The Windows install path is `C:\Intel\openvino` (no spaces) to avoid quoting problems some CMake/Ninja toolchains have with `C:\Program Files (x86)\...`. Adjust to wherever you installed OpenVINO Runtime. From `cmd`, run `C:\Intel\openvino\setupvars.bat`; from PowerShell, run `& "C:\Intel\openvino\setupvars.ps1"` instead. Once the build is finished you can launch the binaries from any `cmd` or `PowerShell` window after sourcing the matching `setupvars` script for that shell.
#### Automated Ubuntu Build Script
For Ubuntu 24.04 users, the following shell script automates the prerequisite installs (build tools, OpenCL ICD), the OpenVINO Runtime download/extract/setup, and the Ninja-based llama.cpp build.
Save the following as `ubuntu-llamacpp-ov-install.sh` next to where you want the `llama.cpp` folder to land, then run it:
```bash
chmod +x ubuntu-llamacpp-ov-install.sh
./ubuntu-llamacpp-ov-install.sh
```
<details>
<summary>Click to expand <code>ubuntu-llamacpp-ov-install.sh</code></summary>
```bash
#!/usr/bin/env bash
# ============================================
# llama.cpp OpenVINO Build Script (Ninja)
# ============================================
set -euo pipefail
OPENVINO_VERSION_MAJOR="2026.2"
OPENVINO_VERSION_FULL="2026.2.0.21903.52ddc073857"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}"
OPENVINO_LINK_DIR="/opt/intel/openvino"
OPENVINO_TGZ="${SCRIPT_DIR}/openvino.tgz"
OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz"
echo "============================================"
echo "Installing prerequisites (apt)..."
echo "============================================"
sudo apt-get update
sudo apt-get install -y \
build-essential libcurl4-openssl-dev libtbb12 \
cmake ninja-build python3-pip \
curl wget tar git
echo "============================================"
echo "Installing OpenCL runtime + headers..."
echo "============================================"
sudo apt-get install -y \
ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
cd "${SCRIPT_DIR}"
# ============================================
# Clone llama.cpp if missing
# ============================================
if [[ ! -f "llama.cpp/CMakeLists.txt" ]]; then
echo "Cloning llama.cpp..."
git clone https://github.com/ggml-org/llama.cpp
fi
# ============================================
# Setup OpenVINO: download & extract to /opt/intel/openvino_${OPENVINO_VERSION_MAJOR},
# then point /opt/intel/openvino at it via symlink so the active version is swappable.
# ============================================
if [[ -f "${OPENVINO_INSTALL_DIR}/setupvars.sh" ]]; then
echo "OpenVINO ${OPENVINO_VERSION_MAJOR} already installed at ${OPENVINO_INSTALL_DIR}. Skipping download."
else
echo "OpenVINO not found at ${OPENVINO_INSTALL_DIR}. Starting download..."
curl -L -o "${OPENVINO_TGZ}" "${OPENVINO_URL}"
echo "Extracting OpenVINO to ${OPENVINO_INSTALL_DIR}..."
sudo mkdir -p "${OPENVINO_INSTALL_DIR}"
sudo tar -xzf "${OPENVINO_TGZ}" -C "${OPENVINO_INSTALL_DIR}" --strip-components=1
rm -f "${OPENVINO_TGZ}"
fi
# Refresh symlink: /opt/intel/openvino -> /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}
sudo ln -sfn "${OPENVINO_INSTALL_DIR}" "${OPENVINO_LINK_DIR}"
OPENVINO_ROOT="${OPENVINO_LINK_DIR}"
echo "OpenVINO Ready: ${OPENVINO_ROOT} -> ${OPENVINO_INSTALL_DIR}"
# Install OpenVINO's own runtime dependencies (one-time per system).
if [[ -x "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh" ]]; then
echo "============================================"
echo "Installing OpenVINO runtime dependencies..."
echo "============================================"
echo "Y" | sudo -E "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh"
fi
# ============================================
# Clean old build cache
# ============================================
cd "${SCRIPT_DIR}/llama.cpp"
if [[ -d "build/ReleaseOV" ]]; then
echo "Removing old build directory..."
rm -rf "build/ReleaseOV"
fi
echo "============================================"
echo "Configuring with CMake..."
echo "============================================"
# shellcheck disable=SC1091
source "${OPENVINO_ROOT}/setupvars.sh"
cmake -B build/ReleaseOV -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENVINO=ON
cmake --build build/ReleaseOV --parallel
echo "============================================"
echo "Build completed successfully!"
echo "============================================"
echo "Binaries: $(pwd)/build/ReleaseOV/bin"
echo
echo "NOTE: To run, source setupvars.sh and pick a device:"
echo " source /opt/intel/openvino/setupvars.sh"
echo " export GGML_OPENVINO_DEVICE=CPU # or GPU / NPU"
echo " ./build/ReleaseOV/bin/llama-cli -m model.gguf"
```
> [!NOTE]
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release.
</details>
#### Automated Windows Build Script
For Windows users, the following `.bat` script automates the prerequisite installs (Git, Ninja, CMake, Visual Studio 2022 Build Tools, vcpkg + OpenCL), the OpenVINO Runtime download/extract, and the Ninja-based llama.cpp build.
Save the following as `windows-llamacpp-ov-install.bat` next to where you want the `llama.cpp` folder to land, then run it from either **Command Prompt** or **PowerShell**:
```cmd
:: Command Prompt
windows-llamacpp-ov-install.bat
```
```powershell
# PowerShell
.\windows-llamacpp-ov-install.bat
```
<details>
<summary>Click to expand <code>windows-llamacpp-ov-install.bat</code></summary>
```bat
@echo off
REM Delayed expansion is needed further down, where values assigned inside
REM parenthesized blocks are read back with the exclamation-mark syntax.
setlocal enabledelayedexpansion
REM ============================================
REM llama.cpp OpenVINO Build Script (Ninja)
REM ============================================
REM Pinned OpenVINO release. MAJOR selects the download folder and the install
REM directory name, FULL is the exact build id embedded in the archive file name.
set "OPENVINO_VERSION_MAJOR=2026.2"
set "OPENVINO_VERSION_FULL=2026.2.0.21903.52ddc073857"
REM Directory containing this script. The paths below append file names to it
REM directly, so it is expected to end with a backslash.
set "SCRIPT_DIR=%~dp0"
set "VCPKG_DIR=C:\vcpkg"
REM Versioned install location, plus a stable, version-independent path that is
REM later turned into a junction pointing at it.
set "OPENVINO_INSTALL_DIR=C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%"
set "OPENVINO_LINK_DIR=C:\Intel\openvino"
REM Scratch locations for the downloaded archive and its extraction.
set "OPENVINO_ZIP=%SCRIPT_DIR%openvino.zip"
set "OPENVINO_EXTRACT_TMP=%SCRIPT_DIR%openvino_extract_tmp"
set "OPENVINO_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/%OPENVINO_VERSION_MAJOR%/windows/openvino_toolkit_windows_%OPENVINO_VERSION_FULL%_x86_64.zip"
echo ============================================
echo Installing prerequisites...
echo ============================================
REM Install Git, Ninja and CMake through winget. Error output is discarded and
REM the exit codes are not checked, so a failure here only shows up later.
winget install --id Git.Git -e --accept-source-agreements --accept-package-agreements 2>nul
winget install --id Ninja-build.Ninja -e --accept-source-agreements --accept-package-agreements 2>nul
winget install --id Kitware.CMake -e --accept-source-agreements --accept-package-agreements 2>nul
REM Ensure Visual Studio Build Tools are installed.
echo Checking for Visual Studio Build Tools...
set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
REM Start from an empty value so that only a vswhere hit can define it.
set "VS_INSTALLED="
if exist "%VSWHERE%" (
REM Ask vswhere for the newest installation of any product that includes the
REM VC++ x86/x64 tools component and keep its installation path.
for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath 2^>nul`) do (
set "VS_INSTALLED=%%i"
)
)
if defined VS_INSTALLED (
echo Visual Studio with VC++ x86/x64 tools already present at "!VS_INSTALLED!". Skipping winget install.
) else (
REM No suitable installation found: install the Build Tools with the VC++
REM workload. A failure only prints a warning and the script keeps going.
winget install --id Microsoft.VisualStudio.2022.BuildTools -e --override "--wait --passive --add Microsoft.VisualStudio.Workload.VCTools --includeRecommended" --accept-source-agreements --accept-package-agreements
if errorlevel 1 (
echo WARNING: winget could not install Visual Studio Build Tools automatically.
echo Install manually from https://aka.ms/vs/17/release/vs_BuildTools.exe ^(select the "Desktop development with C++" workload^)
echo and re-run this script from a "Developer Command Prompt for VS 2022".
)
)
echo ============================================
echo Installing OpenCL via vcpkg...
echo ============================================
REM git is required from this point on. A git that winget installed a moment ago
REM is not visible in the PATH of this already-running session, so fail early
REM with an actionable message instead of a chain of confusing errors.
where git >nul 2>nul
if errorlevel 1 (
    echo ERROR: git was not found on PATH.
    echo If it was just installed by winget, open a new terminal so the updated PATH is picked up, then re-run this script.
    exit /b 1
)
REM Clone vcpkg only when no checkout is present yet.
if not exist "%VCPKG_DIR%\bootstrap-vcpkg.bat" (
    git clone https://github.com/microsoft/vcpkg "%VCPKG_DIR%"
    if errorlevel 1 (
        echo ERROR: Failed to clone vcpkg into "%VCPKG_DIR%".
        exit /b 1
    )
)
cd /d "%VCPKG_DIR%"
if errorlevel 1 (
    echo ERROR: Could not enter "%VCPKG_DIR%".
    exit /b 1
)
REM Bootstrap whenever vcpkg.exe is missing. This also repairs a checkout that
REM an interrupted earlier run left behind without a built vcpkg executable.
if not exist "vcpkg.exe" (
    call bootstrap-vcpkg.bat
    if errorlevel 1 (
        echo ERROR: vcpkg bootstrap failed.
        exit /b 1
    )
    call vcpkg integrate install
)
call vcpkg install opencl
if errorlevel 1 (
    echo ERROR: vcpkg could not install the opencl package.
    exit /b 1
)
cd /d "%SCRIPT_DIR%"
REM ============================================
REM Clone llama.cpp if missing
REM ============================================
if not exist "llama.cpp\CMakeLists.txt" (
    echo Cloning llama.cpp...
    git clone https://github.com/ggml-org/llama.cpp
    if errorlevel 1 (
        echo ERROR: Failed to clone llama.cpp.
        exit /b 1
    )
)
cd /d "llama.cpp"
if errorlevel 1 (
    echo ERROR: Could not enter the llama.cpp directory.
    exit /b 1
)
REM From here on SCRIPT_DIR names the llama.cpp checkout. The archive and
REM extraction paths were expanded from the old value earlier and do not change.
set "SCRIPT_DIR=%CD%"
REM ============================================
REM Setup OpenVINO: download & extract to C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%,
REM then point C:\Intel\openvino at it via a directory junction (mklink /J).
REM ============================================
REM The presence of setupvars.bat in the versioned directory is the marker for
REM an already completed install.
if exist "%OPENVINO_INSTALL_DIR%\setupvars.bat" (
echo OpenVINO %OPENVINO_VERSION_MAJOR% already installed at "%OPENVINO_INSTALL_DIR%". Skipping download.
) else (
echo OpenVINO not found at "%OPENVINO_INSTALL_DIR%". Starting download...
REM -L makes curl follow redirects. The archive is written next to the script.
curl -L -o "%OPENVINO_ZIP%" "%OPENVINO_URL%"
if errorlevel 1 (
echo ERROR: Download failed.
exit /b 1
)
echo Extracting OpenVINO...
REM Always extract into a fresh scratch directory so leftovers from an earlier
REM attempt cannot be mistaken for the new content.
REM NOTE: this assumes the tar found on PATH can unpack zip archives.
if exist "%OPENVINO_EXTRACT_TMP%" rmdir /s /q "%OPENVINO_EXTRACT_TMP%"
mkdir "%OPENVINO_EXTRACT_TMP%"
tar -xf "%OPENVINO_ZIP%" -C "%OPENVINO_EXTRACT_TMP%"
if errorlevel 1 (
echo ERROR: Extraction failed.
exit /b 1
)
REM Move the single top-level folder contents into the versioned install dir.
REM NOTE: delayed expansion (!VAR!) is required because the surrounding else( ... )
REM block is parsed once up-front, so %OPENVINO_EXTRACTED% would expand to "" here
REM and xcopy would then treat "\*" as C:\* and fail with "Cannot perform a cyclic copy".
set "OPENVINO_EXTRACTED="
REM If the scratch directory holds several folders, the last one enumerated wins.
for /d %%i in ("%OPENVINO_EXTRACT_TMP%\*") do set "OPENVINO_EXTRACTED=%%i"
if not defined OPENVINO_EXTRACTED (
echo ERROR: Could not locate extracted OpenVINO folder under "%OPENVINO_EXTRACT_TMP%".
exit /b 1
)
if not exist "%OPENVINO_INSTALL_DIR%" mkdir "%OPENVINO_INSTALL_DIR%"
xcopy /e /i /y /q "!OPENVINO_EXTRACTED!\*" "%OPENVINO_INSTALL_DIR%\" >nul
if errorlevel 1 (
echo ERROR: Failed to copy OpenVINO from "!OPENVINO_EXTRACTED!" to "%OPENVINO_INSTALL_DIR%".
echo Re-run this script from an elevated Command Prompt ^(Run as administrator^) if access is denied.
exit /b 1
)
REM The copy succeeded, so the scratch directory and the archive can go.
rmdir /s /q "%OPENVINO_EXTRACT_TMP%"
del "%OPENVINO_ZIP%"
)
REM Refresh junction: C:\Intel\openvino -> C:\Intel\openvino_<version>.
REM `mklink /J` creates a directory junction (no admin / Developer Mode required).
REM rmdir is used without /s here, so a non-empty regular folder at the link
REM path is left in place and the mklink below reports the problem instead.
if exist "%OPENVINO_LINK_DIR%" rmdir "%OPENVINO_LINK_DIR%"
mklink /J "%OPENVINO_LINK_DIR%" "%OPENVINO_INSTALL_DIR%" >nul
if errorlevel 1 (
echo ERROR: Failed to create junction "%OPENVINO_LINK_DIR%" -^> "%OPENVINO_INSTALL_DIR%".
echo If "%OPENVINO_LINK_DIR%" already exists as a regular non-empty folder, remove it manually and re-run.
exit /b 1
)
REM Later steps reach OpenVINO through the junction, not the versioned path.
set "OPENVINO_ROOT=%OPENVINO_LINK_DIR%"
echo OpenVINO Ready: %OPENVINO_ROOT% -^> %OPENVINO_INSTALL_DIR%
echo ============================================
echo Setting up compiler environment...
echo ============================================
REM Locate vcvars64.bat of the newest Visual Studio installation that ships the
REM VC++ x86/x64 tools. The query deliberately matches the one used by the
REM prerequisite check: any product qualifies, not only the Build Tools, so an
REM existing Community, Professional or Enterprise install is picked up as well.
set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"
REM Clear any value inherited from the calling environment so that only a
REM vswhere hit can define it.
set "VS_PATH="
if exist "%VSWHERE%" (
    for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath 2^>nul`) do (
        set "VS_PATH=%%i"
    )
)
REM Treat an installation without vcvars64.bat the same as no installation.
if defined VS_PATH if not exist "%VS_PATH%\VC\Auxiliary\Build\vcvars64.bat" set "VS_PATH="
if defined VS_PATH (
    REM Import the 64-bit MSVC toolchain environment into this session.
    call "%VS_PATH%\VC\Auxiliary\Build\vcvars64.bat" >nul
) else (
    echo WARNING: Visual Studio Build Tools not found. Compiler may be missing.
)
REM ============================================
REM Clean old build cache
REM ============================================
REM Paths are relative to the current directory, which the earlier steps set to
REM the llama.cpp checkout.
if exist "build\ReleaseOV" (
echo Removing old build directory ...
rmdir /s /q "build\ReleaseOV"
)
echo ============================================
echo Configuring with CMake...
echo ============================================
REM Load the OpenVINO environment into this session. All output is discarded
REM and the result is not checked, so a missing setupvars.bat only surfaces as
REM a CMake configure error below.
call "%OPENVINO_ROOT%\setupvars.bat" >nul 2>nul
REM Configure a Release build with the OpenVINO backend using Ninja. The vcpkg
REM toolchain file is passed presumably so that CMake can find the OpenCL
REM package installed through vcpkg earlier.
cmake -B build\ReleaseOV -G Ninja ^
-DCMAKE_BUILD_TYPE=Release ^
-DGGML_OPENVINO=ON ^
-DCMAKE_TOOLCHAIN_FILE="%VCPKG_DIR%\scripts\buildsystems\vcpkg.cmake"
if errorlevel 1 (
echo If you continue to face CMAKE errors, make sure to install:
echo winget install Microsoft.VisualStudio.2022.BuildTools
echo Then run the "Developer Command Prompt for VS 2022" and launch this script from there.
exit /b 1
)
REM Compile, and stop with a non-zero exit code if the build fails.
cmake --build build\ReleaseOV --config Release
if errorlevel 1 exit /b 1
echo ============================================
echo Build completed successfully!
echo ============================================
echo Binaries: %CD%\build\ReleaseOV\bin
echo.
REM Print copy-paste-ready run instructions. The device assignment uses the
REM quoted form of set and stands on its own line: chaining it with a trailing
REM comment would store the value with a trailing space. The setupvars path is
REM taken from the junction variable so it follows the configuration at the top.
echo NOTE: To run, call setupvars.bat and pick a device - CPU, GPU or NPU:
echo   call "%OPENVINO_LINK_DIR%\setupvars.bat"
echo   set "GGML_OPENVINO_DEVICE=CPU"
echo   build\ReleaseOV\bin\llama-cli.exe -m model.gguf
echo.
endlocal
```
> [!NOTE]
> The script pins OpenVINO `2026.2` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**.
</details>
### 3. Download Sample Model
Download models for testing:
Download sample model for testing.
```bash
# Linux
mkdir -p ~/models/
wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
-O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf \
-O ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
# Windows PowerShell
mkdir C:\models
Invoke-WebRequest -Uri https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
Invoke-WebRequest -Uri https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf
# Windows Command Line
mkdir C:\models
curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
curl -L https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf
```
### 4. Run Inference with OpenVINO Backend
@@ -196,65 +584,45 @@ When using the OpenVINO backend, the first inference token may have slightly hig
# Linux
export GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
# Optional: enable stateful execution for improved GPU performance (recommended).
export GGML_OPENVINO_STATEFUL_EXECUTION=1
# To run llama-simple:
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is "
# To run in chat mode:
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024
# To run llama-bench, -fa 1 is needed
GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1
GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -fa 1
# NPU: keep context small to avoid failures from very large model context windows.
export GGML_OPENVINO_DEVICE=NPU
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512
./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 512
# Windows Command Line
set GGML_OPENVINO_DEVICE=GPU
# Enable stateful execution with GPU device to avoid known stateless execution failures.
# Optional: enable stateful execution for improved GPU performance (recommended).
set GGML_OPENVINO_STATEFUL_EXECUTION=1
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
# To run llama-simple
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is "
# To run in chat mode:
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 1024
# To run llama-bench, -fa 1 is needed
build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1
build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -fa 1
# NPU: keep context small to avoid failures from very large model context windows.
# Windows Command Line
set GGML_OPENVINO_DEVICE=NPU
# Windows PowerShell
$env:GGML_OPENVINO_DEVICE = "NPU"
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512
build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 512
```
> [!NOTE]
> On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target a specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.
### Known Issues and Current Workarounds
- GPU stateless execution is currently affected by a known issue.
- Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
- NPU failures can happen when context size is too large. Recent llama.cpp behavior may resolve context size to the model training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`.
- Workaround: explicitly set context size, for ex. `-c 1024` for NPU runs. Performance will be better with lower context size.
- Additional NPU limitations:
- Model caching is not yet supported.
- `llama-server -np > 1` (multiple parallel sequences) is not supported.
- `llama-perplexity` is only supported with `-b 512` or smaller.
- `--context-shift` with `llama-cli` is currently not supported with OpenVINO backend across CPU, GPU, and NPU devices.
- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation.
- `-fa 1` is required when running llama-bench with the OpenVINO backend.
- `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
> [!NOTE]
> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
### Docker Build
### 5. Docker Build
You can build and run llama.cpp with OpenVINO backend using Docker.
@@ -272,7 +640,7 @@ docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfi
docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
# If you are behind a proxy:
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
```
Run llama.cpp with OpenVINO backend Docker container.
@@ -281,19 +649,19 @@ Save sample models in `~/models` as [shown above](#3-download-sample-model). It
```bash
# Run Docker container
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
# With Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
# With Intel NPU access
docker run --rm -it -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
--env=GGML_OPENVINO_DEVICE=NPU \
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
```
Run Llama.cpp Server with OpenVINO Backend.
@@ -301,17 +669,30 @@ Run Llama.cpp Server with OpenVINO Backend.
> `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
```bash
# Run the Server Docker container
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
# Or Using llama-server executable
./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024
# Run the llama-openvino:server Docker container (CPU)
docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024 --host 0.0.0.0
# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
export NO_PROXY=localhost,127.0.0.1
# Run the llama-openvino:server Docker container with Intel GPU access (iGPU or dGPU)
docker run --rm -it -v ~/models:/models \
--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-p 8080:8080 --env=GGML_OPENVINO_DEVICE=GPU \
llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0
# Run the llama-openvino:server Docker container with Intel NPU access
docker run --rm -it -v ~/models:/models \
--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-p 8080:8080 --env=GGML_OPENVINO_DEVICE=NPU \
llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0
# Or Using llama-server executable
./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --port 8080 -c 1024
# Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server.
# Option 2: In a NEW terminal, test the server with curl
# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
export NO_PROXY=localhost,127.0.0.1
# Test health endpoint
curl -f http://localhost:8080/health
@@ -320,24 +701,26 @@ curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: appli
-d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
```
## Runtime Configuration
## GGML OpenVINO Backend Runtime Configurations
The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.
Boolean flags follow a uniform convention: set to a **positive integer** (e.g. `1`) to enable; unset, empty, `0`, negative, or non-numeric values are treated as disabled.
### Configuration Options
| Variable | Default | Description |
|-----------------------------------|------------|-------------------------------------------------------------------------------------------------------------|
| `GGML_OPENVINO_DEVICE` | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
| `GGML_OPENVINO_CACHE_DIR` | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| `256` | Token chunk size for **NPU** prefill. |
| `GGML_OPENVINO_STATEFUL_EXECUTION`| `0` | Enable stateful KV cache on for better performance. Recommended on CPU, GPU. |
| `GGML_OPENVINO_PROFILING` | `0` | Enable execution-time profiling. |
| `GGML_OPENVINO_DUMP_CGRAPH` | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. |
| `GGML_OPENVINO_DUMP_IR` | `0` | Serialize OpenVINO IR files with timestamps. |
| `GGML_OPENVINO_DEBUG_INPUT` | `0` | Enable input debugging and print input tensor info. |
| `GGML_OPENVINO_DEBUG_OUTPUT` | `0` | Enable output debugging and print output tensor info. |
| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | `0` | Print tensor address map once. |
| Variable | Type | Default | Description |
|-----------------------------------|-----------|------------|-------------------------------------------------------------------------------------------------------------|
| `GGML_OPENVINO_DEVICE` | String | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
| `GGML_OPENVINO_CACHE_DIR` | String | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| Integer | `256` | Token chunk size for **NPU** prefill (NPU-only; ignored on CPU/GPU). Must be a positive integer; otherwise the default is used. |
| `GGML_OPENVINO_STATEFUL_EXECUTION`| Boolean | `0` | Enable stateful KV cache for better performance. Recommended on CPU, GPU. |
| `GGML_OPENVINO_DISABLE_CACHE` | Boolean | `0` | Disable the in-process compiled-model / decoder cache (cache is on by default). Set to `1` to disable. |
| `GGML_OPENVINO_DISABLE_KV_SLICE` | Boolean | `0` | Disable the KV-cache input-tensor slicing optimization (slicing is on by default on CPU/GPU). Set to `1` to disable. |
| `GGML_OPENVINO_MANUAL_GQA_ATTN` | Boolean | device-based | Tri-state. When **unset**, manual GQA attention is enabled by default on `GPU` and disabled on other devices. Set to a positive integer to force-enable, or `0` to force-disable. |
| `GGML_OPENVINO_PROFILING` | Boolean | `0` | Enable execution-time profiling. |
| `GGML_OPENVINO_DUMP_CGRAPH` | Boolean | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. |
| `GGML_OPENVINO_DUMP_IR` | Boolean | `0` | Serialize OpenVINO IR files with timestamps. |
| `GGML_OPENVINO_DEBUG_INPUT` | Boolean | `0` | Enable input debugging and print input tensor info. |
| `GGML_OPENVINO_DEBUG_OUTPUT` | Boolean | `0` | Enable output debugging and print output tensor info. |
| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | Boolean | `0` | Print tensor address map once. |
> [!NOTE]
> `GGML_OPENVINO_STATEFUL_EXECUTION` is an **experimental** feature that lets the OpenVINO model manage the KV cache internally, improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support it. It has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications, where enabling it is recommended for the best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported.
@@ -355,7 +738,7 @@ export GGML_OPENVINO_PROFILING=1
export GGML_OPENVINO_DEVICE=GPU
export GGML_OPENVINO_STATEFUL_EXECUTION=1
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is "
# Windows Command Line
set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
@@ -369,19 +752,39 @@ $env:GGML_OPENVINO_PROFILING = "1"
$env:GGML_OPENVINO_DEVICE = "GPU"
$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is "
```
## Llama.cpp Tools
## Known Limitations
The following tools work with the OpenVINO backend on CPU, GPU, NPU:
- llama-bench
- llama-cli
- llama-completion
- llama-perplexity
- llama-server
- llama-simple
**General (all devices)**
- Llama.cpp OpenVINO backend currently supports a subset of GGML ops and text-only models. Unsupported ops or unsupported op shapes/cases fail during OpenVINO translation.
- Multimodal features (audio/image/video) are a work in progress.
- Limited Embedding and Reranking model support.
- Llama.cpp tool coverage across CPU/GPU/NPU is not uniform.
**Tool-specific**
- `llama-bench`: requires `-fa 1` (flash-attention).
- `llama-cli --context-shift`: stateless only (`GGML_OPENVINO_STATEFUL_EXECUTION=0`). In stateful mode the KV cache is owned by the OpenVINO model and cannot be shifted externally.
- `llama-server`: only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1`.
**GPU-specific**
- `llama-server -np > 1`: concurrent requests are batched together, which may slightly reduce per-request throughput.
**NPU-specific**
- Default context resolves to the model's training context (e.g. 131072 for Llama 3.2 1B), which can cause out-of-memory errors, failures, or degraded performance on NPU. Inspect the resolved value with `-lv 3`.
- **Workaround:** Pass an explicit `-c <N>`, e.g. `-c 1024`.
- NPU device uses a static graph with a fixed prefill chunk size (defaults to 256), configurable with `GGML_OPENVINO_PREFILL_CHUNK_SIZE`. Large prefill/batch settings may need tuning.
- `llama-server -np > 1` (multiple parallel sequences) is not supported.
- `llama-perplexity`: requires `-b 512` or smaller.
> [!NOTE]
> The OpenVINO backend is actively under development. Fixes and improvements are underway, and this document will continue to be updated.
## Work in Progress
+13 -10
View File
@@ -253,6 +253,7 @@ When targeting an intel GPU, the user should expect one or more devices among th
#### Intel GPU
```sh
# Uses FP32, consider using FP16 for better performance in most cases
./examples/sycl/build.sh
```
@@ -262,12 +263,12 @@ or
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# Option 2: Use FP16
# Option 1: Use FP16 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
# Option 2: Use FP32
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# build all binary
cmake --build build --config Release -j -v
```
@@ -469,6 +470,7 @@ Choose one of following methods to build from source code.
##### Option 1: Script
```sh
# Uses FP32, consider using FP16 for better performance in most cases
.\examples\sycl\win-build-sycl.bat
```
@@ -479,11 +481,11 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
```
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
# Option 1: Use FP16 (recommended for better performance in most cases)
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
# Option 2: Or FP16
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
# Option 2: Or FP32
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j
```
@@ -491,10 +493,10 @@ cmake --build build --config Release -j
Or, use CMake presets to build:
```sh
cmake --preset x64-windows-sycl-release
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-completion
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-completion
cmake --preset x64-windows-sycl-debug
@@ -718,6 +720,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. |
| GGML_SYCL_USM_SYSTEM | 0 (default) or 1 | Enable experimental support for [USM system allocations](https://github.khronos.org/SYCL_Reference/iface/usm_basic_concept.html#system-allocations) for large GPU buffers. Requires enough host memory for model weights and caches and an Intel Xe2+ GPU (BMG or newer). Supported on Linux only, with CONFIG_DRM_XE_GPUSVM enabled. |
## Compile-time Flags
+11 -11
View File
@@ -23,16 +23,16 @@ Legend:
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | | 🟡 | ✅ | ❌ | ❌ |
| COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | | ❌ | ❌ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@@ -44,10 +44,10 @@ Legend:
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | | ✅ | ✅ | ❌ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | | ✅ | ✅ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | | ✅ | ✅ | ❌ | ❌ |
| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -65,7 +65,7 @@ Legend:
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | | ✅ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
@@ -89,7 +89,7 @@ Legend:
| ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | | ✅ | ✅ | ❌ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -99,13 +99,13 @@ Legend:
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | | 🟡 | ✅ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | | 🟡 | ✅ | ❌ | ❌ |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -118,6 +118,6 @@ Legend:
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
+317 -315
View File
@@ -27,20 +27,20 @@
"SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
@@ -69,20 +69,20 @@
"SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@@ -111,8 +111,8 @@
"SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@@ -153,20 +153,20 @@
"SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","SYCL"
"SYCL0","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","SYCL"
"SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","SYCL"
@@ -4676,264 +4676,264 @@
"SYCL0","CONV_2D_DW","ne_input=[17,34,9,1],ne_kernel=[3,3,1,9],stride=1,padding=0,dilation=1,cwhn=1","support","0","no","SYCL"
"SYCL0","CONV_2D_DW","ne_input=[32,8,64,1],ne_kernel=[3,3,1,64],stride=2,padding=1,dilation=1,cwhn=0","support","0","no","SYCL"
"SYCL0","CONV_2D_DW","ne_input=[32,8,64,1],ne_kernel=[3,3,1,64],stride=2,padding=1,dilation=1,cwhn=1","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL"
"SYCL0","CONV_TRANSPOSE_1D","ne_input=[1,1,1,1],ne_kernel=[1,1,1,1],s0=1,p0=0,d0=1","support","1","yes","SYCL"
"SYCL0","CONV_TRANSPOSE_1D","ne_input=[1,1,1,1],ne_kernel=[1,1,1,1],s0=2,p0=0,d0=1","support","1","yes","SYCL"
"SYCL0","CONV_TRANSPOSE_1D","ne_input=[1,1,1,1],ne_kernel=[1,1,1,1],s0=3,p0=0,d0=1","support","1","yes","SYCL"
@@ -5105,6 +5105,7 @@
"SYCL0","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=bf16,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","SYCL"
@@ -5112,6 +5113,7 @@
"SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL"
"SYCL0","REPEAT","type=bf16,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL"
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","1","yes","SYCL"
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","1","yes","SYCL"
"SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","1","yes","SYCL"
@@ -9741,39 +9743,39 @@
"SYCL0","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=1","support","1","yes","SYCL"
"SYCL0","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=32","support","1","yes","SYCL"
"SYCL0","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=129","support","1","yes","SYCL"
"SYCL0","SQR","type=f16,ne=[10,5,4,3]","support","0","no","SYCL"
"SYCL0","SQRT","type=f16,ne=[10,3,3,2]","support","0","no","SYCL"
"SYCL0","LOG","type=f16,ne=[10,5,4,3]","support","0","no","SYCL"
"SYCL0","SIN","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
"SYCL0","COS","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
"SYCL0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","SYCL"
"SYCL0","SQR","type=f16,ne=[10,5,4,3]","support","1","yes","SYCL"
"SYCL0","SQRT","type=f16,ne=[10,3,3,2]","support","1","yes","SYCL"
"SYCL0","LOG","type=f16,ne=[10,5,4,3]","support","1","yes","SYCL"
"SYCL0","SIN","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
"SYCL0","COS","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
"SYCL0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","SYCL"
"SYCL0","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","SYCL"
"SYCL0","SQR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","SQR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","SQRT","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","LOG","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","LOG","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","SIN","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","SIN","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","COS","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","COS","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","0","no","SYCL"
"SYCL0","CLAMP","type=f16,ne=[1024,1024,1,1],min=-0.500000,max=0.500000","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL"
"SYCL0","SQR","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","SQR","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","SQRT","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","SQRT","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","LOG","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","LOG","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","SIN","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","SIN","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","COS","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","COS","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","SYCL"
"SYCL0","CLAMP","type=f16,ne=[1024,1024,1,1],min=-0.500000,max=0.500000","support","1","yes","SYCL"
"SYCL0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","SYCL"
"SYCL0","LEAKY_RELU","type=f16,ne_a=[1024,1024,1,1],negative_slope=0.100000","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","0","no","SYCL"
"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL"
"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL"
"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL"
"SYCL0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
"SYCL0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","SYCL"
"SYCL0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL"
@@ -11044,8 +11046,8 @@
"SYCL0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","0","no","SYCL"
@@ -11093,8 +11095,8 @@
"SYCL0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","0","no","SYCL"
Can't render this file because it is too large.
+9 -66
View File
@@ -622,18 +622,6 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
// cuda buffer
struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
std::string pci_bus_id;
int op_offload_min_batch_size;
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
std::mutex device_mutex;
int active_count = 0;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
};
struct ggml_backend_cuda_buffer_context {
int device;
void * dev_ptr = nullptr;
@@ -651,13 +639,6 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
dev_ctx->active_count--;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
delete ctx;
}
@@ -810,12 +791,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
dev_ctx->active_count++;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
}
@@ -1515,12 +1490,6 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) {
}
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context;
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
dev_ctx->active_count--;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
CUDA_CHECK(cudaFreeHost(buffer->context));
}
@@ -1529,8 +1498,6 @@ static void * ggml_cuda_host_malloc(size_t size) {
return nullptr;
}
ggml_cuda_set_device(0); // cudaMallocHost can create the implicit CUDA device context, make sure that this is consistently done on device 0.
void * ptr = nullptr;
cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) {
@@ -1556,12 +1523,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context;
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
dev_ctx->active_count++;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
return buffer;
}
@@ -3179,12 +3140,6 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) {
static void ggml_backend_cuda_free(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) backend->device->context;
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
dev_ctx->active_count--;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
delete cuda_ctx;
delete backend;
}
@@ -4916,6 +4871,14 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
// backend device
// Per-device state stored in ggml_backend_dev_t::context for the CUDA backend.
// The device accessors in this file cast dev->context back to this type.
struct ggml_backend_cuda_device_context {
// Device index; passed to ggml_cuda_set_device() before querying the device.
int device;
// Device name; returned as a C string by ggml_backend_cuda_device_get_name().
std::string name;
// Presumably a human-readable device description — TODO confirm against the code that fills it.
std::string description;
// Presumably the PCI bus id string of the device — TODO confirm against the code that fills it.
std::string pci_bus_id;
// NOTE(review): looks like a minimum batch size threshold for op offloading; its use is not visible here.
int op_offload_min_batch_size;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ctx->name.c_str();
@@ -5004,11 +4967,6 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
std::lock_guard<std::mutex> lock(ctx->device_mutex);
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemGetInfo(free, total));
@@ -5035,13 +4993,6 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
}
#endif // defined(__linux__)
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
// If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA
// context that permanently consumes VRAM. Reset the device to free it.
if (ctx->active_count == 0) {
CUDA_CHECK(cudaDeviceReset());
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
}
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
@@ -5745,21 +5696,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
return nullptr;
}
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device);
ggml_backend_t cuda_backend = new ggml_backend {
/* .guid = */ ggml_backend_cuda_guid(),
/* .iface = */ ggml_backend_cuda_interface,
/* .device = */ dev,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
/* .context = */ ctx,
};
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
std::lock_guard<std::mutex> lock(dev_ctx->device_mutex);
dev_ctx->active_count++;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
return cuda_backend;
}
+52 -6
View File
@@ -564,6 +564,9 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_f16_f32_1row;
cl_kernel kernel_mul_mat_f16_f32;
cl_kernel kernel_mul_mat_f16_f32_l4;
cl_kernel kernel_mul_mat_f16_f32_l4_dr;
cl_kernel kernel_mul_mat_f16_f32_l4_dr_ls;
cl_kernel kernel_mul_mat_f16_f32_l4_dr_lq;
cl_kernel kernel_mul_mat_f16_f32_tiled;
cl_kernel kernel_adreno_xmem_pack_src_f32;
cl_kernel kernel_adreno_xmem_prepack_weight_f16;
@@ -1787,6 +1790,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr", &err), err));
if (backend_ctx->gpu_family == ADRENO) {
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_ls", &err), err));
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_lq", &err), err));
}
GGML_LOG_CONT(".");
}
@@ -14570,11 +14578,31 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
}
if (src1t == GGML_TYPE_F32) {
// heuristic for packing more work for Adreno
const bool adreno_use_lane_split =
backend_ctx->gpu_family == ADRENO &&
ne11 == 1 &&
ne01 >= 8 &&
ne00 % 4 == 0 &&
r3 == 1 && r2 >= 1 && r2 <= 8 &&
(ne12 % r2) == 0;
if (ne11 * ne12 < 4) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
} else if (adreno_use_lane_split && ne00 >= 64 && ne00 <= 128) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq;
nrows = 1;
} else if (adreno_use_lane_split && r2 >= 2 && ne00 > 128 && ne00 <= 256) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls;
nrows = 1;
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
nrows = ne11;
if (ne11 == 1) {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr;
nrows = 1; // not used by this kernel
} else {
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
nrows = ne11;
}
} else {
kernel = backend_ctx->kernel_mul_mat_f16_f32;
nrows = 4;
@@ -15353,12 +15381,30 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else {
int64_t ny = (ne11 + nrows - 1)/nrows;
if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr) {
const int NDST_DR = 4;
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, NDST_DR)*nth0, (size_t)nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls) {
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 2)*nth0, (size_t)nth1, (size_t)ne02*ne03};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq) {
size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 4)*nth0, (size_t)nth1, (size_t)ne02*ne03};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
} else {
int64_t ny = (ne11 + nrows - 1)/nrows;
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
}
}
@@ -82,3 +82,299 @@ kernel void kernel_mul_mat_f16_f32_l4(
}
}
}
// Each subgroup produces DR_NDST outputs, assumes ne11 == 1
//
// Matrix-vector product: src0 rows are read as half4, the single src1 row is
// read as float4, and float results are written to dst.
//   src0/offset0, src1/offset1, dst/offsetd : base pointers plus byte offsets
//   ne00            : row length in elements; processed in chunks of 4
//   ne01            : number of src0 rows (one output per row)
//   nb01/nb02/nb03  : src0 byte strides used to locate a row and its batch
//   ne12            : used to split the flattened batch index into i12/i13
//   nb12/nb13       : src1 byte strides for the batch dimensions
//   ne0/ne1         : used to compute the dst batch offset (im*ne1*ne0)
//   r2/r3           : divisors mapping a src1 batch index to a src0 batch index
// The remaining parameters (ne02, nb00, ne10, ne11, nb10, nb11) are not read
// by this kernel.
// Work-group id 0 selects a group of DR_NDST consecutive rows; work-group id 2
// is the flattened batch index.
#define MUL_MAT_F16_F32_L4_DR_NDST 4
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32_l4_dr(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
// Apply the byte offsets to the three buffers.
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
// First output row handled by this work-group.
const int r0_base = get_group_id(0) * MUL_MAT_F16_F32_L4_DR_NDST;
// Flattened batch index, split into the two src1 batch coordinates.
const int im = get_group_id(2);
const int i12 = im % ne12;
const int i13 = im / ne12;
// assume ne11 == 1
// (so no nb11 term is needed to address the src1 row)
const ulong offset_src1 = i12*nb12 + i13*nb13;
global float4 * y4 = (global float4 *)(src1 + offset_src1);
global half4 * x4[MUL_MAT_F16_F32_L4_DR_NDST];
float sumf[MUL_MAT_F16_F32_L4_DR_NDST];
// src0 batch offset: src1 batch coordinates divided by r2/r3.
const ulong k_head_off = (i12/r2)*nb02 + (i13/r3)*nb03;
#pragma unroll
for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
int r0 = r0_base + n;
// Rows past ne01 are redirected to row 0 so the loads below stay inside
// src0; their results are discarded by the r0 < ne01 check at write time.
int r0c = r0 < ne01 ? r0 : 0;
ulong off = (ulong)r0c*nb01 + k_head_off;
x4[n] = (global half4 *)(src0 + off);
sumf[n] = 0.0f;
}
// Number of 4-element chunks per row; a remainder of ne00 % 4 elements, if
// any, is not processed here.
const int n_chunks = ne00 / 4;
// NOTE(review): the stride is the maximum sub-group size, not
// get_sub_group_size(); confirm the host always launches full sub-groups.
const int sg_size = get_max_sub_group_size();
const int lid = get_sub_group_local_id();
// Each lane walks the chunks with a stride of sg_size, reusing the loaded
// src1 chunk for all DR_NDST rows.
for (int i = lid; i < n_chunks; i += sg_size) {
float4 q = y4[i];
#pragma unroll
for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
float4 k = convert_float4(x4[n][i]);
sumf[n] = mad(k.s0, q.s0, sumf[n]);
sumf[n] = mad(k.s1, q.s1, sumf[n]);
sumf[n] = mad(k.s2, q.s2, sumf[n]);
sumf[n] = mad(k.s3, q.s3, sumf[n]);
}
}
// Sum the per-lane partial results across the sub-group; lane 0 stores the
// result for each in-range row.
#pragma unroll
for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) {
float reduced = sub_group_reduce_add(sumf[n]);
int r0 = r0_base + n;
if (lid == 0 && r0 < ne01) {
dst[im*ne1*ne0 + r0] = reduced;
}
}
}
// Kernels for decoding, Adreno only for now
#define MUL_MAT_F16_F32_L4_DR_LS_R2_MAX 8
#ifdef ADRENO_GPU
#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable
#define sub_group_shuffle_xor(val, mask) qcom_sub_group_shuffle_xor((val), (mask), CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.0f)
// Lane-split matrix-vector kernel: the sub-group is split into two halves of
// 32 lanes (lid >> 5), and each half reduces one src0 row. The loaded src0 row
// chunk is reused for up to r2 src1 batches (q = 0..r2-1), with at most
// MUL_MAT_F16_F32_L4_DR_LS_R2_MAX of them.
//   work-group id 0 : pair of consecutive src0 rows (r0_base = id * 2)
//   work-group id 2 : flattened src0 batch index, split with ne02
// The lane arithmetic below assumes a 64-lane sub-group.
// Parameters not read by this kernel: nb00, ne10, ne11, nb10, nb11, r3.
REQD_SUBGROUP_SIZE_64
kernel void kernel_mul_mat_f16_f32_l4_dr_ls(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
// Apply the byte offsets to the three buffers.
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
const int r0_base = get_group_id(0) * 2;
const int kv_grp = get_group_id(2); // KV head group; im = kv_grp*r2 + q
const int i12_kv = kv_grp % ne02;
const int i13_kv = kv_grp / ne02;
const int lid = get_sub_group_local_id();
const int subhalf = lid >> 5; // 0 or 1 (which K row in the WG)
const int intra = lid & 31; // 0..31 (lane within the half)
const int r0 = r0_base + subhalf;
const int r0c = r0 < ne01 ? r0 : 0; // clamp OOB to row 0; skip write below
// K row pointer for this lane (one K row per half-wave).
const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
global half4 * x4 = (global half4 *)(src0 + k_off);
// src1 row pointers for every slot up to R2_MAX. Slots with q >= r2 are only
// computed here; they are never dereferenced (see the q < r2 guards below).
global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
const int i12_q = i12_kv*r2 + q;
const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
y4[q] = (global float4 *)(src1 + q_off);
}
// One accumulator per src1 batch slot.
float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
partial[q] = 0.0f;
}
// Number of 4-element chunks per row; the 32 lanes of a half stride over them.
const int n_chunks = ne00 / 4;
for (int i = intra; i < n_chunks; i += 32) {
// Load the src0 chunk once and reuse it for every active src1 slot.
float4 k = convert_float4(x4[i]);
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
float4 v = y4[q][i];
partial[q] = mad(k.s0, v.s0, partial[q]);
partial[q] = mad(k.s1, v.s1, partial[q]);
partial[q] = mad(k.s2, v.s2, partial[q]);
partial[q] = mad(k.s3, v.s3, partial[q]);
}
}
}
// half-wave reduction
// XOR masks 1..16 only pair lanes whose ids differ in the low 5 bits, so the
// exchange stays within one 32-lane half.
// NOTE(review): relies on the sub_group_shuffle_xor macro (qcom extension)
// behaving as a plain XOR lane exchange — confirm against the extension spec.
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
partial[q] += sub_group_shuffle_xor(partial[q], 1u);
partial[q] += sub_group_shuffle_xor(partial[q], 2u);
partial[q] += sub_group_shuffle_xor(partial[q], 4u);
partial[q] += sub_group_shuffle_xor(partial[q], 8u);
partial[q] += sub_group_shuffle_xor(partial[q], 16u);
}
}
// The first lane of each half (intra == 0) stores the results for its row,
// one value per active src1 slot.
if (intra == 0 && r0 < ne01) {
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
const int im = i12_kv*r2 + q + i13_kv*ne12;
dst[im*ne1*ne0 + r0] = partial[q];
}
}
}
}
// Lane-split matrix-vector kernel, quarter variant: the sub-group is split
// into four quarters of 16 lanes (lid >> 4), and each quarter reduces one src0
// row. The loaded src0 row chunk is reused for up to r2 src1 batches
// (q = 0..r2-1), with at most MUL_MAT_F16_F32_L4_DR_LS_R2_MAX of them.
//   work-group id 0 : group of four consecutive src0 rows (r0_base = id * 4)
//   work-group id 2 : flattened src0 batch index, split with ne02
// The lane arithmetic below assumes a 64-lane sub-group.
// Parameters not read by this kernel: nb00, ne10, ne11, nb10, nb11, r3.
REQD_SUBGROUP_SIZE_64
kernel void kernel_mul_mat_f16_f32_l4_dr_lq(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
// Apply the byte offsets to the three buffers.
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
const int r0_base = get_group_id(0) * 4;
// Flattened src0 batch index and its two coordinates.
const int kv_grp = get_group_id(2);
const int i12_kv = kv_grp % ne02;
const int i13_kv = kv_grp / ne02;
const int lid = get_sub_group_local_id();
const int subq = lid >> 4; // 0..3 (which K row)
const int intra = lid & 15; // 0..15 (lane within quarter)
const int r0 = r0_base + subq;
// Rows past ne01 are redirected to row 0 so loads stay inside src0; the
// r0 < ne01 check before the store discards their results.
const int r0c = r0 < ne01 ? r0 : 0;
const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03;
global half4 * x4 = (global half4 *)(src0 + k_off);
// src1 row pointers for every slot up to R2_MAX. Slots with q >= r2 are only
// computed here; they are never dereferenced (see the q < r2 guards below).
global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
const int i12_q = i12_kv*r2 + q;
const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13;
y4[q] = (global float4 *)(src1 + q_off);
}
// One accumulator per src1 batch slot.
float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX];
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
partial[q] = 0.0f;
}
// Number of 4-element chunks per row; the 16 lanes of a quarter stride over them.
const int n_chunks = ne00 / 4;
for (int i = intra; i < n_chunks; i += 16) {
// Load the src0 chunk once and reuse it for every active src1 slot.
float4 k = convert_float4(x4[i]);
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
float4 v = y4[q][i];
partial[q] = mad(k.s0, v.s0, partial[q]);
partial[q] = mad(k.s1, v.s1, partial[q]);
partial[q] = mad(k.s2, v.s2, partial[q]);
partial[q] = mad(k.s3, v.s3, partial[q]);
}
}
}
// quarter-wave reduction
// XOR masks 1..8 only pair lanes whose ids differ in the low 4 bits, so the
// exchange stays within one 16-lane quarter.
// NOTE(review): relies on the sub_group_shuffle_xor macro (qcom extension)
// behaving as a plain XOR lane exchange — confirm against the extension spec.
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
partial[q] += sub_group_shuffle_xor(partial[q], 1u);
partial[q] += sub_group_shuffle_xor(partial[q], 2u);
partial[q] += sub_group_shuffle_xor(partial[q], 4u);
partial[q] += sub_group_shuffle_xor(partial[q], 8u);
}
}
// The first lane of each quarter (intra == 0) stores the results for its
// row, one value per active src1 slot.
if (intra == 0 && r0 < ne01) {
#pragma unroll
for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) {
if (q < r2) {
const int im = i12_kv*r2 + q + i13_kv*ne12;
dst[im*ne1*ne0 + r0] = partial[q];
}
}
}
}
#endif // ADRENO_GPU
-5
View File
@@ -2,12 +2,7 @@
# Override root .clang-format
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
Cpp11BracedListStyle: true
SpacesInContainerLiterals: false
BreakBeforeBraces: Attach
AccessModifierOffset: -4
IndentCaseBlocks: false
IndentCaseLabels: false
Language: Cpp
AlignAfterOpenBracket: Align
+2 -4
View File
@@ -1,8 +1,6 @@
find_package(OpenVINO REQUIRED)
find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading)
find_package(OpenCL REQUIRED)
include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")
file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp")
file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp")
@@ -11,7 +9,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime openvino::threading OpenCL::OpenCL)
if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
File diff suppressed because it is too large Load Diff
+76 -23
View File
@@ -1,6 +1,7 @@
#pragma once
#include "ggml-quants.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "openvino/decoder.h"
@@ -14,21 +15,21 @@
struct ModelParams {
int ctx = -1;
int ctx_swa = -1;
int ctx_per_seq = -1;
int ctx_per_seq_swa = -1;
int n_seq = 1;
int n_heads = -1;
int n_heads_kv = -1;
int head_size = -1;
int32_t rope_params[15];
bool mixed_rope_params = false;
std::vector<int> swa_layers;
std::vector<std::string> kv_names;
size_t kv_buffer_ctx_id = 0;
bool same_rope_params(const ModelParams & other) const {
return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
return mixed_rope_params == other.mixed_rope_params &&
memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0;
}
bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); }
@@ -56,12 +57,14 @@ public:
std::string node_name;
std::string node_op_type;
std::map<std::string, ggml_tensor *> node_inputs;
std::map<std::string, std::vector<std::pair<std::string, ggml_tensor *>>> node_inputs_views;
std::vector<std::string> node_inputs_names;
ggml_tensor * node_output;
std::string node_output_name;
int node_op_case = 0;
void * data_addr;
};
// Graph decoder
GgmlOvDecoder(ggml_cgraph * cgraph,
ModelParams & model_params,
@@ -69,6 +72,7 @@ public:
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_stateful = false,
bool model_is_splitted = false,
bool is_prefill = false,
int prefill_chunk_size = 256);
@@ -84,6 +88,42 @@ public:
virtual std::vector<size_t> get_input_stride(int node_idx, const std::string & name) const override;
virtual size_t get_view_input_size(int node_idx, const std::string & name) const override;
virtual size_t get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const override;
virtual size_t get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const override;
virtual std::vector<size_t> get_view_input_stride(int node_idx,
const std::string & name,
size_t view_index) const override;
virtual std::vector<size_t> get_view_input_src_stride(int node_idx,
const std::string & name,
size_t view_index) const override;
virtual ov::Shape get_view_input_ggml_shape(int node_idx,
const std::string & name,
size_t view_index) const override;
virtual ov::Shape get_view_input_src_ggml_shape(int node_idx,
const std::string & name,
size_t view_index) const override;
virtual ov::PartialShape get_view_input_ov_shape(int node_idx,
const std::string & name,
size_t view_index) const override;
virtual ov::PartialShape get_view_input_src_ov_shape(int node_idx,
const std::string & name,
size_t view_index) const override;
virtual std::string get_view_input_name(int node_idx, const std::string & name, size_t view_index) const override;
virtual std::string get_view_input_src_name(int node_idx,
const std::string & name,
size_t view_index) const override;
virtual ov::element::Type get_input_type(int node_idx, const std::string & name) const override;
virtual size_t get_input_size() const override;
@@ -106,10 +146,14 @@ public:
virtual ov::element::Type get_output_type(int node_idx) const override;
virtual std::vector<size_t> get_output_stride(int node_idx) const override;
virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
virtual int32_t * get_output_op_params(int node_idx) const override;
virtual size_t get_output_op_offset(int node_idx) const override;
virtual std::vector<std::string> get_output_names(int node_idx) const override;
virtual const std::string & get_op_type() const override;
@@ -120,7 +164,10 @@ public:
virtual const std::string & get_op_name(int node_idx) const override;
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;
virtual int32_t get_op_dynamic_dim(int node_idx) const override;
virtual void visit_subgraph(
std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;
ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
@@ -142,16 +189,12 @@ public:
return m_model_weights;
}
virtual std::vector<std::string> get_model_output_names() const override {
return m_model_output_names;
}
virtual std::vector<std::string> get_model_output_names() const override { return m_model_output_names; }
const std::map<std::string, ggml_tensor *> & get_model_outputs() const { return m_model_outputs; }
virtual int get_ctx_size() const { return m_model_params.ctx; }
virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; }
virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; }
virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; }
@@ -169,13 +212,21 @@ public:
virtual int32_t * get_rope_params() const override { return const_cast<int32_t *>(m_model_params.rope_params); }
virtual bool has_mixed_rope_params() const override { return m_model_params.mixed_rope_params; }
virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
virtual bool is_static() const override { return m_is_static; }
virtual bool is_stateful() const override { return m_is_stateful; }
ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
int get_static_n_tokens() const { return m_is_prefill ? m_prefill_chunk_size : 1; }
virtual bool is_splited_model() const override { return m_model_is_splitted; }
ov::PartialShape get_graph_input_shape(const ggml_tensor * op,
const ggml_tensor * input,
int dynamic_dim_index = -1) const;
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
@@ -205,6 +256,7 @@ public:
bool m_is_prefill = false;
bool m_naive = false;
int m_prefill_chunk_size = 0;
bool m_model_is_splitted = false; // label the cgraph is splited or not
static ov::Shape get_shape(const ggml_tensor * tensor);
static std::vector<size_t> get_stride(const ggml_tensor * tensor);
@@ -227,7 +279,8 @@ public:
}
inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) ||
(op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]);
}
inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
@@ -235,7 +288,8 @@ public:
}
inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
return tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY ||
(op != nullptr && op->op == GGML_OP_SET_ROWS && op->src[2] == tensor);
}
inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
@@ -243,23 +297,18 @@ public:
}
inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE;
return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE &&
op->src[1]->op == GGML_OP_NONE;
}
static std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) {
if (is_inp_tok(tensor, op)) {
return "inp_tokens";
}
std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) {
if (is_inp_pos(tensor, op)) {
return "inp_pos";
}
if (is_inp_emb(tensor, op)) {
return "embd";
}
if (is_output_idx(tensor, op)) {
return "inp_out_ids";
}
if (is_inp_mask(tensor, op)) {
if (is_stateful() && is_inp_mask(tensor, op)) {
return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
}
return tensor->name;
@@ -272,6 +321,9 @@ private:
void compute_model_inputs();
void compute_model_outputs();
// Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
void compute_node_dynamic_dims();
void validate_cgraph() const;
ggml_cgraph * m_cgraph = nullptr;
@@ -284,6 +336,7 @@ private:
std::map<std::string, ggml_tensor *> m_model_outputs;
std::vector<std::string> m_model_output_names;
std::vector<NodeInfo> m_node_info_list;
std::map<ggml_tensor *, int> m_node_dynamic_dims;
ModelParams m_model_params;
ComputeParams m_compute_params;
@@ -291,4 +344,4 @@ private:
void print_tensor_address_map(const ggml_cgraph * cgraph);
int extract_layer_from_name(const std::string & name);
std::optional<int> extract_layer_from_name(const std::string & name);
+57 -3
View File
@@ -3,6 +3,7 @@
#include "ggml-impl.h"
#include "ggml.h"
#include <cstdlib>
#include <cstring>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
@@ -22,7 +23,38 @@ void ggml_openvino_device_config::init() {
if (initialized) {
return;
}
device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
// All recognized GGML_OPENVINO_* env vars. Their values are cached here
// once at backend init time and read back via ggml_openvino_getenv_str()
// (raw string) or ggml_openvino_getenv_int() (integer / boolean toggle).
static constexpr const char * env_var_names[] = {
// String values (use ggml_openvino_getenv_str)
"GGML_OPENVINO_DEVICE",
"GGML_OPENVINO_CACHE_DIR",
// Integer values (use ggml_openvino_getenv_int)
"GGML_OPENVINO_PREFILL_CHUNK_SIZE",
// Boolean toggles (treated as int flags via ggml_openvino_getenv_int)
"GGML_OPENVINO_STATEFUL_EXECUTION",
"GGML_OPENVINO_PROFILING",
"GGML_OPENVINO_DUMP_CGRAPH",
"GGML_OPENVINO_DUMP_IR",
"GGML_OPENVINO_DEBUG_INPUT",
"GGML_OPENVINO_DEBUG_OUTPUT",
"GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS",
"GGML_OPENVINO_ENABLE_CACHE",
"GGML_OPENVINO_DISABLE_CACHE",
"GGML_OPENVINO_DISABLE_KV_SLICE",
"GGML_OPENVINO_MANUAL_GQA_ATTN",
};
for (const char * const & env_var : env_var_names) {
auto * env = getenv(env_var);
if (env) {
environment_variables[env_var] = env;
}
}
device_name = ggml_openvino_getenv_str("GGML_OPENVINO_DEVICE", "CPU");
auto available_devices = ov_singleton_core().get_available_devices();
if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) {
GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str());
@@ -30,7 +62,7 @@ void ggml_openvino_device_config::init() {
}
is_npu = (device_name == "NPU");
auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
const char * cache_dir = ggml_openvino_getenv_str("GGML_OPENVINO_CACHE_DIR");
if (device_name == "NPU") {
compile_config = {
{"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" },
@@ -119,6 +151,23 @@ const std::string & ggml_openvino_get_device_name() {
return ggml_openvino_get_device_config().device_name;
}
// Look up a GGML_OPENVINO_* environment variable in the device config's
// cached map and return its value as a C string. default_value is returned
// instead when the variable is absent from the map or its cached value is
// an empty string.
const char * ggml_openvino_getenv_str(const char * var, const char * default_value) {
    const auto & cached = ggml_openvino_get_device_config().environment_variables;
    const auto   entry  = cached.find(var);
    if (entry == cached.end()) {
        return default_value;
    }
    const std::string & value = entry->second;
    if (value.empty()) {
        return default_value;
    }
    return value.c_str();
}
// Read a GGML_OPENVINO_* environment variable as an int, converting the
// cached string with std::atoi. An unset or empty variable yields
// default_value. Serves both numeric settings (e.g.
// GGML_OPENVINO_PREFILL_CHUNK_SIZE) and on/off switches, where "0" turns
// the option off and any non-zero integer turns it on.
int ggml_openvino_getenv_int(const char * var, int default_value) {
    const char * raw = ggml_openvino_getenv_str(var, nullptr);
    if (raw == nullptr) {
        return default_value;
    }
    return std::atoi(raw);
}
// Check if running on NPU
bool ggml_openvino_is_npu() {
return ggml_openvino_get_device_config().is_npu;
@@ -173,7 +222,8 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
return std::nullopt;
}
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 :
ExtraQuantType::Q8_0_C);
}
if (strncmp(tensor->name, "output.weight", 13) == 0) {
return ExtraQuantType::Q8_0_C;
@@ -298,6 +348,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.is_symmetric = true;
break;
case GGML_TYPE_Q5_1:
// u8 weights (5-bit values), asymmetric (scale + zero point)
break;
case GGML_TYPE_Q6_K:
layout.weights_per_block = 16;
layout.is_symmetric = true;
+29 -8
View File
@@ -64,6 +64,7 @@ struct ggml_openvino_device_config {
bool initialized = false;
std::optional<ov::RemoteContext> remote_context;
ov::AnyMap compile_config;
std::unordered_map<std::string, std::string> environment_variables;
cl_command_queue cl_queue = nullptr;
void init();
@@ -79,6 +80,22 @@ void ggml_openvino_init_device_config();
// Get the device name
const std::string & ggml_openvino_get_device_name();
// Environment variable accessors. All GGML_OPENVINO_* env vars are read once
// during backend init and cached on the device config; consumers must go
// through these helpers (never call ::getenv directly) so behavior stays
// consistent and centralized.
//
// Use ggml_openvino_getenv_str() for string / path values
// (e.g. GGML_OPENVINO_DEVICE, GGML_OPENVINO_CACHE_DIR). The optional
// default_value is returned when the var is unset or empty.
//
// Use ggml_openvino_getenv_int() for boolean toggles and integer settings.
// It returns std::atoi(value) when set, otherwise default_value. For
// boolean use, `if (ggml_openvino_getenv_int(name))` is true iff the value
// is a non-zero integer (so "0" disables, "1" enables).
const char * ggml_openvino_getenv_str(const char * var, const char * default_value = nullptr);
int ggml_openvino_getenv_int(const char * var, int default_value = 0);
// Check if running on NPU
bool ggml_openvino_is_npu();
@@ -115,9 +132,9 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
ov::Tensor weights; // U4 or U8 extracted weights
ov::Tensor scales; // F16 scales
ov::Tensor zp; // U4 or U8 zero points (same type as weights)
ov::Tensor weights; // U4 or U8 extracted weights
ov::Tensor scales; // F16 scales
ov::Tensor zp; // U4 or U8 zero points (same type as weights)
std::shared_ptr<ov::Node> weight_node; // Pre-built OpenVINO weight subgraph
ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
@@ -132,8 +149,9 @@ struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
std::shared_ptr<ov::Tensor> tensor; // For direct use with infer_request
explicit ggml_openvino_tensor_extra(std::shared_ptr<ov::Tensor> t)
: ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {}
explicit ggml_openvino_tensor_extra(std::shared_ptr<ov::Tensor> t) :
ggml_openvino_extra_base(Type::TENSOR),
tensor(std::move(t)) {}
};
// =====================================================
@@ -152,11 +170,11 @@ struct ggml_openvino_extracted_layout {
size_t zp_size = 0; // Size of zero points in bytes (U4 or U8)
bool is_u4; // true for U4 weights, false for U8
int64_t weights_per_block; // weights per scale/zp block
bool is_symmetric; // true for symmetric quantization
bool is_symmetric; // true for symmetric quantization
// Requantization info
bool is_requant = false; // true if this tensor needs requantization
std::optional<ExtraQuantType> requant_type; // target requant type if is_requant
bool is_requant = false; // true if this tensor needs requantization
std::optional<ExtraQuantType> requant_type; // target requant type if is_requant
};
// Calculate the buffer layout for extracted quantized data
@@ -164,6 +182,9 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
// Check if a tensor's buffer uses remote (device) memory (e.g. GPU USM)
bool ggml_openvino_buffer_is_remote(const ggml_tensor * tensor);
// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
+310 -66
View File
@@ -4,13 +4,14 @@
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino/openvino/op_table.h"
#include "ggml-openvino/utils.h"
#include "ggml-quants.h"
#include "ggml.h"
#include <atomic>
#include <cstdlib>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <mutex>
@@ -146,8 +147,7 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer
}
// Reports whether stateful execution was requested through the
// GGML_OPENVINO_STATEFUL_EXECUTION environment variable.
//
// The value is read via ggml_openvino_getenv_int(), the shared accessor for
// GGML_OPENVINO_* settings, instead of calling getenv() here. The previous body
// carried two return statements, the second of which could never execute; only
// the accessor-based check is kept. The toggle is on only when the variable
// holds a non-zero integer (unset or "0" leaves it off).
static bool is_stateful_enabled() {
    return ggml_openvino_getenv_int("GGML_OPENVINO_STATEFUL_EXECUTION") != 0;
}
static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
@@ -367,11 +367,9 @@ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer
ggml_backend_openvino_buffer_context * src_ctx =
(ggml_backend_openvino_buffer_context *) src->buffer->context;
if (src_ctx->is_remote) {
cl_int err =
mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr);
if (err != CL_SUCCESS) {
GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__,
err);
GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__, err);
return false;
}
return true;
@@ -579,6 +577,17 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
return ctx->id;
}
// Returns the is_remote flag of the OpenVINO buffer context that owns `tensor`.
// A null tensor, a tensor without a buffer, or a tensor whose buffer is not an
// OpenVINO buffer yields false.
bool ggml_openvino_buffer_is_remote(const ggml_tensor * tensor) {
    const bool in_openvino_buffer =
        tensor != nullptr && tensor->buffer != nullptr && ggml_backend_buffer_is_openvino(tensor->buffer);
    if (!in_openvino_buffer) {
        return false;
    }
    auto * buffer_ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);
    return buffer_ctx->is_remote;
}
void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
GGML_ASSERT(tensor != nullptr);
GGML_ASSERT(tensor->buffer != nullptr);
@@ -785,6 +794,18 @@ static bool has_view_op_input(const ggml_tensor * op) {
return false;
}
static bool has_non_contiguous_view_input(const ggml_tensor * op) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] == nullptr) {
break;
}
if (op->src[i]->op == GGML_OP_VIEW && !ggml_is_contiguous(op->src[i])) {
return true;
}
}
return false;
}
static bool is_supported_flash_attn_pattern(const ggml_tensor * op) {
// pattern of q,k,v should be q->op==PERMUTE, q->src[0]->op==VIEW, q->src[0]->src[0]->view_src==nullptr
for (int i = 0; i < 3; i++) {
@@ -797,17 +818,107 @@ static bool is_supported_flash_attn_pattern(const ggml_tensor * op) {
return true;
}
static bool is_gemma3n_flash_attn_pattern(const ggml_tensor * op) {
if (!is_supported_flash_attn_pattern(op)) {
return false;
}
const ggml_tensor * q_base =
op->src[0] != nullptr && op->src[0]->src[0] != nullptr ? op->src[0]->src[0]->src[0] : nullptr;
const ggml_tensor * k_base =
op->src[1] != nullptr && op->src[1]->src[0] != nullptr ? op->src[1]->src[0]->src[0] : nullptr;
const ggml_tensor * v_base =
op->src[2] != nullptr && op->src[2]->src[0] != nullptr ? op->src[2]->src[0]->src[0] : nullptr;
if (q_base == nullptr || q_base->op != GGML_OP_ROPE) {
return false;
}
// gemma3n direct attention path (no KV cache): q=ROPE, k=ROPE, v=RMS_NORM
// Only match this specific pattern to avoid falsely catching other models
// (e.g. Gemma4) that also use scale=1.0 with KV-cache backed attention.
const bool is_qkv_direct =
k_base != nullptr && v_base != nullptr && k_base->op == GGML_OP_ROPE && v_base->op == GGML_OP_RMS_NORM;
return is_qkv_direct;
}
// Overflow-checked size_t multiplication.
// On success stores a * b in `out` and returns true. If the product would not
// fit in size_t, returns false and leaves `out` unmodified.
static bool checked_mul_size(size_t a, size_t b, size_t & out) {
    const bool has_zero_factor = (a == 0) || (b == 0);
    // The division is only evaluated when b is known to be non-zero.
    if (!has_zero_factor && a > SIZE_MAX / b) {
        return false;
    }
    out = has_zero_factor ? 0 : a * b;
    return true;
}
static bool mul_mat_id_requires_large_tmp(const ggml_tensor * op) {
const ggml_tensor * as = op->src[0];
const ggml_tensor * ids = op->src[2];
if (as == nullptr || ids == nullptr) {
return true;
}
// The current OpenVINO translation materializes selected expert weights with
// shape [n_tokens, n_used, rows, k]. Skip cases that would create a very
// large temporary on GPU and let the scheduler fall back instead.
size_t tmp_elems = 1;
if (!checked_mul_size(tmp_elems, static_cast<size_t>(ids->ne[1]), tmp_elems) ||
!checked_mul_size(tmp_elems, static_cast<size_t>(ids->ne[0]), tmp_elems) ||
!checked_mul_size(tmp_elems, static_cast<size_t>(as->ne[1]), tmp_elems) ||
!checked_mul_size(tmp_elems, static_cast<size_t>(as->ne[0]), tmp_elems)) {
return true;
}
size_t tmp_bytes = 0;
if (!checked_mul_size(tmp_elems, sizeof(float), tmp_bytes)) {
return true;
}
static constexpr size_t mul_mat_id_tmp_limit = 1ULL << 30; // 1 GiB
return tmp_bytes > mul_mat_id_tmp_limit;
}
static bool is_op_unsupported_case(const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_CONCAT: {
if (op->type == GGML_TYPE_I64) {
return true;
}
break;
}
case GGML_OP_GET_ROWS:
case GGML_OP_SET_ROWS: {
if (op->ne[3] != 1) {
return true;
}
if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
// ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
// ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
return true;
}
// Keep the MoE routing weights gather on CPU for GPU runs. Splitting
// only at the later SUM/CLAMP/DIV nodes still leaves this routing path
// numerically unstable for arctic-style MoE graphs.
if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_RESHAPE: {
if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 ||
strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_ADD:
case GGML_OP_MUL: {
case GGML_OP_MUL:
case GGML_OP_SUB: {
if (op->src[1]->op == GGML_OP_PERMUTE) {
return true;
}
@@ -818,30 +929,79 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
}
break;
}
case GGML_OP_ADD_ID: {
// Keep support aligned with the CPU backend implementation, which only handles f32 inputs/output and i32 ids.
if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32 ||
op->src[2]->type != GGML_TYPE_I32) {
return true;
}
break;
}
case GGML_OP_DIV: {
bool requires_broadcast = false;
for (int i = 0; i < 4; i++) {
if (op->src[0]->ne[i] == op->src[1]->ne[i]) {
continue;
}
if (op->src[0]->ne[i] != 1 && op->src[1]->ne[i] != 1) {
return true;
}
requires_broadcast = true;
}
// The GPU plugin can fuse broadcast DIV into the preceding FFN GEMM path
// and produce infs for per-channel scale vectors. Keep those DIVs on CPU
// until the fused GPU kernel is reliable. (failed case: llama-arch-test mpt)
if (requires_broadcast && ggml_openvino_get_device_name() == "GPU") {
return true;
}
// qwen3next MoE weight normalization is numerically sensitive on the GPU
// path. Keep the normalization divide on CPU to match the reference.
if (strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_SOFT_MAX: {
if (op->src[2] != nullptr) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
return true;
}
float scale = 1.0f;
float max_bias = 0.0f;
const auto * op_params = op->op_params;
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
if (max_bias > 0) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
if (strncmp(op->name, "ffn_moe_probs", sizeof("ffn_moe_probs") - 1) == 0) {
return true;
}
// GPU execution of the MoE routing weights softmax is numerically unstable
// when fused with the surrounding GET_ROWS/reshape path. Keep this softmax
// on CPU so the scheduler splits at the same boundary that restores parity.
if (op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && op->src[0]->src[0] != nullptr &&
strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_SUM_ROWS: {
if (strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) {
return true;
}
// if the input is PERMUTE skip
if (op->src[0]->op == GGML_OP_PERMUTE) {
return true;
}
break;
}
case GGML_OP_CLAMP: {
if (strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) {
return true;
}
break;
}
case GGML_OP_FLASH_ATTN_EXT: {
if (op->src[4] != nullptr) {
// GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
return true;
}
if (!is_supported_flash_attn_pattern(op)) {
return true;
}
float scale = 1.0f;
float max_bias = 0.0f;
float logit_softcap = 0.0f;
@@ -849,6 +1009,21 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float));
// Keep gemma3n flash-attn pattern on CPU for GPU runs to avoid
// accuracy drift in the OpenVINO path. Restrict by scale=1.0 to avoid
// affecting non-gemma3n models such as Llama-3.2.
if (fabsf(scale - 1.0f) < 1e-6f && is_gemma3n_flash_attn_pattern(op)) {
return true;
}
if (op->src[4] != nullptr) {
// GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n");
return true;
}
if (!is_supported_flash_attn_pattern(op)) {
return true;
}
if (max_bias > 0) {
// GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n");
return true;
@@ -868,34 +1043,44 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
break;
}
case GGML_OP_CPY: {
if (op->src[1] != op) {
// GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n");
if (op->src[0]->type == GGML_TYPE_BF16 || op->src[1]->type == GGML_TYPE_BF16) {
// GGML_LOG_WARN("OpenVINO backend does not support CPY with non-contiguous data or bf16 types\n");
return true;
}
// op test case with non-contiguous src or dst
if ((op->ne[0] == 3 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) ||
(op->ne[0] == 1 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) ||
(op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) {
return true;
}
break;
}
case GGML_OP_MUL_MAT: {
if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) {
// Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"`
// GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n");
if (ggml_openvino_get_device_name() == "GPU" && op->src[1]->op == GGML_OP_SOFT_MAX &&
op->src[0]->op == GGML_OP_CONT && op->src[0]->src[0] != nullptr &&
op->src[0]->src[0]->op == GGML_OP_TRANSPOSE && op->src[0]->src[0]->src[0] != nullptr &&
op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) {
return true;
}
if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
return true;
}
if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
return true;
}
if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
// MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
// triggers a bug in ov matmul_shape_inference.hpp
return true;
}
if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) {
return true;
}
break;
}
case GGML_OP_MUL_MAT_ID: {
if (strncmp(op->name, "ffn_moe_gate_up", sizeof("ffn_moe_gate_up") - 1) == 0 ||
strncmp(op->name, "ffn_moe_down", sizeof("ffn_moe_down") - 1) == 0) {
return true;
}
if (mul_mat_id_requires_large_tmp(op)) {
return true;
}
break;
}
case GGML_OP_ROPE: {
const int32_t * op_params = op->op_params;
const int n_dims = op_params[1];
@@ -909,7 +1094,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// op->src[0]->ne[0]);
return true;
}
if (op->type != GGML_TYPE_F32) {
if (op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
return true;
}
@@ -930,15 +1115,54 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
}
break;
}
default:
break;
}
if (op->op == GGML_OP_GET_ROWS) {
if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
// ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
// ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
case GGML_OP_TRANSPOSE: {
// if the type is bf16, will return true
if (op->type == GGML_TYPE_BF16) {
// GGML_LOG_WARN("OpenVINO backend does not support CONT with BF16 type\n");
return true;
}
break;
}
case GGML_OP_GATED_DELTA_NET: {
// enable after https://github.com/openvinotoolkit/openvino/pull/35917 is included in OV release
return true;
// if (ggml_openvino_get_device_name() == "GPU" && op->src[0]->ne[2] > 1) {
// // CVS-186471
// return true;
// }
if (op->src[2]->op == GGML_OP_PERMUTE) {
return true;
}
// kda (per-key-dimension gating) not supported by fused GatedDeltaNet op
if (op->src[3]->ne[0] != 1) {
return true;
}
// v_repeat > 1 (GQA): ggml uses modulo head mapping (h_q = h_v % H_k)
// but the fused op uses consecutive mapping (h_q = h_v / group_size)
if (op->src[2]->ne[1] != op->src[0]->ne[1]) {
return true;
}
// K > 1 (multiple state snapshots) not supported by fused op
if (op->src[5]->ne[1] > 1) {
return true;
}
break;
}
case GGML_OP_SSM_CONV: {
// qwen3next is numerically unstable with OpenVINO SSM_CONV.
// Keep this op on CPU until the OpenVINO implementation is fixed.
return true;
}
case GGML_OP_VIEW: {
// Skip TOPK_MOE fused tests until it is fully supported
// the argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe
if (strcmp(op->name, "selected_experts") == 0) {
return true;
}
break;
}
default:
break;
}
return false;
}
@@ -946,24 +1170,47 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
GGML_ASSERT(dev->reg != nullptr);
static std::set<ggml_type> supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64,
GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
static std::unordered_set<ggml_type> supported_types{
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32, GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};
static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
/*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
// softmax is not updated due to replaced by flash_attn_ext
// GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_SILU,
};
static const std::set<ggml_glu_op> supported_glu_ops{
GGML_GLU_OP_SWIGLU,
GGML_GLU_OP_GEGLU,
// derive supported op sets from the op_table map, keys in
// the map use the full macro name (e.g. "GGML_OP_ADD"), while
// the ggml_*_op_name() helpers return only the trailing part (e.g. "ADD").
// each set is built once and cached.
static const auto build_supported_sets = [] {
const auto & table = ov::frontend::ggml::get_supported_ops();
std::unordered_set<ggml_op> ops;
std::unordered_set<ggml_unary_op> unary_ops;
std::unordered_set<ggml_glu_op> glu_ops;
// GGML_OP_NONE has no translator but is always safe to add to the supported set.
ops.insert(GGML_OP_NONE);
for (int i = 0; i < GGML_OP_COUNT; ++i) {
const std::string key = std::string("GGML_OP_") + ggml_op_name(static_cast<ggml_op>(i));
if (table.count(key)) {
ops.insert(static_cast<ggml_op>(i));
}
}
for (int i = 0; i < GGML_UNARY_OP_COUNT; ++i) {
const std::string key = std::string("GGML_UNARY_OP_") + ggml_unary_op_name(static_cast<ggml_unary_op>(i));
if (table.count(key)) {
unary_ops.insert(static_cast<ggml_unary_op>(i));
}
}
for (int i = 0; i < GGML_GLU_OP_COUNT; ++i) {
const std::string key = std::string("GGML_GLU_OP_") + ggml_glu_op_name(static_cast<ggml_glu_op>(i));
if (table.count(key)) {
glu_ops.insert(static_cast<ggml_glu_op>(i));
}
}
return std::make_tuple(ops, unary_ops, glu_ops);
};
static const auto supported_sets = build_supported_sets();
static const auto & supported_ops = std::get<0>(supported_sets);
static const auto & supported_unary_ops = std::get<1>(supported_sets);
static const auto & supported_glu_ops = std::get<2>(supported_sets);
switch (op->op) {
case GGML_OP_UNARY: {
@@ -972,11 +1219,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
// GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op)));
return false;
}
if (has_view_op_input(op)) {
// GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n",
// ggml_unary_op_name(ggml_get_unary_op(op)));
return false;
}
break;
}
case GGML_OP_GLU: {
@@ -1003,13 +1245,15 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
return false;
}
static std::set<ggml_op> ops_not_support_view_input{
GGML_OP_GET_ROWS,
GGML_OP_RMS_NORM,
GGML_OP_L2_NORM,
};
if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_op_input(op)) {
// GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op));
return false;
}
if (op->op == GGML_OP_RMS_NORM && has_non_contiguous_view_input(op)) {
return false;
}
}
}
+66
View File
@@ -126,6 +126,68 @@ void extract_q4_1_data(const ggml_tensor * tensor,
}
}
// Extracts (weight, scales, zp) from Q5_1 tensors.
// Data layout is: |16 bit scale|16 bit min|32 bit qh (5th bits)|32 x 4bit low nibbles|.
// Reconstructed quant q in [0,31]: q = (low nibble) | (qh_bit << 4). Dequant: w*d + m.
// Weights are stored as u8 (5-bit values do not fit u4), matching make_int8_weights.
void extract_q5_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
bool use_bias) {
const uint64_t bytes_per_block = 24; // 2 scale + 2 min + 4 qh + 16 (32x0.5) weights
const int qk = 32;
auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data()); // u8 weights, one byte per weight
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
// Read a 16-bit little-endian value without aliasing/const-qual violations.
auto read_u16 = [](const uint8_t * p) {
uint16_t v;
memcpy(&v, p, sizeof(v));
return v;
};
auto unpack_block = [&](const uint8_t * block, uint8_t * dst) {
uint32_t qh;
memcpy(&qh, block + 4, sizeof(uint32_t));
const uint8_t * qs = block + 8;
for (int j = 0; j < qk / 2; ++j) {
const uint8_t lo = qs[j] & 0x0F;
const uint8_t hi = qs[j] >> 4;
const uint8_t bit_lo = (qh >> j) & 1;
const uint8_t bit_hi = (qh >> (j + qk / 2)) & 1;
dst[j] = lo | (bit_lo << 4); // first 16 weights
dst[j + qk / 2] = hi | (bit_hi << 4); // last 16 weights
}
};
if (use_bias) {
// Store bias (min) directly as f16: dequant w*d + m
auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
const uint8_t * block = data + i * bytes_per_block;
float scale = static_cast<float>(ov::float16::from_bits(read_u16(block)));
float min = static_cast<float>(ov::float16::from_bits(read_u16(block + 2)));
scales[i] = ov::float16(scale);
bias[i] = ov::float16(min);
unpack_block(block, weights + i * qk);
});
} else {
auto * zp = static_cast<uint8_t *>(zp_arr.data()); // u8 zero points
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
const uint8_t * block = data + i * bytes_per_block;
float scale = static_cast<float>(ov::float16::from_bits(read_u16(block)));
float min = static_cast<float>(ov::float16::from_bits(read_u16(block + 2)));
scales[i] = ov::float16(scale);
// zp = -min / scale (dequant: (w - zp) * s == w*s + min)
zp[i] = (scale != 0.0f) ? (uint8_t) std::lround(-min / scale) : 0;
unpack_block(block, weights + i * qk);
});
}
}
// Extracts (weight, scales, zp) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
// When zp_arr is empty (symmetric), weights are stored as signed i8 directly.
@@ -577,6 +639,7 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q5_K:
is_u4 = false;
weights_per_block = 32;
@@ -601,6 +664,9 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
case GGML_TYPE_Q4_K:
extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
break;
case GGML_TYPE_Q5_1:
extract_q5_1_data(&temp_tensor, weights, scales, zp, use_bias);
break;
case GGML_TYPE_Q8_0:
extract_q8_0_data(&temp_tensor, weights, scales, zp);
break;
+10 -4
View File
@@ -6,7 +6,7 @@
#include <openvino/op/constant.hpp>
#include <openvino/runtime/tensor.hpp>
void unpack_32_4(const uint8_t* data, uint8_t* dst);
void unpack_32_4(const uint8_t * data, uint8_t * dst);
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
@@ -19,12 +19,18 @@ void extract_q4_1_data(const ggml_tensor * tensor,
ov::Tensor & zp_arr,
bool use_bias = false);
void extract_q5_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
bool use_bias = false);
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);
void unpack_256_4(const uint8_t* data, uint8_t* dst);
void unpack_256_4(const uint8_t * data, uint8_t * dst);
void extract_q4_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
@@ -145,8 +151,8 @@ namespace ov {
namespace op {
namespace util {
// From <openvino>/src/common/transformations/include/transformations/utils/utils.hpp
bool get_single_value(const std::shared_ptr<ov::op::v0::Constant>& const_node,
float& value,
bool get_single_value(const std::shared_ptr<ov::op::v0::Constant> & const_node,
float & value,
bool check_value_range = true);
} // namespace util
} // namespace op
+56 -16
View File
@@ -3,6 +3,8 @@
#include <cstdint>
#include <map>
#include <openvino/core/node.hpp>
#include <openvino/core/partial_shape.hpp>
#include <openvino/core/shape.hpp>
#include <openvino/frontend/decoder.hpp>
#include <string>
@@ -12,22 +14,50 @@ namespace ggml {
class GgmlDecoder : public DecoderBase {
public:
virtual ov::Any get_attribute(const std::string& name) const = 0;
virtual ov::Any get_attribute(const std::string & name) const = 0;
virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0;
virtual PartialShape get_input_shape(int node_idx, const std::string & name) const = 0;
virtual std::vector<size_t> get_input_stride(int node_idx, const std::string& name) const = 0;
virtual std::vector<size_t> get_input_stride(int node_idx, const std::string & name) const = 0;
virtual element::Type get_input_type(int node_idx, const std::string& name) const = 0;
virtual size_t get_view_input_size(int node_idx, const std::string & name) const = 0;
virtual size_t get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const = 0;
virtual size_t get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const = 0;
virtual std::vector<size_t> get_view_input_stride(int node_idx,
const std::string & name,
size_t view_index) const = 0;
virtual std::vector<size_t> get_view_input_src_stride(int node_idx,
const std::string & name,
size_t view_index) const = 0;
virtual Shape get_view_input_ggml_shape(int node_idx, const std::string & name, size_t view_index) const = 0;
virtual Shape get_view_input_src_ggml_shape(int node_idx, const std::string & name, size_t view_index) const = 0;
virtual PartialShape get_view_input_ov_shape(int node_idx, const std::string & name, size_t view_index) const = 0;
virtual PartialShape get_view_input_src_ov_shape(int node_idx,
const std::string & name,
size_t view_index) const = 0;
virtual std::string get_view_input_name(int node_idx, const std::string & name, size_t view_index) const = 0;
virtual std::string get_view_input_src_name(int node_idx, const std::string & name, size_t view_index) const = 0;
virtual element::Type get_input_type(int node_idx, const std::string & name) const = 0;
virtual size_t get_input_size() const = 0;
virtual size_t get_input_size(int node_idx) const = 0;
virtual void get_input_node(size_t input_port_idx,
std::string& producer_name,
std::string& producer_output_port_name,
size_t& producer_output_port_index) const = 0;
std::string & producer_name,
std::string & producer_output_port_name,
size_t & producer_output_port_index) const = 0;
virtual std::vector<std::string> get_input_names(int node_idx) const = 0;
@@ -35,30 +65,36 @@ public:
virtual element::Type get_output_type(const int node_idx) const = 0;
virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0;
virtual std::vector<size_t> get_output_stride(int node_idx) const = 0;
virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const = 0;
virtual int32_t * get_output_op_params(int node_idx) const = 0;
virtual size_t get_output_op_offset(int node_idx) const = 0;
virtual std::vector<std::string> get_output_names(int node_idx) const = 0;
virtual const std::string& get_op_type() const = 0;
virtual const std::string & get_op_type() const = 0;
virtual const std::string& get_op_type(int node_idx) const = 0;
virtual const std::string & get_op_type(int node_idx) const = 0;
virtual const std::string& get_op_name() const = 0;
virtual const std::string & get_op_name() const = 0;
virtual const std::string& get_op_name(int node_idx) const = 0;
virtual const std::string & get_op_name(int node_idx) const = 0;
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const = 0;
virtual int get_op_case(int node_idx) const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_inputs() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_extra_inputs() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>> & get_model_weights() const = 0;
virtual std::vector<std::string> get_model_output_names() const = 0;
virtual int32_t* get_rope_params() const = 0;
virtual int32_t * get_rope_params() const = 0;
virtual bool has_mixed_rope_params() const = 0;
virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
@@ -66,7 +102,11 @@ public:
virtual bool is_stateful() const = 0;
virtual bool is_splited_model() const = 0;
virtual int is_swa_layer(int layer) const = 0;
virtual int32_t get_op_dynamic_dim(int node_idx) const = 0;
};
} // namespace ggml
+1 -1
View File
@@ -15,7 +15,7 @@ public:
using Ptr = std::shared_ptr<FrontEnd>;
FrontEnd();
static std::shared_ptr<Model> convert(const InputModel::Ptr& model, bool naive = false);
static std::shared_ptr<Model> convert(const InputModel::Ptr & model, bool naive = false);
};
} // namespace ggml
@@ -1,9 +1,9 @@
#pragma once
#include <openvino/frontend/input_model.hpp>
#include "decoder.h"
#include <openvino/frontend/input_model.hpp>
namespace ov {
namespace frontend {
namespace ggml {
@@ -16,9 +16,9 @@ class InputModel : public ov::frontend::InputModel {
friend class ::ov::frontend::ggml::FrontEnd;
public:
explicit InputModel(const std::shared_ptr<GgmlDecoder>& gdecoder);
explicit InputModel(const std::shared_ptr<GgmlDecoder> & gdecoder);
const std::shared_ptr<GgmlDecoder>& get_model_decoder() const;
const std::shared_ptr<GgmlDecoder> & get_model_decoder() const;
private:
std::shared_ptr<GgmlDecoder> m_decoder;
+95 -38
View File
@@ -1,11 +1,11 @@
#pragma once
#include "decoder.h"
#include <cstdint>
#include <openvino/frontend/node_context.hpp>
#include <string>
#include "decoder.h"
namespace ov {
namespace frontend {
namespace ggml {
@@ -16,28 +16,24 @@ typedef std::map<std::string, Output<Node>> TensorMap;
class NodeContext : public frontend::NodeContext {
public:
NodeContext(const std::shared_ptr<GgmlDecoder>& decoder,
std::shared_ptr<TensorMap>& tensor_map,
NodeContext(const std::shared_ptr<GgmlDecoder> & decoder,
std::shared_ptr<TensorMap> & tensor_map,
int node_idx,
TranslateSession* translate_session = nullptr)
: ov::frontend::NodeContext(decoder->get_op_type(node_idx)),
m_decoder(decoder),
m_tensor_map(tensor_map),
m_node_idx(node_idx),
m_translate_session(translate_session) {
TranslateSession * translate_session = nullptr) :
ov::frontend::NodeContext(decoder->get_op_type(node_idx)),
m_decoder(decoder),
m_tensor_map(tensor_map),
m_node_idx(node_idx),
m_translate_session(translate_session) {
m_input_names = decoder->get_input_names(m_node_idx);
m_output_names = decoder->get_output_names(m_node_idx);
}
TranslateSession* get_translate_session() const {
return m_translate_session;
}
TranslateSession * get_translate_session() const { return m_translate_session; }
const std::vector<std::string>& get_input_names() const { return m_input_names; }
const std::vector<std::string> & get_input_names() const { return m_input_names; }
size_t get_input_size() const override {
return m_decoder->get_input_size(m_node_idx);
}
size_t get_input_size() const override { return m_decoder->get_input_size(m_node_idx); }
ov::element::Type get_input_type(size_t index) const {
return m_decoder->get_input_type(m_node_idx, m_input_names[index]);
@@ -55,42 +51,103 @@ public:
PartialShape get_output_shape() const { return m_decoder->get_output_shape(m_node_idx); }
int32_t* get_input_op_params(size_t index) const {
int32_t * get_input_op_params(size_t index) const {
return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]);
}
int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
ov::element::Type get_output_type() const {
return m_decoder->get_output_type(m_node_idx);
size_t get_view_input_size(size_t index) const {
return m_decoder->get_view_input_size(m_node_idx, m_input_names[index]);
}
size_t get_view_input_offset(size_t index, size_t view_index) const {
return m_decoder->get_view_input_offset(m_node_idx, m_input_names[index], view_index);
}
size_t get_view_input_src_offset(size_t index, size_t view_index) const {
return m_decoder->get_view_input_src_offset(m_node_idx, m_input_names[index], view_index);
}
std::vector<size_t> get_view_input_stride(size_t index, size_t view_index) const {
return m_decoder->get_view_input_stride(m_node_idx, m_input_names[index], view_index);
}
std::vector<size_t> get_view_input_src_stride(size_t index, size_t view_index) const {
return m_decoder->get_view_input_src_stride(m_node_idx, m_input_names[index], view_index);
}
ov::Shape get_view_input_ggml_shape(size_t index, size_t view_index) const {
return m_decoder->get_view_input_ggml_shape(m_node_idx, m_input_names[index], view_index);
}
ov::Shape get_view_input_src_ggml_shape(size_t index, size_t view_index) const {
return m_decoder->get_view_input_src_ggml_shape(m_node_idx, m_input_names[index], view_index);
}
ov::PartialShape get_view_input_ov_shape(size_t index, size_t view_index) const {
return m_decoder->get_view_input_ov_shape(m_node_idx, m_input_names[index], view_index);
}
ov::PartialShape get_view_input_src_ov_shape(size_t index, size_t view_index) const {
return m_decoder->get_view_input_src_ov_shape(m_node_idx, m_input_names[index], view_index);
}
std::string get_view_input_name(size_t index, size_t view_index) const {
return m_decoder->get_view_input_name(m_node_idx, m_input_names[index], view_index);
}
std::string get_view_input_src_name(size_t index, size_t view_index) const {
return m_decoder->get_view_input_src_name(m_node_idx, m_input_names[index], view_index);
}
int32_t get_op_dynamic_dim() const { return m_decoder->get_op_dynamic_dim(m_node_idx); }
int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }
size_t get_output_op_offset() const { return m_decoder->get_output_op_offset(m_node_idx); }
ov::element::Type get_output_type() const { return m_decoder->get_output_type(m_node_idx); }
std::vector<size_t> get_output_stride() const { return m_decoder->get_output_stride(m_node_idx); }
Output<Node> get_input(int idx) const override {
// Check if this input is a VIEW
size_t view_input_size = m_decoder->get_view_input_size(m_node_idx, m_input_names[idx]);
if (view_input_size > 0) {
// This is a VIEW input, get the base tensor name (last element in the chain)
std::string base_name =
m_decoder->get_view_input_src_name(m_node_idx, m_input_names[idx], view_input_size - 1);
// Check if the VIEW has been resolved (translate_view produced a Slice)
auto view_it = m_tensor_map->find(m_input_names[idx]);
if (!base_name.empty() && view_it != m_tensor_map->end()) {
auto base_it = m_tensor_map->find(base_name);
if (base_it != m_tensor_map->end() &&
view_it->second.get_node_shared_ptr() != base_it->second.get_node_shared_ptr()) {
return view_it->second;
}
return base_it->second;
}
if (!base_name.empty()) {
return m_tensor_map->at(base_name);
}
}
// Not a VIEW or failed to get base name, use the original logic
return m_tensor_map->at(m_input_names[idx]);
}
Output<Node> get_input(const std::string& name) const override {
Output<Node> get_input(const std::string & name) const override {
if (m_tensor_map->find(name) == m_tensor_map->end()) {
throw std::runtime_error("'" + name + "' not found in tensor map.");
}
return m_tensor_map->at(name);
}
bool has_input(const std::string& name) const {
return m_tensor_map->find(name) != m_tensor_map->end();
}
bool has_input(const std::string & name) const { return m_tensor_map->find(name) != m_tensor_map->end(); }
const std::string& get_name() const override {
return m_decoder->get_op_name(m_node_idx);
}
const std::string & get_name() const override { return m_decoder->get_op_name(m_node_idx); }
ov::Any get_attribute_as_any(const std::string& name) const override {
return m_decoder->get_attribute(name);
}
ov::Any get_attribute_as_any(const std::string & name) const override { return m_decoder->get_attribute(name); }
int get_op_case() const {
return m_decoder->get_op_case(m_node_idx);
}
int get_op_case() const { return m_decoder->get_op_case(m_node_idx); }
bool is_static() const { return m_decoder->is_static(); }
@@ -98,14 +155,14 @@ public:
private:
std::shared_ptr<GgmlDecoder> m_decoder;
std::shared_ptr<TensorMap>& m_tensor_map;
std::shared_ptr<TensorMap> & m_tensor_map;
int m_node_idx;
TranslateSession* m_translate_session;
TranslateSession * m_translate_session;
std::vector<std::string> m_input_names;
std::vector<std::string> m_output_names;
};
using CreatorFunction = std::function<ov::OutputVector(const ov::frontend::ggml::NodeContext&)>;
using CreatorFunction = std::function<ov::OutputVector(const ov::frontend::ggml::NodeContext &)>;
} // namespace ggml
} // namespace frontend
@@ -0,0 +1,62 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML ADD_ID: for every (token, used-expert) slot, picks the bias
// row selected by `ids` and adds it to `input`.
//
// Shapes in OpenVINO order (reversed GGML dimensions):
//   input: [1, n_token, n_used,   n_embd]
//   bias:  [1, 1,       n_expert, n_embd]
//   ids:   [1, 1,       n_token,  n_used]
OutputVector translate_add_id(const NodeContext & context) {
    num_inputs_check(context, 3, 3);

    // Inserts a Convert only when the element type actually differs.
    const auto cast_to = [](const ov::Output<ov::Node> & value,
                            const ov::element::Type &    wanted) -> ov::Output<ov::Node> {
        if (value.get_element_type() == wanted) {
            return value;
        }
        return std::make_shared<ov::op::v0::Convert>(value, wanted);
    };

    auto input = process_view_input_new(context, 0);
    auto bias  = process_view_input_new(context, 1);
    auto ids   = process_view_input_new(context, 2);

    // Drop the two leading unit dimensions so that bias/ids become 2D tensors.
    auto bias_dims = std::make_shared<ov::op::v3::ShapeOf>(bias, ov::element::i64);
    auto ids_dims  = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
    bias = std::make_shared<ov::op::v1::Reshape>(bias, get_dimensions(bias_dims, {2, 3}), false);
    ids  = std::make_shared<ov::op::v1::Reshape>(ids, get_dimensions(ids_dims, {2, 3}), false);

    // Indices that are already i32 or i64 are kept as they are; anything else becomes i32.
    const auto ids_type         = ids.get_element_type();
    const bool ids_type_is_i32_or_i64 = ids_type == ov::element::i32 || ids_type == ov::element::i64;
    if (!ids_type_is_i32_or_i64) {
        ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
    }

    // Gather the bias rows along axis 0, then reshape the result to the shape of `input`.
    auto row_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
    ov::Output<ov::Node> picked = std::make_shared<ov::op::v8::Gather>(bias, ids, row_axis);
    auto input_dims = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
    picked = std::make_shared<ov::op::v1::Reshape>(picked, input_dims, false);
    picked = cast_to(picked, input.get_element_type());

    ov::Output<ov::Node> sum = std::make_shared<ov::op::v1::Add>(input, picked);
    sum = cast_to(sum, context.get_output_type());

    return rename_outputs_with_suffix({sum}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,47 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include "ggml.h"
#include <openvino/frontend/exception.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/topk.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML_OP_ARGSORT into an OpenVINO TopK taken over the whole of axis 3.
//
// The sort order is read from the first output op param:
//   GGML_SORT_ORDER_ASC  -> TopK mode MIN
//   GGML_SORT_ORDER_DESC -> TopK mode MAX
// Any other value is rejected as an unsupported conversion.
//
// Returns the TopK indices output (output 1); the sorted values are discarded.
OutputVector translate_argsort(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto input = process_view_input_new(context, 0);

    // Guard the op params pointer before dereferencing it, matching the other
    // translators that read output op params.
    const int32_t * op_params = context.get_output_op_params();
    FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "ARGSORT requires output op params");
    const int32_t order = op_params[0];

    // Validate first so that `mode` is always initialised from a known order.
    FRONT_END_OP_CONVERSION_CHECK(order == GGML_SORT_ORDER_ASC || order == GGML_SORT_ORDER_DESC,
                                  "Unsupported GGML_OP_ARGSORT order: ", order);
    const ov::op::v11::TopK::Mode mode =
        order == GGML_SORT_ORDER_ASC ? ov::op::v11::TopK::Mode::MIN : ov::op::v11::TopK::Mode::MAX;

    // k is the extent of axis 3, squeezed from a 1-element tensor down to a scalar,
    // so every element along that axis is ranked.
    auto k = std::make_shared<ov::op::v0::Squeeze>(get_dimensions(input.get_node_shared_ptr(), {3}),
                                                   ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));

    auto topk = std::make_shared<ov::op::v11::TopK>(input, k, 3, mode, ov::op::v11::TopK::SortType::SORT_VALUES,
                                                    context.get_output_type(), false);

    return rename_outputs_with_suffix({topk->output(1)}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,33 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <cstring>
#include <openvino/op/clamp.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML CLAMP into an OpenVINO Clamp node.
//
// The lower and upper bounds are stored as two consecutive floats in the
// output op params; they are read with memcpy from the int32_t storage.
OutputVector translate_clamp(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto input = process_view_input_new(context, 0);

    const int32_t * op_params = context.get_output_op_params();
    FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "CLAMP requires output op params");

    // bounds[0] = min, bounds[1] = max: the first two float-sized slots of op_params.
    float bounds[2];
    std::memcpy(&bounds[0], reinterpret_cast<const float *>(op_params), sizeof(float));
    std::memcpy(&bounds[1], reinterpret_cast<const float *>(op_params) + 1, sizeof(float));

    auto clamped = std::make_shared<ov::op::v0::Clamp>(input, bounds[0], bounds[1]);
    return rename_outputs_with_suffix({clamped}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,48 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <memory>
#include <openvino/frontend/exception.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/convert.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML CONCAT of two inputs into an OpenVINO Concat.
//
// The GGML concat dimension comes from the first output op param. It is mapped
// to an OpenVINO axis as (rank - 1 - ggml_dim). Both operands are converted to
// the output element type before concatenation.
OutputVector translate_concat(const NodeContext & context) {
    num_inputs_check(context, 2, 2);

    const int32_t * op_params = context.get_output_op_params();
    FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "CONCAT requires output op params");

    const auto out_shape = context.get_output_shape();
    FRONT_END_CHECK_IMPLEMENTED(out_shape.rank().is_static(), "CONCAT requires static output rank");

    const auto    rank     = out_shape.rank().get_length();
    const int32_t ggml_dim = op_params[0];
    FRONT_END_CHECK_IMPLEMENTED(!(ggml_dim < 0 || ggml_dim >= rank), "CONCAT axis is out of range");

    // Resolve both operands first, then bring each one to the output element type.
    OutputVector operands{process_view_input_new(context, 0), process_view_input_new(context, 1)};

    const auto output_type = context.get_output_type();
    for (auto & operand : operands) {
        if (operand.get_element_type() != output_type) {
            operand = std::make_shared<ov::op::v0::Convert>(operand, output_type);
        }
    }

    const auto axis   = static_cast<int64_t>(rank - 1 - ggml_dim);
    auto       joined = std::make_shared<ov::op::v0::Concat>(operands, axis);

    return rename_outputs_with_suffix({joined}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
+8 -16
View File
@@ -18,27 +18,19 @@ namespace op {
OutputVector translate_cont(const NodeContext & context) {
num_inputs_check(context, 1, 1);
int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");
auto src_shape = context.get_input_shape(0).to_shape();
auto dst_shape = context.get_output_shape().to_shape();
ov::Output<Node> res;
if (op_case == 1) {
// The input comes from a PERMUTE
throw std::runtime_error("Code of this case might be outdated");
dst_shape[1] = -1;
res = std::make_shared<ov::op::v1::Reshape>(
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
} else if (op_case == 2) {
// The input comes from a TRANSPOSE
return {context.get_input(0)};
} else {
// The input comes from a VIEW
res = process_view_input(context, 0);
if (context.get_op_dynamic_dim() != -1) {
dst_shape[3 - context.get_op_dynamic_dim()] = -1;
}
auto input = process_view_input_new(context, 0);
ov::Output<Node> res;
res = std::make_shared<ov::op::v1::Reshape>(
input, ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
return rename_outputs_with_suffix({res}, context.get_name());
}
+14 -1
View File
@@ -3,7 +3,9 @@
#include "../utils.h"
#include <memory>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/reshape.hpp>
namespace ov {
namespace frontend {
@@ -11,7 +13,18 @@ namespace ggml {
namespace op {
OutputVector translate_cpy(const NodeContext & context) {
auto res = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_output_type());
auto input = process_view_input_new(context, 0);
auto input_shape = context.get_input_shape(0);
auto output_shape = context.get_output_shape();
// Non-cast CPY may need a reshape (e.g. [3,192,1,1] -> [576,1,1,1])
if (input_shape != output_shape) {
auto new_shape = ov::op::v0::Constant::create(
ov::element::i64, {static_cast<size_t>(output_shape.rank().get_length())}, output_shape.to_shape());
input = std::make_shared<ov::op::v1::Reshape>(input, new_shape, false);
}
auto res = std::make_shared<ov::op::v0::Convert>(input, context.get_output_type());
return rename_outputs_with_suffix({res}, context.get_name());
}
+146
View File
@@ -0,0 +1,146 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include "ggml.h"
#include <memory>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sigmoid.hpp>
#include <openvino/op/tile.hpp>
#include <openvino/op/util/precision_sensitive_attribute.hpp>
#include <vector>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
namespace {
// Returns true when the division has the form (x * sigmoid(x)) / x, i.e.:
//   - the node has exactly two inputs,
//   - the first op param of input 0 equals GGML_UNARY_OP_SILU,
//   - `numerator` is a Multiply whose operands are, in either order, the
//     denominator node itself and a Sigmoid applied to that same node.
bool is_silu_div_pattern(const ov::Output<ov::Node> & numerator,
                         const ov::Output<ov::Node> & denominator,
                         const NodeContext &          context) {
    if (context.get_input_size() != 2) {
        return false;
    }

    const auto * unary_op = reinterpret_cast<const ggml_unary_op *>(context.get_input_op_params(0));
    const bool   is_silu  = unary_op != nullptr && *unary_op == GGML_UNARY_OP_SILU;
    if (!is_silu) {
        return false;
    }

    const auto product = std::dynamic_pointer_cast<ov::op::v1::Multiply>(numerator.get_node_shared_ptr());
    if (product == nullptr) {
        return false;
    }

    const auto x = denominator.get_node_shared_ptr();

    // True when `node` is a Sigmoid whose input is the denominator node.
    const auto is_sigmoid_of_x = [&x](const std::shared_ptr<ov::Node> & node) {
        const auto sig = std::dynamic_pointer_cast<ov::op::v0::Sigmoid>(node);
        return sig != nullptr && sig->input_value(0).get_node_shared_ptr() == x;
    };

    const auto lhs = product->input_value(0).get_node_shared_ptr();
    const auto rhs = product->input_value(1).get_node_shared_ptr();

    // Accept both operand orders: x * sigmoid(x) and sigmoid(x) * x.
    if (lhs == x && is_sigmoid_of_x(rhs)) {
        return true;
    }
    return rhs == x && is_sigmoid_of_x(lhs);
}
// Tiles `input` so that its shape matches the shape of the node's input 0.
//
// context:     node context used to query the declared input shapes
// input:       tensor to be repeated
// target:      tensor whose runtime shape is used when ranks are not static
// input_index: index of `input` among the node inputs (for its declared shape)
//
// Returns `input` unchanged when the declared shapes already agree or when
// every per-axis repeat factor is 1; otherwise returns a Tile node.
ov::Output<ov::Node> repeat_input_to_match(const NodeContext &          context,
                                           const ov::Output<ov::Node> & input,
                                           const ov::Output<ov::Node> & target,
                                           size_t                       input_index) {
    const auto src_shape = context.get_input_shape(input_index);
    const auto dst_shape = context.get_input_shape(0);
    if (src_shape == dst_shape) {
        return input;
    }

    // Without static ranks, compute the repeat factors in-graph from the runtime shapes.
    if (src_shape.rank().is_dynamic() || dst_shape.rank().is_dynamic()) {
        auto src_dims = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
        auto dst_dims = std::make_shared<ov::op::v3::ShapeOf>(target, ov::element::i64);
        auto factors  = std::make_shared<ov::op::v1::Divide>(dst_dims, src_dims);
        return std::make_shared<ov::op::v0::Tile>(input, factors);
    }

    // Static ranks: derive the per-axis repeat factors at conversion time.
    const auto           n_axes = static_cast<size_t>(src_shape.rank().get_length());
    std::vector<int64_t> factors(n_axes, 1);
    bool                 any_repeat = false;
    for (size_t axis = 0; axis < n_axes; ++axis) {
        FRONT_END_OP_CONVERSION_CHECK(src_shape[axis].is_static() && dst_shape[axis].is_static(),
                                      "DIV repeat requires static dimensions on both inputs");
        const int64_t src_dim = src_shape[axis].get_length();
        const int64_t dst_dim = dst_shape[axis].get_length();
        // The target extent must be a positive whole multiple of the input extent.
        FRONT_END_OP_CONVERSION_CHECK(src_dim > 0 && dst_dim > 0 && dst_dim % src_dim == 0,
                                      "DIV input shape ", src_shape, " cannot repeat to match ", dst_shape);
        factors[axis] = dst_dim / src_dim;
        if (factors[axis] != 1) {
            any_repeat = true;
        }
    }

    if (!any_repeat) {
        return input;
    }

    auto factors_node = ov::op::v0::Constant::create(ov::element::i64, {factors.size()}, factors);
    return std::make_shared<ov::op::v0::Tile>(input, factors_node);
}
} // namespace
// Translates GGML DIV into an OpenVINO Divide.
//
// Special case: when the operands match the silu(x) / x pattern, the whole
// expression is replaced by sigmoid(x). Otherwise the second operand is tiled
// to the shape of the first, and the division runs in f32 unless both
// operands and the output are already f32.
OutputVector translate_div(const NodeContext & context) {
    num_inputs_check(context, 2, 2);

    auto numerator   = process_view_input_new(context, 0);
    auto denominator = process_view_input_new(context, 1);

    if (is_silu_div_pattern(numerator, denominator, context)) {
        ov::Output<ov::Node> gate = std::make_shared<ov::op::v0::Sigmoid>(denominator);
        if (gate.get_element_type() != context.get_output_type()) {
            gate = std::make_shared<ov::op::v0::Convert>(gate, context.get_output_type());
        }
        return rename_outputs_with_suffix({gate}, context.get_name());
    }

    denominator = repeat_input_to_match(context, denominator, numerator, 1);

    const auto output_type = context.get_output_type();
    const bool everything_is_f32 = numerator.get_element_type() == ov::element::f32 &&
                                   denominator.get_element_type() == ov::element::f32 &&
                                   output_type == ov::element::f32;
    const bool use_f32_compute = !everything_is_f32;

    if (use_f32_compute) {
        numerator   = std::make_shared<ov::op::v0::Convert>(numerator, ov::element::f32);
        denominator = std::make_shared<ov::op::v0::Convert>(denominator, ov::element::f32);
    }

    auto divide = std::make_shared<ov::op::v1::Divide>(numerator, denominator);
    if (use_f32_compute) {
        // Keep the reciprocal/divide path in FP32. Without this hint, the GPU
        // plugin can still compress the subgraph back to FP16 and overflow on
        // small shexp gate values (e.g. silu(x) / x in qwen2moe).
        ov::mark_as_precision_sensitive(divide->input(0));
        ov::mark_as_precision_sensitive(divide->input(1));
    }

    ov::Output<ov::Node> res = divide;
    if (res.get_element_type() != output_type) {
        auto to_output_type = std::make_shared<ov::op::v0::Convert>(res, output_type);
        if (use_f32_compute) {
            ov::mark_as_precision_sensitive(to_output_type->input(0));
        }
        res = to_output_type;
    }

    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -1,15 +1,21 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include "ggml-openvino/ggml-openvino-extra.h"
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/scaled_dot_product_attention.hpp>
#include <openvino/op/softmax.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <string>
@@ -34,36 +40,115 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});
ov::Output<ov::Node> mask_sliced, res;
ov::Output<ov::Node> res;
// For stateful
std::string mask_name = "KQ_mask_sliced";
if (context.get_input_names()[3].find("swa") != std::string::npos) {
mask_name = "KQ_mask_swa_sliced";
}
if (context.has_input(mask_name)) {
mask_sliced = context.get_input(mask_name);
} else {
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto token_len = get_dimensions(q, {2});
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, two);
mask = context.get_input(mask_name);
}
if (mask_sliced.get_element_type() != ov::element::f16) {
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
if (mask.get_element_type() != ov::element::f16) {
mask = std::make_shared<ov::op::v0::Convert>(mask, ov::element::f16);
}
auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
int64_t factor = num_heads / num_heads_kv;
if (factor > 1 && num_heads_kv > 1) {
//auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
// int64_t factor = num_heads / num_heads_kv;
// if (factor > 1 && num_heads_kv > 1) {
auto q_shape = context.get_input_shape(0).to_shape();
auto k_shape = context.get_input_shape(1).to_shape();
const int64_t num_heads = q_shape[1];
const int64_t num_heads_kv = k_shape[1];
const int64_t head_size = q_shape[3];
const int64_t factor = num_heads / num_heads_kv;
// Manual GQA attention: enabled by default on GPU in stateless mode.
// Set GGML_OPENVINO_MANUAL_GQA_ATTN to a positive value (e.g. 1) to force-enable,
// or to 0 to force-disable. Unset falls back to the device-based default.
static const bool manual_gqa_enabled = []() {
const char * env = ggml_openvino_getenv_str("GGML_OPENVINO_MANUAL_GQA_ATTN");
if (env != nullptr) {
return ggml_openvino_getenv_int("GGML_OPENVINO_MANUAL_GQA_ATTN") > 0;
}
const char * dev = ggml_openvino_getenv_str("GGML_OPENVINO_DEVICE");
return dev != nullptr && std::string(dev) == "GPU";
}();
const bool use_manual_gqa_attention =
manual_gqa_enabled && factor > 1 && num_heads_kv > 1 && !context.is_stateful();
if (use_manual_gqa_attention) {
// Q, K, V arrive as [B, n_heads(_kv), S, head_size], where B is the active
// batch (n_seq_active) and may be > 1 (llama-perplexity, llama-server -np > 1)
// or dynamic. Reshape to
// K_r: [B, num_heads_kv, 1, S, head_size]
// Q_r: [B, num_heads_kv, factor, S_q, head_size]
// and let MatMul broadcast across the factor dim without materialising
// an expanded K/V. The leading 0 + special_zero=true copies B at runtime,
// so this is correct for B == 1, B > 1, and dynamic B alike. Only the head
// dims and head_size are baked in as literals; the sequence dim stays -1.
auto k_5d_shape = ov::op::v0::Constant::create(ov::element::i64, {5},
std::vector<int64_t>{0, num_heads_kv, 1, -1, head_size});
auto v_5d_shape = ov::op::v0::Constant::create(ov::element::i64, {5},
std::vector<int64_t>{0, num_heads_kv, 1, -1, head_size});
auto q_5d_shape = ov::op::v0::Constant::create(ov::element::i64, {5},
std::vector<int64_t>{0, num_heads_kv, factor, -1, head_size});
auto k_r = std::make_shared<ov::op::v1::Reshape>(k, k_5d_shape, true);
auto v_r = std::make_shared<ov::op::v1::Reshape>(v, v_5d_shape, true);
auto q_r = std::make_shared<ov::op::v1::Reshape>(q, q_5d_shape, true);
// QK^T → [B, num_heads_kv, factor, S_q, S_k]
auto qk = std::make_shared<ov::op::v0::MatMul>(q_r, k_r, /*tA=*/false, /*tB=*/true);
auto qk_scaled = std::make_shared<ov::op::v1::Multiply>(qk, scale_node);
// Mask arrives as [B, 1, S_q, S_k]. Unsqueeze a factor axis at position 2 to
// get [B, 1, 1, S_q, S_k], which NUMPY-broadcasts cleanly against the
// [B, num_heads_kv, factor, S_q, S_k] scores: B==B, then 1→num_heads_kv and
// 1→factor on the head dims.
auto mask_unsq1 =
std::make_shared<ov::op::v0::Unsqueeze>(mask, ov::op::v0::Constant::create(ov::element::i64, {1}, {2}));
// mask_unsq1: [B, 1, 1, S_q, S_k] (rank 5)
ov::Output<ov::Node> qk_masked = std::make_shared<ov::op::v1::Add>(qk_scaled, mask_unsq1);
auto softmax = std::make_shared<ov::op::v8::Softmax>(qk_masked, /*axis=*/-1);
// softmax @ V → [B, num_heads_kv, factor, S_q, head_size]
auto attn = std::make_shared<ov::op::v0::MatMul>(softmax, v_r);
// Reshape back to [B, num_heads, S_q, head_size] (combine num_heads_kv * factor).
// Leading 0 + special_zero=true copies B at runtime.
auto out_4d_shape =
ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{0, num_heads, -1, head_size});
auto out_4d = std::make_shared<ov::op::v1::Reshape>(attn, out_4d_shape, true);
// The standard SDPA path's downstream is Transpose(0,2,1,3) → Convert(f32).
// Replicate it here so callers see the same output layout/dtype.
res = std::make_shared<ov::op::v1::Transpose>(
out_4d, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
return rename_outputs_with_suffix({res}, context.get_name());
}
// Default path: explicit Broadcast → SDPA. Kept as the fallback because
// (a) it goes through the GPU plugin's micro-SDPA fast path (FlashAttention
// tiles via DPAS), and (b) the manual path above is still being validated.
auto tile_kv = [&](int64_t n_heads, int64_t n_heads_kv, int64_t hs, ov::Output<Node> kv) {
int64_t f = n_heads / n_heads_kv;
if (f > 1 && n_heads_kv > 1) {
ov::Output<ov::Node> kv_broadcast_shape, kv_unsqueezed, new_kv_shape;
auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2});
kv_unsqueezed = std::make_shared<ov::op::v0::Unsqueeze>(kv, unsqueeze_axes);
kv_broadcast_shape = ov::op::v0::Constant::create(
ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
kv_broadcast_shape = ov::op::v0::Constant::create(ov::element::i64, {5},
{(int64_t) 1, (int64_t) 1, f, (int64_t) 1, (int64_t) 1});
new_kv_shape =
ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, num_heads, (int64_t) -1, head_size});
ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, n_heads, (int64_t) -1, hs});
// ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
//new_kv_shape =
// ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, num_heads, (int64_t) -1, head_size});
kv = std::make_shared<ov::op::v3::Broadcast>(kv_unsqueezed, kv_broadcast_shape,
ov::op::BroadcastType::BIDIRECTIONAL);
@@ -72,12 +157,14 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
return kv;
};
auto q_shape = context.get_input_shape(0).to_shape();
auto k_shape = context.get_input_shape(1).to_shape();
k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k);
v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v);
//auto q_shape = context.get_input_shape(0).to_shape();
//auto k_shape = context.get_input_shape(1).to_shape();
//k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k);
//v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v);
k = tile_kv(num_heads, num_heads_kv, head_size, k);
v = tile_kv(num_heads, num_heads_kv, head_size, v);
auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask, scale_node, false);
res = std::make_shared<ov::op::v1::Transpose>(sdpa,
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
@@ -0,0 +1,282 @@
#include "gated_delta_net.hpp"
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <cmath>
#include <cstdint>
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/exp.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/loop.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <vector>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
static OutputVector translate_gated_delta_net_ref(const NodeContext & context);
// Entry point for translating the GGML gated-delta-net op.
//
// At present this always forwards to translate_gated_delta_net_ref(). The
// commented-out body below is a disabled translation built on the fused
// ov::op::internal::GatedDeltaNet op; it is kept for reference only and is
// not compiled.
OutputVector translate_gated_delta_net(const NodeContext & context) {
// --- Disabled fused-op path (reference only) ---
// auto v_shape = context.get_input_shape(2).to_shape(); // [B, T, H_v, S_v]
// auto q_shape = context.get_input_shape(0).to_shape(); // [B, T, H_k, S_k]
// // Fused GatedDeltaNet op only supports scalar gate (kda=0).
// // Fall back to reference implementation for per-key-dimension gating.
// // if (kda) {
// // return translate_gated_delta_net_ref(context);
// // }
// auto q = context.get_input(0);
// auto k = context.get_input(1);
// auto v = context.get_input(2);
// auto g = context.get_input(3);
// auto beta = context.get_input(4);
// auto state = context.get_input(5);
// const int64_t B = v_shape[0];
// const int64_t T = v_shape[1];
// const int64_t H_v = v_shape[2];
// const int64_t S_v = v_shape[3];
// const int64_t S_k = q_shape[3];
// // ggml state layout (OV notation): [B, H_v, value_dim, key_dim]
// // GatedDeltaNet op expects: [B, H_v, key_dim, value_dim]
// auto state_reshape_shape =
// ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{B, H_v, S_v, S_k});
// state = std::make_shared<ov::op::v1::Reshape>(state, state_reshape_shape, false);
// auto state_perm = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{0, 1, 3, 2});
// state = std::make_shared<ov::op::v1::Transpose>(state, state_perm);
// g = std::make_shared<ov::op::v0::Squeeze>(g, ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
// beta = std::make_shared<ov::op::v0::Squeeze>(beta, ov::op::v0::Constant::create(ov::element::i64, {1}, {3}));
// auto gdn = std::make_shared<ov::op::internal::GatedDeltaNet>(q, k, v, state, g, beta);
// auto attn_4d = gdn->output(0);
// auto state_4d = gdn->output(1); // [B, H_v, key_dim, value_dim]
// // Transpose output state back to ggml layout [B, H_v, value_dim, key_dim]
// auto state_transposed = std::make_shared<ov::op::v1::Transpose>(state_4d, state_perm);
// auto flat_shape_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
// auto attn = std::make_shared<ov::op::v1::Reshape>(attn_4d, flat_shape_1d, false);
// auto new_state = std::make_shared<ov::op::v1::Reshape>(state_transposed, flat_shape_1d, false);
// auto packed = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{attn, new_state}, 0);
// auto out_shape =
// ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{1, 1, T * B + S_v * B, S_v * H_v});
// auto res = std::make_shared<ov::op::v1::Reshape>(packed, out_shape, false);
// return rename_outputs_with_suffix({res}, context.get_name());
// The OV version in CI does not have the GatedDeltaNet op, so use reference implementation for now.
return translate_gated_delta_net_ref(context);
}
// Reference translation of the ggml gated-delta-net op into plain OpenVINO ops.
//
// The recurrence over the T tokens is expressed as an ov::op::v5::Loop. For each token the body
//   1) decays the carried state by exp(g),
//   2) computes delta = (v - S @ k) * beta,
//   3) adds outer(delta, k) to the state,
//   4) emits attn = (S @ q) * 1/sqrt(S_v).
// The single returned output packs the flattened attention followed by the flattened final
// state and is reshaped to [1, 1, T*B + S_v*B, S_v*H_v].
//
// All sizes are read from the input shapes via to_shape() and baked into Constants.
// NOTE(review): q and k are reshaped with S_v as their last dimension, so this presumably
// requires S_k == S_v, and rq1 uses integer division so H_v is presumably a multiple of H_k —
// confirm against the callers / ggml op constraints.
static OutputVector translate_gated_delta_net_ref(const NodeContext & context) {
num_inputs_check(context, 6, 6);
// Inputs (OV shapes are reversed from ggml):
// ggml: q[S_k, H_k, T, B], k[S_k, H_k, T, B], v[S_v, H_v, T, B]
// OV: q[B, T, H_k, S_k], k[B, T, H_k, S_k], v[B, T, H_v, S_v]
// ggml: g[1 or S_v, H_v, T, B], beta[1, H_v, T, B]
// OV: g[B, T, H_v, 1 or S_v], beta[B, T, H_v, 1]
// ggml: state[S_v, S_v, H_v, B]
// OV: state[B, H_v, S_v, S_v]
auto q = process_view_input_new(context, 0);
auto k = process_view_input_new(context, 1);
auto v = process_view_input_new(context, 2);
auto g = process_view_input_new(context, 3);
auto beta = process_view_input_new(context, 4);
auto state = process_view_input_new(context, 5);
auto v_shape = context.get_input_shape(2).to_shape(); // [B, T, H_v, S_v]
auto q_shape = context.get_input_shape(0).to_shape(); // [B, T, H_k, S_k]
auto g_shape = context.get_input_shape(3).to_shape(); // [B, T, H_v, 1 or S_v]
const int64_t B = v_shape[0];
const int64_t T = v_shape[1];
const int64_t H_v = v_shape[2];
const int64_t S_v = v_shape[3];
const int64_t H_k = q_shape[2];
// kda: the gate carries one value per state column (last dim == S_v) instead of one per head
const bool kda = (g_shape[3] == (size_t) S_v);
const int64_t rq1 = H_v / H_k; // head repeat factor
const float scale = 1.0f / std::sqrt((float) S_v);
auto axis_1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto axis_2 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
// Transpose inputs from [B, T, H, S] to [B, H, T, S] for easier per-head processing
auto perm_0213 = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{0, 2, 1, 3});
auto q_t = std::make_shared<ov::op::v1::Transpose>(q, perm_0213); // [B, H_k, T, S_k]
auto k_t = std::make_shared<ov::op::v1::Transpose>(k, perm_0213); // [B, H_k, T, S_k]
auto v_t = std::make_shared<ov::op::v1::Transpose>(v, perm_0213); // [B, H_v, T, S_v]
auto g_t = std::make_shared<ov::op::v1::Transpose>(g, perm_0213); // [B, H_v, T, 1 or S_v]
auto beta_t = std::make_shared<ov::op::v1::Transpose>(beta, perm_0213); // [B, H_v, T, 1]
// Broadcast Q, K heads to match V heads if GQA is used (H_v > H_k)
ov::Output<ov::Node> q_bh = q_t;
ov::Output<ov::Node> k_bh = k_t;
if (rq1 > 1) {
auto q_unsq = std::make_shared<ov::op::v0::Unsqueeze>(q_t, axis_2); // [B, H_k, 1, T, S]
auto k_unsq = std::make_shared<ov::op::v0::Unsqueeze>(k_t, axis_2); // [B, H_k, 1, T, S]
auto bcast_shape = ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector<int64_t>{1, 1, rq1, 1, 1});
auto q_bcast =
std::make_shared<ov::op::v3::Broadcast>(q_unsq, bcast_shape, ov::op::BroadcastType::BIDIRECTIONAL);
auto k_bcast =
std::make_shared<ov::op::v3::Broadcast>(k_unsq, bcast_shape, ov::op::BroadcastType::BIDIRECTIONAL);
// Transpose [B, H_k, rq1, T, S] -> [B, rq1, H_k, T, S] so that reshape merges
// as [rq1, H_k] giving repeat-blocks pattern matching CPU: iq1 = iv1 % H_k
auto perm_5d = ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector<int64_t>{0, 2, 1, 3, 4});
auto q_transposed = std::make_shared<ov::op::v1::Transpose>(q_bcast, perm_5d);
auto k_transposed = std::make_shared<ov::op::v1::Transpose>(k_bcast, perm_5d);
// The merged q/k are given S_v (not S_k) as their last dimension here
auto new_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{B, H_v, T, S_v});
q_bh = std::make_shared<ov::op::v1::Reshape>(q_transposed, new_shape, false);
k_bh = std::make_shared<ov::op::v1::Reshape>(k_transposed, new_shape, false);
}
// Merge batch and head dims: [B*H_v, T, S_v]
auto merge_bh = [&](ov::Output<ov::Node> x, int64_t last_dim) {
auto shape = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{B * H_v, T, last_dim});
return std::make_shared<ov::op::v1::Reshape>(x, shape, false);
};
auto q_m = merge_bh(q_bh, S_v); // [B*H_v, T, S_v]
auto k_m = merge_bh(k_bh, S_v); // [B*H_v, T, S_v]
auto v_m = merge_bh(v_t, S_v); // [B*H_v, T, S_v]
auto g_m = merge_bh(g_t, kda ? S_v : 1); // [B*H_v, T, 1 or S_v]
auto beta_m = merge_bh(beta_t, 1); // [B*H_v, T, 1]
// State: [B, H_v, S_v, S_v] -> [B*H_v, S_v, S_v]
auto state_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{B * H_v, S_v, S_v});
auto state_m = std::make_shared<ov::op::v1::Reshape>(state, state_shape, false);
auto scale_const = ov::op::v0::Constant::create(ov::element::f32, {}, std::vector<float>{scale});
// --- Build Loop body ---
// Body parameters. body_iter is the iteration counter: it is the first body parameter and is
// registered as special body port 0 below, and it is the index used to Gather the current token.
auto body_state = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape::dynamic());
auto body_q = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape::dynamic());
auto body_k = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape::dynamic());
auto body_v = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape::dynamic());
auto body_g = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape::dynamic());
auto body_beta = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape::dynamic());
auto body_iter = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
// Condition output (always true - we rely on trip_count for termination)
auto body_cond_out = ov::op::v0::Constant::create(ov::element::boolean, ov::Shape{1}, std::vector<bool>{true});
// Gather current token from invariant inputs using iteration counter
// (body_iter has shape {1}, so the gathered token axis is kept with size 1)
auto q_t_cur = std::make_shared<ov::op::v8::Gather>(body_q, body_iter, axis_1); // [B*H_v, 1, S_v]
auto k_t_cur = std::make_shared<ov::op::v8::Gather>(body_k, body_iter, axis_1); // [B*H_v, 1, S_v]
auto v_t_cur = std::make_shared<ov::op::v8::Gather>(body_v, body_iter, axis_1); // [B*H_v, 1, S_v]
auto g_t_cur = std::make_shared<ov::op::v8::Gather>(body_g, body_iter, axis_1); // [B*H_v, 1, 1 or S_v]
auto b_t_cur = std::make_shared<ov::op::v8::Gather>(body_beta, body_iter, axis_1); // [B*H_v, 1, 1]
// Squeeze token dim
auto q_cur = std::make_shared<ov::op::v0::Squeeze>(q_t_cur, axis_1); // [B*H_v, S_v]
auto k_cur = std::make_shared<ov::op::v0::Squeeze>(k_t_cur, axis_1); // [B*H_v, S_v]
auto v_cur = std::make_shared<ov::op::v0::Squeeze>(v_t_cur, axis_1); // [B*H_v, S_v]
auto g_cur = std::make_shared<ov::op::v0::Squeeze>(g_t_cur, axis_1); // [B*H_v, 1 or S_v]
auto b_cur = std::make_shared<ov::op::v0::Squeeze>(b_t_cur, axis_1); // [B*H_v, 1]
// Step 1: Apply decay gate to state
// exp(g) is unsqueezed at axis 1, so it broadcasts over the state's middle axis
auto exp_g = std::make_shared<ov::op::v0::Exp>(g_cur); // [B*H_v, 1 or S_v]
auto exp_g_unsq = std::make_shared<ov::op::v0::Unsqueeze>(exp_g, axis_1); // [B*H_v, 1, 1 or S_v]
auto state_decayed = std::make_shared<ov::op::v1::Multiply>(body_state, exp_g_unsq); // [B*H_v, S_v, S_v]
// Step 2: delta = (v - S @ k) * beta
auto k_col = std::make_shared<ov::op::v0::Unsqueeze>(k_cur, axis_2); // [B*H_v, S_v, 1]
auto sk = std::make_shared<ov::op::v0::MatMul>(state_decayed, k_col, false, false); // [B*H_v, S_v, 1]
auto sk_sq = std::make_shared<ov::op::v0::Squeeze>(sk, axis_2); // [B*H_v, S_v]
auto v_minus_sk = std::make_shared<ov::op::v1::Subtract>(v_cur, sk_sq); // [B*H_v, S_v]
auto delta = std::make_shared<ov::op::v1::Multiply>(v_minus_sk, b_cur); // [B*H_v, S_v]
// Step 3: state += outer(delta, k)
auto delta_col = std::make_shared<ov::op::v0::Unsqueeze>(delta, axis_2); // [B*H_v, S_v, 1]
auto k_row = std::make_shared<ov::op::v0::Unsqueeze>(k_cur, axis_1); // [B*H_v, 1, S_v]
auto outer_prod = std::make_shared<ov::op::v0::MatMul>(delta_col, k_row, false, false); // [B*H_v, S_v, S_v]
auto state_updated = std::make_shared<ov::op::v1::Add>(state_decayed, outer_prod); // [B*H_v, S_v, S_v]
// Step 4: attn_out = S @ q * scale
auto q_col = std::make_shared<ov::op::v0::Unsqueeze>(q_cur, axis_2); // [B*H_v, S_v, 1]
auto sq = std::make_shared<ov::op::v0::MatMul>(state_updated, q_col, false, false); // [B*H_v, S_v, 1]
auto sq_squeezed = std::make_shared<ov::op::v0::Squeeze>(sq, axis_2); // [B*H_v, S_v]
auto attn_out = std::make_shared<ov::op::v1::Multiply>(sq_squeezed, scale_const); // [B*H_v, S_v]
// Unsqueeze attn_out to [B*H_v, 1, S_v] for scan output concatenation
auto attn_out_unsq = std::make_shared<ov::op::v0::Unsqueeze>(attn_out, axis_1); // [B*H_v, 1, S_v]
// --- Assemble Loop ---
// Body: results = [condition, state_updated, attn_out_unsq]
auto body = std::make_shared<ov::Model>(
ov::OutputVector{body_cond_out, state_updated, attn_out_unsq},
ov::ParameterVector{body_iter, body_state, body_q, body_k, body_v, body_g, body_beta});
auto trip_count = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{T});
auto exec_cond = ov::op::v0::Constant::create(ov::element::boolean, ov::Shape{1}, std::vector<bool>{true});
auto loop = std::make_shared<ov::op::v5::Loop>(trip_count, exec_cond);
loop->set_function(body);
// Special ports: body parameter 0 (body_iter) is the current iteration, body result 0 is the condition
loop->set_special_body_ports(ov::op::v5::Loop::SpecialBodyPorts{0, 0});
// Carried state: feeds back from body output 1 to body_state param
loop->set_merged_input(body_state, state_m, state_updated);
// Invariant inputs: passed through unchanged each iteration
loop->set_invariant_input(body_q, q_m);
loop->set_invariant_input(body_k, k_m);
loop->set_invariant_input(body_v, v_m);
loop->set_invariant_input(body_g, g_m);
loop->set_invariant_input(body_beta, beta_m);
// Loop outputs:
// 1) Final state (last iteration value of state_updated)
auto final_state_out = loop->get_iter_value(state_updated, -1); // [B*H_v, S_v, S_v]
// 2) Concatenated attention outputs across all iterations along axis 1
// (arguments after the value: start=0, stride=1, part_size=1, end=-1, axis=1)
auto attn_concat_out = loop->get_concatenated_slices(attn_out_unsq, 0, 1, 1, -1, 1); // [B*H_v, T, S_v]
// --- Pack outputs to match ggml layout ---
// ggml output ne = {S_v*H, T*B + S_v*B, 1, 1} -> OV [1, 1, T*B+S_v*B, S_v*H_v]
// attn: [B, T, H_v, S_v] row-major, state: [B, H_v, S_v, S_v] row-major
// attn: [B*H_v, T, S_v] -> [B, H_v, T, S_v] -> transpose to [B, T, H_v, S_v] -> flatten
auto attn_4d_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{B, H_v, T, S_v});
auto attn_4d = std::make_shared<ov::op::v1::Reshape>(attn_concat_out, attn_4d_shape, false);
auto attn_perm = std::make_shared<ov::op::v1::Transpose>(attn_4d, perm_0213); // [B, T, H_v, S_v]
auto flat_shape_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{-1});
auto attn_1d = std::make_shared<ov::op::v1::Reshape>(attn_perm, flat_shape_1d, false);
// state: [B*H_v, S_v, S_v] -> [B, H_v, S_v, S_v] -> flatten
auto state_4d_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{B, H_v, S_v, S_v});
auto state_4d = std::make_shared<ov::op::v1::Reshape>(final_state_out, state_4d_shape, false);
auto state_1d = std::make_shared<ov::op::v1::Reshape>(state_4d, flat_shape_1d, false);
// Concat [attn | state] and reshape to final output
auto packed = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{attn_1d, state_1d}, 0);
auto out_shape =
ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{1, 1, T * B + S_v * B, S_v * H_v});
auto res = std::make_shared<ov::op::v1::Reshape>(packed, out_shape, false);
return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,65 @@
#pragma once
#include "openvino/op/op.hpp"
namespace ov::op::internal {
/// \note GatedDeltaNet op class is under development and subject to change
///
/// \brief Operator performing Gated Delta Net computation
///
/// Takes six inputs (query, key, value, recurrent_state, gate, beta) and carries three
/// attributes: fuse_qk_l2norm, q_l2_norm_eps and k_l2_norm_eps. Only the declaration lives
/// in this header; the member functions declared without a body are defined elsewhere.
/// \ingroup ov_ops_cpp_api
class OPENVINO_API GatedDeltaNet : public ov::op::Op {
public:
OPENVINO_OP("GatedDeltaNet")
/// \brief Default constructor; attributes keep their in-class defaults
///        (fusion disabled, both epsilons 1e-6).
GatedDeltaNet() = default;
/// \brief Constructs a GatedDeltaNet operation.
///
/// \param query Query tensor input.
/// \param key Key tensor input.
/// \param value Value tensor input.
/// \param recurrent_state Initial recurrent state tensor.
/// \param gate Gate tensor controlling state decay/update.
/// \param beta Beta tensor scaling the delta update.
/// \param fuse_qk_l2norm Enables fusing q/k L2-normalization into this op.
/// \param q_l2_norm_eps Epsilon used for query L2-normalization when fusion is enabled.
/// \param k_l2_norm_eps Epsilon used for key L2-normalization when fusion is enabled.
GatedDeltaNet(const Output<Node>& query,
const Output<Node>& key,
const Output<Node>& value,
const Output<Node>& recurrent_state,
const Output<Node>& gate,
const Output<Node>& beta,
const bool fuse_qk_l2norm = false,
const float q_l2_norm_eps = 1e-6F,
const float k_l2_norm_eps = 1e-6F);
/// \brief Constructs a GatedDeltaNet operation from input vector.
///
/// \param args Input tensor vector in order: query, key, value, recurrent_state, gate, beta.
/// \param fuse_qk_l2norm Enables fusing q/k L2-normalization into this op.
/// \param q_l2_norm_eps Epsilon used for query L2-normalization when fusion is enabled.
/// \param k_l2_norm_eps Epsilon used for key L2-normalization when fusion is enabled.
GatedDeltaNet(const ov::OutputVector& args,
const bool fuse_qk_l2norm = false,
const float q_l2_norm_eps = 1e-6F,
const float k_l2_norm_eps = 1e-6F);
// Overrides of the base-class virtuals; declared here, defined outside this header.
void validate_and_infer_types() override;
bool visit_attributes(AttributeVisitor& visitor) override;
std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
/// \brief Returns the stored fuse_qk_l2norm attribute.
bool get_fuse_qk_l2norm() const {
return m_fuse_qk_l2norm;
}
/// \brief Returns the stored query L2-normalization epsilon.
float get_q_l2_norm_eps() const {
return m_q_l2_norm_eps;
}
/// \brief Returns the stored key L2-normalization epsilon.
float get_k_l2_norm_eps() const {
return m_k_l2_norm_eps;
}
private:
// Attribute storage; the defaults mirror the constructor default arguments.
bool m_fuse_qk_l2norm = false;
float m_q_l2_norm_eps = 1e-6F;
float m_k_l2_norm_eps = 1e-6F;
};
} // namespace ov::op::internal
@@ -18,16 +18,9 @@ namespace op {
OutputVector translate_get_rows(const NodeContext & context) {
num_inputs_check(context, 2, 2);
int op_case = context.get_op_case();
Output<Node> res;
auto data = context.get_input(0);
auto indices = context.get_input(1);
if (op_case == 2) {
// The input comes from a VIEW
indices = process_view_input(context, 1);
}
auto data = process_view_input_new(context, 0);
auto indices = process_view_input_new(context, 1);
// data[1,b,x,y] ind[1,1,b,x'] test-backend-ops case
// data[x,y] ind[1,1,1,x'] normal case
@@ -4,6 +4,7 @@
#include <memory>
#include <openvino/core/node_output.hpp>
#include <openvino/op/clamp.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/gelu.hpp>
#include <openvino/op/multiply.hpp>
@@ -21,23 +22,26 @@ OutputVector translate_glu_geglu(const NodeContext & context) {
ov::Output<ov::Node> src0;
ov::Output<ov::Node> src1;
if (context.get_input_size() == 2) {
src0 = context.get_input(0);
src1 = context.get_input(1);
// Inputs may be VIEW slices of a combined gate_up tensor (MoE experts):
// resolve them so each half has its real sliced shape, not the base tensor.
src0 = process_view_input_new(context, 0);
src1 = process_view_input_new(context, 1);
} else {
// GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2.
// Both halves are nc elements; if the dimension is odd, the last element is dropped.
// Use Slice instead of Split to handle odd dimensions correctly.
auto combined = context.get_input(0);
// Resolve a VIEW input (e.g. non-contiguous slice) to its real shape first.
auto combined = process_view_input_new(context, 0);
auto combined_shape = combined.get_partial_shape();
int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length();
int64_t nc = last_dim_val / 2;
auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});
auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});
src0 = std::make_shared<ov::op::v8::Slice>(combined, start0, stop0, step, axis);
src1 = std::make_shared<ov::op::v8::Slice>(combined, start1, stop1, step, axis);
@@ -49,6 +53,16 @@ OutputVector translate_glu_geglu(const NodeContext & context) {
std::swap(src0, src1);
}
if (context.is_static()) {
// TODO: Temporary solution for NPU accuracy issue due to fp16 overflow
// To be removed once permanent solution is implemented
// Justification:
// For |x| > 5, GELU(x) ≈ max(x, 0) (behaves like ReLU)
// So Clamp(-10, 10) only affects values where GELU would return ≈ x anyway.
// The only loss: values > 10 get mapped to 10 instead of x.
// In practice, FFN intermediates rarely exceed 10 after GEGLU gating.
src0 = std::make_shared<ov::op::v0::Clamp>(src0, -10.0, 10.0);
}
auto gelu = std::make_shared<ov::op::v7::Gelu>(src0);
auto res = std::make_shared<ov::op::v1::Multiply>(gelu, src1);
@@ -21,23 +21,26 @@ OutputVector translate_glu_swiglu(const NodeContext & context) {
ov::Output<ov::Node> src0;
ov::Output<ov::Node> src1;
if (context.get_input_size() == 2) {
src0 = context.get_input(0);
src1 = context.get_input(1);
// Inputs may be VIEW slices of a combined gate_up tensor (MoE experts):
// resolve them so each half has its real sliced shape, not the base tensor.
src0 = process_view_input_new(context, 0);
src1 = process_view_input_new(context, 1);
} else {
// GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2.
// Both halves are nc elements; if the dimension is odd, the last element is dropped.
// Use Slice instead of Split to handle odd dimensions correctly.
auto combined = context.get_input(0);
// Resolve a VIEW input (e.g. non-contiguous slice) to its real shape first.
auto combined = process_view_input_new(context, 0);
auto combined_shape = combined.get_partial_shape();
int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length();
int64_t nc = last_dim_val / 2;
auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc});
auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});
auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc});
src0 = std::make_shared<ov::op::v8::Slice>(combined, start0, stop0, step, axis);
src1 = std::make_shared<ov::op::v8::Slice>(combined, start1, stop1, step, axis);
@@ -0,0 +1,120 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include "ggml-impl.h"
#include <cstddef>
#include <memory>
#include <openvino/core/shape.hpp>
#include <openvino/core/strides.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/extractimagepatches.hpp>
#include <openvino/op/pad.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/transpose.hpp>
#include <openvino/op/util/attr_types.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Lowers ggml IM2COL to an OpenVINO subgraph: zero Pad -> ExtractImagePatches -> layout fix-up.
// Input 0 is the convolution kernel (only its static shape is read); input 1 is the image.
// op_params hold {s0, s1, p0, p1, d0, d1, is_2D}. The 1D case is routed through the 2D path
// with a unit kernel height, unit vertical stride/dilation and no vertical padding.
OutputVector translate_im2col(const NodeContext & context) {
    num_inputs_check(context, 2, 2);

    const int32_t * op_params = context.get_output_op_params();
    const bool      two_dim   = op_params[6] == 1;

    const int32_t stride_w = op_params[0];
    const int32_t stride_h = two_dim ? op_params[1] : 1;
    const int32_t pad_w    = op_params[2];
    const int32_t pad_h    = two_dim ? op_params[3] : 0;
    const int32_t dil_w    = op_params[4];
    const int32_t dil_h    = two_dim ? op_params[5] : 1;

    // Small helpers to keep the shape/permutation constants readable.
    const auto as_i64    = [](size_t value) { return static_cast<int64_t>(value); };
    const auto make_dims = [](const std::vector<int64_t> & dims) {
        return ov::op::v0::Constant::create(ov::element::i64, ov::Shape{dims.size()}, dims);
    };

    ov::Output<Node> image       = context.get_input(1);
    const ov::Shape  kernel_dims = context.get_input(0).get_shape();

    // Input channels sit at kernel index 1 (2D) or 2 (1D); the kernel width is always last.
    const size_t in_ch    = two_dim ? kernel_dims[1] : kernel_dims[2];
    const size_t kernel_h = two_dim ? kernel_dims[2] : 1;
    const size_t kernel_w = kernel_dims[3];

    if (!two_dim) {
        // GGML input shape:     [IW, IC, N, 1]
        // OpenVINO input shape: [1, N, IC, IW]
        // Reshape image to:     [N, IC, 1, IW]
        const ov::Shape image_dims = image.get_shape();
        image = std::make_shared<ov::op::v1::Reshape>(
            image, make_dims({as_i64(image_dims[1]), as_i64(in_ch), 1, as_i64(image_dims[3])}), false);
    }

    // Symmetric spatial padding, applied explicitly so patch extraction can run in VALID mode.
    auto pad_lo = make_dims({0, 0, pad_h, pad_w});
    auto pad_hi = make_dims({0, 0, pad_h, pad_w});
    auto padded = std::make_shared<ov::op::v1::Pad>(image, pad_lo, pad_hi, ov::op::PadMode::CONSTANT);

    auto patches = std::make_shared<ov::op::v3::ExtractImagePatches>(
        padded, ov::Shape{kernel_h, kernel_w},
        ov::Strides{static_cast<size_t>(stride_h), static_cast<size_t>(stride_w)},
        ov::Shape{static_cast<size_t>(dil_h), static_cast<size_t>(dil_w)}, ov::op::PadType::VALID);

    // [N, KH*KW*IC, OH, OW] -> [N, OH, OW, KH*KW*IC]
    auto patches_last = std::make_shared<ov::op::v1::Transpose>(patches, make_dims({0, 2, 3, 1}));

    const ov::Shape patch_dims = patches_last->get_output_shape(0);
    const int64_t   batch      = as_i64(patch_dims[0]);
    const int64_t   out_h      = as_i64(patch_dims[1]);
    const int64_t   out_w      = as_i64(patch_dims[2]);

    // [N, OH, OW, KH*KW*IC] -> [N, OH, OW, KH*KW, IC]
    auto split = std::make_shared<ov::op::v1::Reshape>(
        patches_last, make_dims({batch, out_h, out_w, as_i64(kernel_h * kernel_w), as_i64(in_ch)}), false);

    // [N, OH, OW, KH*KW, IC] -> [N, OH, OW, IC, KH*KW]
    auto channel_major = std::make_shared<ov::op::v1::Transpose>(split, make_dims({0, 1, 2, 4, 3}));

    // Collapse back to [N, OH, OW, IC*KH*KW]
    ov::Output<Node> res = std::make_shared<ov::op::v1::Reshape>(
        channel_major, make_dims({batch, out_h, out_w, as_i64(in_ch * kernel_h * kernel_w)}), false);

    if (!two_dim) {
        // [N, 1, OW, IC * KW] -> [1, N, OW, IC * KW]
        res = std::make_shared<ov::op::v1::Reshape>(
            res, make_dims({1, batch, out_w, as_i64(in_ch * kernel_w)}), false);
    }

    // Match the element type ggml expects for the destination tensor.
    const auto wanted_type = context.get_output_type();
    if (res.get_element_type() != wanted_type) {
        res = std::make_shared<ov::op::v0::Convert>(res, wanted_type);
    }

    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,44 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <memory>
#include <openvino/op/constant.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/maximum.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reduce_sum.hpp>
#include <openvino/op/sqrt.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates ggml L2_NORM: y = x / max(sqrt(sum(x^2 over the last axis)), eps).
// eps is stored as the raw bytes of a float at the start of the op params.
OutputVector translate_l2_norm(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto x = process_view_input_new(context, 0);

    // Read eps with memcpy because op_params is exposed as an int32 buffer.
    float epsilon;
    memcpy(&epsilon, context.get_output_op_params(), sizeof(float));

    // Sum of squares along the innermost axis, keeping the axis so it broadcasts back over x.
    auto x_sq      = std::make_shared<ov::op::v1::Multiply>(x, x);
    auto last_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1});
    auto sq_sum    = std::make_shared<ov::op::v1::ReduceSum>(x_sq, last_axis, true);
    auto magnitude = std::make_shared<ov::op::v0::Sqrt>(sq_sum);

    // Bound the norm from below by eps to avoid dividing by (near) zero.
    auto lower_bound = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {epsilon});
    auto denominator = std::make_shared<ov::op::v1::Maximum>(magnitude, lower_bound);

    auto res = std::make_shared<ov::op::v1::Divide>(x, denominator);
    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,108 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <memory>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/unsqueeze.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates ggml MUL_MAT_ID (per-token expert matmul): the expert weight matrices chosen by
// `ids` are gathered, and each one is multiplied with the matching activation row.
OutputVector translate_mul_mat_id(const NodeContext & context) {
    num_inputs_check(context, 3, 3);

    auto expert_weights = process_view_input_new(context, 0);
    auto activations    = process_view_input_new(context, 1);
    auto ids            = process_view_input_new(context, 2);

    // OpenVINO sees GGML tensors in reversed dimension order:
    //   weights:     [1, n_expert, m, k]
    //   activations: [1, n_tokens, n_used_or_1, k]
    //   ids:         [1, 1, n_tokens, n_used]
    // The leading singleton axes are dropped by reshaping to dimensions picked from the
    // runtime shape rather than squeezing fixed axes: inputs may arrive through VIEW/RESHAPE
    // chains where the singleton axes are represented differently.
    auto weights_shape_4d = std::make_shared<ov::op::v3::ShapeOf>(expert_weights, ov::element::i64);
    auto acts_shape_4d    = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
    auto ids_shape_4d     = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);

    auto weights_dims_3d = get_dimensions(weights_shape_4d, {1, 2, 3});
    auto acts_dims_3d    = get_dimensions(acts_shape_4d, {1, 2, 3});
    auto ids_dims_2d     = get_dimensions(ids_shape_4d, {2, 3});

    expert_weights = std::make_shared<ov::op::v1::Reshape>(expert_weights, weights_dims_3d, false);
    activations    = std::make_shared<ov::op::v1::Reshape>(activations, acts_dims_3d, false);
    ids            = std::make_shared<ov::op::v1::Reshape>(ids, ids_dims_2d, false);

    // Gather needs integer indices; anything that is neither i32 nor i64 is converted to i32.
    const auto ids_type       = ids.get_element_type();
    const bool ids_are_integer = ids_type == ov::element::i32 || ids_type == ov::element::i64;
    if (!ids_are_integer) {
        ids = std::make_shared<ov::op::v0::Convert>(ids, ov::element::i32);
    }

    // Pick one [m, k] weight matrix per (token, slot): result is [n_tokens, n_used, m, k].
    auto expert_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
    ov::Output<ov::Node> chosen_weights = std::make_shared<ov::op::v8::Gather>(expert_weights, ids, expert_axis);

    const auto output_type = context.get_output_type();

    // The matmul itself is carried out in f32.
    if (chosen_weights.get_element_type() != ov::element::f32) {
        chosen_weights = std::make_shared<ov::op::v0::Convert>(chosen_weights, ov::element::f32);
    }
    if (activations.get_element_type() != ov::element::f32) {
        activations = std::make_shared<ov::op::v0::Convert>(activations, ov::element::f32);
    }

    // Expand activations to [n_tokens, n_used, k] so a shared row is repeated for every slot.
    auto acts_shape = std::make_shared<ov::op::v3::ShapeOf>(activations, ov::element::i64);
    auto ids_shape  = std::make_shared<ov::op::v3::ShapeOf>(ids, ov::element::i64);
    ov::Output<ov::Node> acts_full_dims = std::make_shared<ov::op::v0::Concat>(
        ov::OutputVector{
            get_dimensions(acts_shape, {0}),
            get_dimensions(ids_shape, {1}),
            get_dimensions(acts_shape, {2}),
        },
        0);
    ov::Output<ov::Node> acts_full =
        std::make_shared<ov::op::v3::Broadcast>(activations, acts_full_dims, ov::op::BroadcastType::BIDIRECTIONAL);

    // Insert a row axis: [n_tokens, n_used, 1, k], ready for a batched matmul.
    auto row_axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
    auto acts_rows = std::make_shared<ov::op::v0::Unsqueeze>(acts_full, row_axis);

    auto leading_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});

    // The output row count is taken from the (static) ggml output shape.
    auto out_pshape = context.get_output_shape();
    FRONT_END_OP_CONVERSION_CHECK(out_pshape.rank().is_static() && out_pshape.rank().get_length() == 4,
                                  "Unexpected MUL_MAT_ID output rank");
    FRONT_END_OP_CONVERSION_CHECK(out_pshape[3].is_static(), "Expected static row dimension for MUL_MAT_ID output");
    const auto rows = out_pshape[3].get_length();
    auto rows_const = ov::op::v0::Constant::create(ov::element::i64, {1}, {rows});

    // [n_tokens, n_used, 1, k] x [n_tokens, n_used, m, k]^T
    ov::Output<ov::Node> result = std::make_shared<ov::op::v0::MatMul>(acts_rows, chosen_weights, false, true);

    // Restore the 4D layout: [1, n_tokens, n_used, rows].
    auto final_dims = std::make_shared<ov::op::v0::Concat>(
        ov::OutputVector{
            leading_one,
            get_dimensions(ids_shape, {0, 1}),
            rows_const,
        },
        0);
    result = std::make_shared<ov::op::v1::Reshape>(result, final_dims, false);

    if (result.get_element_type() != output_type) {
        result = std::make_shared<ov::op::v0::Convert>(result, output_type);
    }

    return rename_outputs_with_suffix({result}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
+19 -9
View File
@@ -30,17 +30,16 @@ OutputVector translate_mulmat(const NodeContext & context) {
int op_case = context.get_op_case();
ov::Output<Node> res;
ov::Output<ov::Node> B = context.get_input(0);
ov::Output<ov::Node> A = context.get_input(1);
bool transpose_b = true;
if (op_case == 2) {
B = B.get_node_shared_ptr()->input_value(0);
transpose_b = false;
} else if (op_case == 3) {
ov::Output<ov::Node> B;
ov::Output<ov::Node> A;
if (op_case == 3) {
B = process_view_input(context, 0);
A = process_view_input(context, 1);
} else {
B = process_view_input_new(context, 0);
A = process_view_input_new(context, 1);
}
if (A.get_element_type() != B.get_element_type()) {
B = std::make_shared<ov::op::v0::Convert>(context.get_input(0), context.get_input_type(1));
}
@@ -55,6 +54,7 @@ OutputVector translate_mulmat(const NodeContext & context) {
auto batch_small = A_batch_larger ? B_batch : A_batch;
Output<Node> Z = A_batch_larger ? B : A;
auto Z_shape = A_batch_larger ? B_shape : A_shape;
int64_t factor = batch_large / batch_small;
if (factor > 1 && batch_small > 1) {
auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_large});
@@ -67,7 +67,11 @@ OutputVector translate_mulmat(const NodeContext & context) {
auto broadcast_shape = ov::op::v0::Constant::create(
ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
{(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]});
{(int64_t) 0, batch_large, (int64_t) -1, (int64_t) Z_shape[3]});
if (op_case == 2) {
new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
{(int64_t) 0, batch_large, (int64_t) Z_shape[2], (int64_t) -1});
}
auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape,
ov::op::BroadcastType::BIDIRECTIONAL);
@@ -79,8 +83,14 @@ OutputVector translate_mulmat(const NodeContext & context) {
A = Z;
}
bool transpose_b = true;
res = std::make_shared<ov::op::v0::MatMul>(A, B, false, transpose_b);
const auto output_type = context.get_output_type();
if (res.get_element_type() != output_type) {
res = std::make_shared<ov::op::v0::Convert>(res, output_type);
}
return rename_outputs_with_suffix({res}, context.get_name());
}
@@ -0,0 +1,58 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <memory>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/power.hpp>
#include <openvino/op/reduce_mean.hpp>
#include <openvino/op/sqrt.hpp>
#include <openvino/op/subtract.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates ggml NORM: y = (x - mean(x)) / sqrt(var(x) + eps), with mean and variance taken
// over the last axis. eps is stored as the raw bytes of a float at the start of the op params.
OutputVector translate_norm(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto x = process_view_input_new(context, 0);

    // mean over the last axis (kept, so it broadcasts against x)
    auto mean_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1});
    auto mu        = std::make_shared<ov::op::v1::ReduceMean>(x, mean_axis, true);

    // deviation from the mean
    auto deviation = std::make_shared<ov::op::v1::Subtract>(x, mu);

    // squared deviation
    auto two        = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f});
    auto deviation2 = std::make_shared<ov::op::v1::Power>(deviation, two);

    // variance = mean of the squared deviation
    auto var_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1});
    auto var      = std::make_shared<ov::op::v1::ReduceMean>(deviation2, var_axis, true);

    // eps is read with memcpy because op_params is exposed as an int32 buffer
    float epsilon;
    memcpy(&epsilon, context.get_output_op_params(), sizeof(float));

    // sigma = sqrt(variance + eps)
    auto eps_node = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {epsilon});
    auto var_eps  = std::make_shared<ov::op::v1::Add>(var, eps_node);
    auto sigma    = std::make_shared<ov::op::v0::Sqrt>(var_eps);

    // normalized output
    auto res = std::make_shared<ov::op::v1::Divide>(deviation, sigma);
    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,95 @@
#include "../op_table.h"
#include "../utils.h"
#include <array>
#include <cstdint>
#include <openvino/op/constant.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/pad.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <vector>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
namespace {
// Circular (wrap-around) padding expressed as one Gather per padded axis.
//
// `pads` holds eight values: even entries are used as leading pads and odd
// entries as trailing pads, with pair 0 applied to axis 3, pair 1 to axis 2,
// and so on down to pair 3 on axis 0.
// For every axis with a non-zero pad, a constant index list of length
// dim + lead + trail is built in which position i reads source element
// (i - lead) mod dim, and the running result is gathered along that axis.
// Axes with no padding are left untouched.
ov::Output<ov::Node> translate_circular_pad(ov::Output<ov::Node> input,
                                            const std::array<int32_t, 8> & pads,
                                            const ov::Shape & input_shape) {
    const std::array<int32_t, 4> leading  = {pads[6], pads[4], pads[2], pads[0]};
    const std::array<int32_t, 4> trailing = {pads[7], pads[5], pads[3], pads[1]};

    ov::Output<ov::Node> padded = input;
    const size_t rank = input_shape.size();

    for (size_t axis = 0; axis != rank; ++axis) {
        const int64_t lead  = leading[axis];
        const int64_t trail = trailing[axis];
        const bool axis_is_padded = lead != 0 || trail != 0;

        if (axis_is_padded) {
            const int64_t input_dim = static_cast<int64_t>(input_shape[axis]);
            FRONT_END_CHECK_IMPLEMENTED(input_dim > 0, "Circular PAD requires static non-zero input dimensions");

            // One source index per element of the padded axis.
            std::vector<int64_t> source_index(static_cast<size_t>(input_dim + lead + trail));
            int64_t position = 0;
            for (auto & slot : source_index) {
                // Double modulo keeps the result in [0, input_dim) even when
                // (position - lead) is negative.
                slot = ((position - lead) % input_dim + input_dim) % input_dim;
                ++position;
            }

            auto gather_indices =
                ov::op::v0::Constant::create(ov::element::i64, {source_index.size()}, source_index);
            auto gather_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {axis});
            padded = std::make_shared<ov::op::v8::Gather>(padded, gather_indices, gather_axis);
        }
    }

    return padded;
}
} // namespace
// Translates a PAD node with exactly one input.
//
// - If the input and output shapes are equal, the input is passed through a
//   Reshape to its own runtime shape and nothing is padded.
// - Otherwise the first eight op params are read as (begin, end) pad pairs and
//   the ninth as a "circular" flag. Circular padding is delegated to
//   translate_circular_pad; everything else becomes a constant-mode Pad filled
//   with zero of the input element type.
// Pair 0 of the op params is applied to axis 3, pair 1 to axis 2, and so on.
OutputVector translate_pad(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto input = process_view_input_new(context, 0);

    const bool shapes_match = context.get_input_shape(0) == context.get_output_shape();
    if (shapes_match) {
        auto own_shape   = std::make_shared<ov::op::v3::ShapeOf>(input);
        auto passthrough = std::make_shared<ov::op::v1::Reshape>(input, own_shape, false);
        return rename_outputs_with_suffix({passthrough}, context.get_name());
    }

    const int32_t * op_params = context.get_output_op_params();
    FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "PAD requires output op params");

    // Snapshot the eight pad amounts; slot 8 carries the circular flag.
    std::array<int32_t, 8> pads{};
    for (size_t slot = 0; slot < pads.size(); ++slot) {
        pads[slot] = op_params[slot];
    }

    if (op_params[8] != 0) {
        auto wrapped = translate_circular_pad(input, pads, context.get_input_shape(0).to_shape());
        return rename_outputs_with_suffix({wrapped}, context.get_name());
    }

    // Reverse the pair order so that pair 0 lands on the last axis.
    std::vector<int64_t> pads_begin(4);
    std::vector<int64_t> pads_end(4);
    for (size_t axis = 0; axis < 4; ++axis) {
        const size_t pair = 3 - axis;
        pads_begin[axis] = pads[2 * pair];
        pads_end[axis]   = pads[2 * pair + 1];
    }

    auto pads_begin_node = ov::op::v0::Constant::create(ov::element::i64, {pads_begin.size()}, pads_begin);
    auto pads_end_node   = ov::op::v0::Constant::create(ov::element::i64, {pads_end.size()}, pads_end);
    auto pad_value       = ov::op::v0::Constant::create(context.get_input_type(0), ov::Shape{}, {0});

    auto res = std::make_shared<ov::op::v1::Pad>(input, pads_begin_node, pads_end_node, pad_value,
                                                 ov::op::PadMode::CONSTANT);
    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
+58 -13
View File
@@ -12,6 +12,7 @@
#include <openvino/op/reshape.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/transpose.hpp>
#include <vector>
namespace ov {
namespace frontend {
@@ -22,16 +23,33 @@ OutputVector translate_permute(const NodeContext & context) {
num_inputs_check(context, 1, 1);
int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4,
"Unsupported PERMUTE case");
FRONT_END_CHECK_IMPLEMENTED(op_case != 0, "Unsupported PERMUTE case");
// op_case 1 is trivial permute
// op_case 2 is to permute Q. It has a preceding VIEW that reshapes Q to restore the sequence dimension
// op_case 3 4 is to permute KV cache in the default layout
// op_case 5 6 is to permute V cache when `-fa off`, where v_trans=true
ov::Output<Node> res;
auto src = context.get_input(0);
auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
ov::Output<Node> src;
if (op_case == 3 || op_case == 4 || op_case == 5 || op_case == 6) {
src = context.get_input(0);
} else {
src = process_view_input_new(context, 0);
}
std::vector<int64_t> perm_values{0, 2, 1, 3};
const int32_t * op_params = context.get_output_op_params();
if (op_params != nullptr) {
for (size_t input_axis = 0; input_axis < perm_values.size(); ++input_axis) {
const size_t output_axis = static_cast<size_t>(op_params[input_axis]);
perm_values[perm_values.size() - 1 - output_axis] =
static_cast<int64_t>(perm_values.size() - 1 - input_axis);
}
}
auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, perm_values);
if (op_case == 1 || context.is_stateful()) {
res = std::make_shared<ov::op::v1::Transpose>(src, perm);
} else if (op_case == 4) {
} else if (op_case == 2) {
auto output_shape = context.get_output_shape().to_shape();
auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]});
auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]});
@@ -54,13 +72,17 @@ OutputVector translate_permute(const NodeContext & context) {
auto output_shape = context.get_output_shape().to_shape();
int64_t head_size = output_shape[3];
int64_t n_heads = output_shape[1];
if (op_case == 5 || op_case == 6) {
head_size = output_shape[2];
n_heads = output_shape[1];
}
int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1;
int64_t n_seq = cache_shape[1].get_length();
Output<Node> attention_size;
if (!context.has_input("attention_size")) {
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]});
} else if (op_case == 2) {
} else if (op_case == 3 || op_case == 5) {
attention_size = context.get_input("attention_size");
} else {
attention_size = context.get_input("attention_size_swa");
@@ -80,18 +102,41 @@ OutputVector translate_permute(const NodeContext & context) {
seq_active_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_end_val});
}
// 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size]
// 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] (for `-fa off` [n_seq, n_heads, head_size, ctx_per_seq])
// 2. slice out the active sequences
// 3. slice out the attention part in each sequence
// 4. permute
// 4. permute (skip for `-fa off`)
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), false);
auto slice1 = std::make_shared<ov::op::v8::Slice>(src_reshaped, seq_active_start, seq_active_end, one, zero);
auto slice2 = std::make_shared<ov::op::v8::Slice>(slice1, zero, attention_size, one, one);
res = std::make_shared<ov::op::v1::Transpose>(slice2, perm);
if (op_case == 3 || op_case == 4) {
auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}),
false);
ov::Output<ov::Node> after_seq_slice;
if (n_seq == 1) {
after_seq_slice = src_reshaped;
} else {
after_seq_slice =
std::make_shared<ov::op::v8::Slice>(src_reshaped, seq_active_start, seq_active_end, one, zero);
}
auto slice2 = std::make_shared<ov::op::v8::Slice>(after_seq_slice, zero, attention_size, one, one);
res = std::make_shared<ov::op::v1::Transpose>(slice2, perm);
} else {
auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, n_heads, head_size, ctx_per_seq}),
false);
ov::Output<ov::Node> after_seq_slice;
if (n_seq == 1) {
after_seq_slice = src_reshaped;
} else {
after_seq_slice =
std::make_shared<ov::op::v8::Slice>(src_reshaped, seq_active_start, seq_active_end, one, zero);
}
auto slice2 = std::make_shared<ov::op::v8::Slice>(after_seq_slice, zero, attention_size, one, three);
res = slice2;
}
}
return rename_outputs_with_suffix({res}, context.get_name());
}
@@ -0,0 +1,74 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include "ggml.h"
#include <memory>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/tile.hpp>
#include <vector>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// GGML_OP_REPEAT tiles src[0] to fill the destination shape. Every destination
// dimension is an integer multiple of the corresponding source dimension.
//
// Two translation paths:
// - Static: when input and output have the same static rank and every
//   dimension is static, the per-axis repeat counts are computed at conversion
//   time and emitted as a constant for Tile. A non-divisible or non-positive
//   dimension fails the conversion.
// - Dynamic fallback: repeat counts are computed in-graph as
//   target_shape / ShapeOf(input). The target shape is a constant when the
//   output shape is fully static, otherwise it is taken from ShapeOf(input 1).
//   The op accepts one or two inputs, so the fallback verifies that input 1
//   exists before reading it.
OutputVector translate_repeat(const NodeContext & context) {
    num_inputs_check(context, 1, 2);

    auto input = process_view_input_new(context, 0);
    const auto input_shape = context.get_input_shape(0);
    const auto output_shape = context.get_output_shape();

    if (input_shape.rank().is_static() && output_shape.rank().is_static() &&
        input_shape.rank() == output_shape.rank()) {
        const auto rank = static_cast<size_t>(input_shape.rank().get_length());
        std::vector<int64_t> repeats(rank, 1);
        bool all_static = true;

        for (size_t axis = 0; axis < rank; ++axis) {
            if (!input_shape[axis].is_static() || !output_shape[axis].is_static()) {
                // At least one dimension is only known at runtime; use the fallback.
                all_static = false;
                break;
            }

            const int64_t input_dim = input_shape[axis].get_length();
            const int64_t output_dim = output_shape[axis].get_length();
            FRONT_END_OP_CONVERSION_CHECK(input_dim > 0 && output_dim > 0 && output_dim % input_dim == 0,
                                          "REPEAT input shape ", input_shape, " cannot tile to match ", output_shape);
            repeats[axis] = output_dim / input_dim;
        }

        if (all_static) {
            auto repeats_node = ov::op::v0::Constant::create(ov::element::i64, {repeats.size()}, repeats);
            ov::Output<ov::Node> res = std::make_shared<ov::op::v0::Tile>(input, repeats_node);
            return rename_outputs_with_suffix({res}, context.get_name());
        }
    }

    // Dynamic fallback: tile by the ratio of output to input shape.
    auto input_shape_node = std::make_shared<ov::op::v3::ShapeOf>(input, ov::element::i64);
    std::shared_ptr<ov::Node> target_shape_node;
    if (output_shape.rank().is_static() && output_shape.is_static()) {
        const auto static_output_shape = output_shape.to_shape();
        target_shape_node =
            ov::op::v0::Constant::create(ov::element::i64, {static_output_shape.size()}, static_output_shape);
    } else {
        // The target shape must come from the second input, which is optional.
        FRONT_END_OP_CONVERSION_CHECK(context.get_input_size() > 1,
                                      "REPEAT with a dynamic output shape requires a second input for the target shape");
        target_shape_node = std::make_shared<ov::op::v3::ShapeOf>(context.get_input(1), ov::element::i64);
    }

    auto repeats_node = std::make_shared<ov::op::v1::Divide>(target_shape_node, input_shape_node);
    ov::Output<ov::Node> res = std::make_shared<ov::op::v0::Tile>(input, repeats_node);
    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
+13 -6
View File
@@ -10,7 +10,6 @@
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/reshape.hpp>
#include <stdexcept>
#include <vector>
namespace ov {
@@ -20,7 +19,8 @@ namespace op {
OutputVector translate_reshape(const NodeContext & context) {
num_inputs_check(context, 1, 1);
if (context.get_input_shape(0) == context.get_output_shape()) {
if (context.get_input(0).get_partial_shape().is_static() &&
context.get_input_shape(0) == context.get_output_shape()) {
return {context.get_input(0)};
}
@@ -34,12 +34,12 @@ OutputVector translate_reshape(const NodeContext & context) {
if (op_case == 1) {
if (context.is_stateful()) {
new_shape_node = ov::op::v0::Constant::create(
ov::element::i64, {3},
std::vector<int64_t>{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
ov::element::i64, {3}, std::vector<int64_t>{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
} else {
new_shape_node = ov::op::v0::Constant::create(
ov::element::i64, {4},
std::vector<int64_t>{(int64_t) output_shape[0], -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
std::vector<int64_t>{(int64_t) output_shape[0], -1, (int64_t) output_shape[2],
(int64_t) output_shape[3]});
}
} else if (op_case == 2) {
new_shape_node = ov::op::v0::Constant::create(
@@ -47,7 +47,14 @@ OutputVector translate_reshape(const NodeContext & context) {
std::vector<int64_t>{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, (int64_t) output_shape[3]});
} else if (op_case == 3) {
throw std::runtime_error("might be outdated RESHAPE case");
// - 14: [ 1, 1024, 1, 1] RESHAPE Vcur-0 (reshaped) (reshaped)
// [ 512, 2, 1, 1] 0: RESHAPE Vcur-0 (reshaped)
// - 15: [ 1, 524288, 1, 1] RESHAPE cache_v_l0 (reshaped)
// [ 512, 1024, 1, 1] 0: NONE cache_v_l0
// - 16: [ 1, 524288, 1, 1] SET_ROWS cache_v_l0 (reshaped) (view)
// [ 1, 1024, 1, 1] 0: RESHAPE Vcur-0 (reshaped) (reshaped)
// [ 1024, 1, 1, 1] 1: NONE leaf_11
// [ 1, 524288, 1, 1] 2: RESHAPE cache_v_l0 (reshaped)
new_shape_node = ov::op::v0::Constant::create(
ov::element::i64, {4}, std::vector<int64_t>{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, 1});
@@ -19,7 +19,7 @@ namespace op {
OutputVector translate_rms_norm(const NodeContext & context) {
num_inputs_check(context, 1, 1);
auto input_node = context.get_input(0);
auto input_node = process_view_input_new(context, 0);
auto square = std::make_shared<ov::op::v1::Power>(
input_node, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f}));
+131 -35
View File
@@ -7,6 +7,7 @@
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
@@ -38,8 +39,7 @@ OutputVector translate_rope(const NodeContext & context) {
auto data_node = context.get_input(0).get_node_shared_ptr();
auto output_shape = context.get_output_shape().to_shape();
int32_t * op_params = context.get_output_op_params();
const int mode = (op_case & 0xFFFF0000) >> 16;
op_case = (op_case & 0x0000FFFF);
const int mode = op_case;
constexpr int TYPE_NORMAL = 0;
constexpr int TYPE_NEOX = 1;
@@ -56,55 +56,146 @@ OutputVector translate_rope(const NodeContext & context) {
if (context.get_input_size() == 3) {
rope_freqs_weight = context.get_input(2).get_node_shared_ptr();
}
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE);
auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE, false);
sin_theta_node = sin_cos.first;
cos_theta_node = sin_cos.second;
}
if (op_case == 2) {
// The input comes from a VIEW
int slice_len = output_shape[2] * output_shape[3];
data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr();
if (context.get_view_input_size(0) > 0) {
data_node = process_view_input_new(context, 0).get_node_shared_ptr();
if (context.is_stateful()) {
auto data_shape = ov::op::v0::Constant::create(
ov::element::i64, {3}, std::vector<int64_t>{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
data_node = std::make_shared<ov::op::v1::Reshape>(data_node, data_shape, false);
} else {
auto data_shape = ov::op::v0::Constant::create(
ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
ov::element::i64, {4},
std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
data_node = std::make_shared<ov::op::v1::Reshape>(data_node, data_shape, false);
}
}
auto output_type = context.get_output_type();
if (data_node->get_element_type() != ov::element::f32) {
data_node = std::make_shared<ov::op::v0::Convert>(data_node, ov::element::f32);
}
// TODO(openvino-gpu-rope-fusion): TEMPORARY WORKAROUND - do NOT revert until the
// OpenVINO GPU plugin is updated.
//
// For TYPE_NORMAL rope (both stateful and stateless) we emit the Flux-style
// interleaved pattern below so the GPU plugin's RoPEFusionFlux matcher folds it
// into ov::op::internal::RoPE. The matcher requires rank-4 inputs, which is why
// the original even/odd Slice translation (kept in the `else if (mode ==
// TYPE_NORMAL)` branch below for reference) does not get fused.
//
// Once the GPU plugin's RoPE fusion is extended to also recognize the original
// even/odd Slice form, this Flux rewrite should be removed and both modes should
// be restored to the captured even/odd translation. Until then, keep both paths:
// the active Flux rewrite here and the previous translation preserved below.
if (mode == TYPE_NORMAL) {
auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]});
Output<Node> even_slice;
Output<Node> odd_slice;
int32_t unsqueeze_dim = context.is_stateful() ? 3 : 4;
even_slice = std::make_shared<ov::op::v8::Slice>(data_node, zero, end, two, neg_one);
odd_slice = std::make_shared<ov::op::v8::Slice>(data_node, one, end, two, neg_one);
// Emit the Flux-style interleaved-RoPE pattern so the GPU plugin's
// RoPEFusionFlux matcher folds this subgraph into ov::op::internal::RoPE:
// x_paired = Reshape(x, [1, S, n_heads, head_size/2, 2])
// x0, x1 = Split(x_paired, axis=-1, num_splits=2)
// x1_neg = x1 * -1
// x_rotated = Reshape(Concat([x1_neg, x0], axis=-1), [1, S, n_heads, head_size])
// y = x * t_cos + x_rotated * t_sin
// Mathematically equivalent to the even/odd Slice form below.
//
// RoPEFusionFlux requires rank_equals(4) on x, t_cos and t_sin. The cos/sin
// tables are already built rank-4 ([1, S, 1, head_size/2]) for both modes. In
// stateful mode the data arrives rank-3 ([S, n_heads, head_size]), so lift it
// to rank-4 ([1, S, n_heads, head_size]) here. Stateful RoPE already produced
// rank-4 output, so downstream attention is unaffected.
if (context.is_stateful()) {
auto r4_shape = ov::op::v0::Constant::create(
ov::element::i64, {4},
std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
data_node = std::make_shared<ov::op::v1::Reshape>(data_node, r4_shape, false);
}
const int64_t head_size = static_cast<int64_t>(output_shape[3]);
const int64_t n_heads = static_cast<int64_t>(output_shape[2]);
const int64_t half = head_size / 2;
Output<Node> first_half =
std::make_shared<ov::op::v1::Subtract>(std::make_shared<ov::op::v1::Multiply>(even_slice, cos_theta_node),
std::make_shared<ov::op::v1::Multiply>(odd_slice, sin_theta_node));
Output<Node> second_half =
std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(even_slice, sin_theta_node),
std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));
auto neg_one_f = ov::op::v0::Constant::create(data_node->get_element_type(), ov::Shape{}, {-1.0f});
first_half = std::make_shared<ov::op::v0::Unsqueeze>(first_half,
ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim}));
second_half = std::make_shared<ov::op::v0::Unsqueeze>(second_half,
ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim}));
auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, unsqueeze_dim);
auto paired_shape =
ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector<int64_t>{1, -1, n_heads, half, 2});
auto x_paired = std::make_shared<ov::op::v1::Reshape>(data_node, paired_shape, false);
auto data_shape = ov::op::v0::Constant::create(
ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
} else if (mode == TYPE_NEOX) {
auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1});
auto data_split = std::make_shared<ov::op::v1::Split>(x_paired, split_axis, 2);
Output<Node> x0 = data_split->outputs()[0];
Output<Node> x1 = data_split->outputs()[1];
auto x1_neg = std::make_shared<ov::op::v1::Multiply>(x1, neg_one_f);
auto x_rotated_paired = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{x1_neg, x0}, -1);
auto flat_shape =
ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{1, -1, n_heads, head_size});
auto x_rotated = std::make_shared<ov::op::v1::Reshape>(x_rotated_paired, flat_shape, false);
// Expand cos/sin from [..., head_size/2] to [..., head_size] by repeating each
// entry twice. Use special_zero on the final Reshape so the seq dim passes
// through dynamically. Final rank is 4 to satisfy the matcher's predicate.
auto expand_cos_sin = [&](Output<Node> cs) {
auto cs_unsq =
std::make_shared<ov::op::v0::Unsqueeze>(cs, ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}));
auto bcast_target =
ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector<int64_t>{1, 1, 1, half, 2});
auto bcast =
std::make_shared<ov::op::v3::Broadcast>(cs_unsq, bcast_target, ov::op::BroadcastType::BIDIRECTIONAL);
auto flat = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<int64_t>{0, 0, 0, head_size});
return std::make_shared<ov::op::v1::Reshape>(bcast, flat, true);
};
Output<Node> cos_full = expand_cos_sin(cos_theta_node);
Output<Node> sin_full = expand_cos_sin(sin_theta_node);
auto y1 = std::make_shared<ov::op::v1::Multiply>(data_node, cos_full);
auto y2 = std::make_shared<ov::op::v1::Multiply>(x_rotated, sin_full);
res = std::make_shared<ov::op::v1::Add>(y1, y2);
}
// PRESERVED PREVIOUS TRANSLATION - Re-enable this branch (and remove the Flux branch above) once
// the GPU plugin's RoPE fusion is updated to recognize the even/odd Slice form;
// see the TODO(openvino-gpu-rope-fusion) note above. Do not delete.
//
// Original even/odd Slice form. In stateless mode it ran on rank-4 data
// ([1, S, n_heads, head_size]); in stateful mode on rank-3 data
// ([S, n_heads, head_size]). Either way it does not match RoPEFusionFlux
// (which needs rank-4 x in the interleaved layout), so the RoPE stays as
// discrete elementwise ops.
//
// } else if (mode == TYPE_NORMAL) {
// auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
// auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
// auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
// auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
// auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]});
// Output<Node> even_slice;
// Output<Node> odd_slice;
// // stateful data is rank 3 (unsqueeze at axis 3), stateless is rank 4 (axis 4)
// int32_t unsqueeze_dim = context.is_stateful() ? 3 : 4;
// even_slice = std::make_shared<ov::op::v8::Slice>(data_node, zero, end, two, neg_one);
// odd_slice = std::make_shared<ov::op::v8::Slice>(data_node, one, end, two, neg_one);
//
// Output<Node> first_half =
// std::make_shared<ov::op::v1::Subtract>(std::make_shared<ov::op::v1::Multiply>(even_slice, cos_theta_node),
// std::make_shared<ov::op::v1::Multiply>(odd_slice, sin_theta_node));
// Output<Node> second_half =
// std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(even_slice, sin_theta_node),
// std::make_shared<ov::op::v1::Multiply>(odd_slice, cos_theta_node));
//
// first_half = std::make_shared<ov::op::v0::Unsqueeze>(first_half,
// ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim}));
// second_half = std::make_shared<ov::op::v0::Unsqueeze>(second_half,
// ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim}));
// auto stack = std::make_shared<ov::op::v0::Concat>(OutputVector{first_half, second_half}, unsqueeze_dim);
//
// auto data_shape = ov::op::v0::Constant::create(
// ov::element::i64, {4}, std::vector<int64_t>{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]});
// res = std::make_shared<ov::op::v1::Reshape>(stack, data_shape, false);
else if (mode == TYPE_NEOX) {
auto data_split = std::make_shared<ov::op::v1::Split>(
data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2);
Output<Node> slice_data_node_0 = data_split->outputs()[0];
@@ -120,8 +211,9 @@ OutputVector translate_rope(const NodeContext & context) {
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{first_half_node, second_half_node}, -1);
} else if (mode == TYPE_IMROPE) {
int64_t n_dims = data_node->get_shape()[3];
auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1,-1,1,(n_dims >> 1)});
int64_t n_dims = data_node->get_output_partial_shape(0)[3].get_length();
auto cos_sin_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{4},
std::vector<int64_t>{1, -1, 1, (n_dims >> 1)});
auto cos_reshaped = std::make_shared<ov::op::v1::Reshape>(cos_theta_node, cos_sin_shape, true);
auto sin_reshaped = std::make_shared<ov::op::v1::Reshape>(sin_theta_node, cos_sin_shape, true);
@@ -140,6 +232,10 @@ OutputVector translate_rope(const NodeContext & context) {
res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{sub, add}, 3);
}
if (res.get_element_type() != output_type) {
res = std::make_shared<ov::op::v0::Convert>(res, output_type);
}
return rename_outputs_with_suffix({res}, context.get_name());
}
@@ -28,20 +28,20 @@ namespace op {
OutputVector translate_set_rows(const NodeContext & context) {
num_inputs_check(context, 3, 3);
auto data = context.get_input(0);
auto data = process_view_input_new(context, 0);
auto indices = context.get_input(1);
auto dst = context.get_input(2);
data = std::make_shared<ov::op::v0::Convert>(data, context.get_output_type());
auto dst_shape = context.get_output_shape().to_shape();
auto row_size = context.get_input_shape(2)[3].get_length();
auto ind_squeezed =
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2}));
auto data_reshaped = std::make_shared<ov::op::v1::Reshape>(
data,
ov::op::v0::Constant::create(ov::element::i64, {4},
{(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) dst_shape[3]}),
{(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) row_size}),
false);
auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2});
+69 -52
View File
@@ -2,18 +2,16 @@
#include "../op_table.h"
#include "../utils.h"
#include <climits>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/frontend/exception.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/softmax.hpp>
#include <vector>
@@ -22,63 +20,82 @@ namespace frontend {
namespace ggml {
namespace op {
// Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend:
// 1) logits = src0 * scale
// 2) logits += mask (if provided)
// 3) softmax over the last dimension
OutputVector translate_soft_max(const NodeContext & context) {
// TODO code is outdated
num_inputs_check(context, 1, 2);
auto input_node = context.get_input(0).get_node_shared_ptr();
ov::Output<Node> res;
float scale = 1.0f;
float max_bias = 0.0f;
auto * op_params = context.get_output_op_params();
memcpy(&scale, (float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) op_params + 1, sizeof(float));
auto src0_shape = context.get_input_shape(0).get_shape();
const uint32_t h = src0_shape[2];
const uint32_t n_head = src0_shape[0];
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
memcpy(&scale, (float *) context.get_output_op_params() + 0, sizeof(float));
memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float));
const float m0 = powf(2.0f, -(max_bias) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
const float slope =
(max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f;
ov::Output<ov::Node> logits = context.get_input(0);
auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
auto scaled_input = std::make_shared<ov::op::v1::Multiply>(input_node, scale_node);
if (context.get_input_size() < 2) {
res = std::make_shared<ov::op::v8::Softmax>(scaled_input, 2);
return rename_outputs_with_suffix({res}, context.get_name());
// Apply scale first: logits = src0 * scale
if (scale != 1.0f) {
auto scale_const =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{scale});
logits = std::make_shared<ov::op::v1::Multiply>(logits, scale_const);
}
ov::Output<ov::Node> mask_node_sliced;
if (context.has_input("KQ_mask_sliced")) {
mask_node_sliced = context.get_input("KQ_mask_sliced");
} else {
auto token_len = get_dimensions(input_node, {1});
auto mask_node = context.get_input(1);
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
mask_node_sliced = std::make_shared<ov::op::v8::Slice>(mask_node, zero, token_len, one, one);
FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && context.get_input_size() < 2),
"OpenVINO softmax ALiBi path requires mask input");
// Optional mask add: logits += mask
// For max_bias > 0 (ALiBi), apply per-head slope to mask before adding.
if (context.get_input_size() > 1) {
ov::Output<ov::Node> mask = context.get_input(1);
// For stateful
std::string mask_name = "KQ_mask_sliced";
if (context.get_input_names()[1].find("swa") != std::string::npos) {
mask_name = "KQ_mask_swa_sliced";
}
if (context.has_input(mask_name)) {
mask = context.get_input(mask_name);
}
if (mask.get_element_type() != logits.get_element_type()) {
mask = std::make_shared<ov::op::v0::Convert>(mask, logits.get_element_type());
}
if (max_bias > 0.0f) {
auto out_shape = context.get_output_shape().to_shape();
FRONT_END_CHECK_IMPLEMENTED(out_shape.size() == 4, "OpenVINO softmax ALiBi path expects rank-4 tensor");
const uint32_t n_head = static_cast<uint32_t>(out_shape[1]);
FRONT_END_CHECK_IMPLEMENTED(n_head > 0, "OpenVINO softmax ALiBi path expects n_head > 0");
const uint32_t n_head_log2 = 1u << static_cast<uint32_t>(std::floor(std::log2(static_cast<float>(n_head))));
const float m0 = std::pow(2.0f, -(max_bias) / static_cast<float>(n_head_log2));
const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / static_cast<float>(n_head_log2));
std::vector<float> slopes(n_head);
for (uint32_t h = 0; h < n_head; ++h) {
slopes[h] = h < n_head_log2 ? std::pow(m0, static_cast<float>(h + 1)) :
std::pow(m1, static_cast<float>(2 * (h - n_head_log2) + 1));
}
ov::Output<ov::Node> slope_node =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{n_head}, slopes);
if (slope_node.get_element_type() != mask.get_element_type()) {
slope_node = std::make_shared<ov::op::v0::Convert>(slope_node, mask.get_element_type());
}
auto slope_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{4}, std::vector<int64_t>{1, static_cast<int64_t>(n_head), 1, 1});
auto slope_4d = std::make_shared<ov::op::v1::Reshape>(slope_node, slope_shape, false);
mask = std::make_shared<ov::op::v1::Multiply>(mask, slope_4d);
}
logits = std::make_shared<ov::op::v1::Add>(logits, mask);
}
if (mask_node_sliced.get_element_type() != context.get_output_type()) {
mask_node_sliced = std::make_shared<ov::op::v0::Convert>(mask_node_sliced, context.get_output_type());
}
Output<Node> slope_mask;
if (slope != 1.0f) {
auto slope_node =
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{}, std::vector<float>{slope});
slope_mask = std::make_shared<ov::op::v1::Multiply>(mask_node_sliced, slope_node);
throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use.");
}
slope_mask = mask_node_sliced;
auto input_slope_mask_node = std::make_shared<ov::op::v1::Add>(scaled_input, slope_mask);
res = std::make_shared<ov::op::v8::Softmax>(input_slope_mask_node, 2);
// Softmax along last dimension (equivalent to ggml softmax over ne[0]).
auto res = std::make_shared<ov::op::v8::Softmax>(logits, -1);
return rename_outputs_with_suffix({res}, context.get_name());
}
@@ -0,0 +1,59 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <openvino/op/constant.hpp>
#include <openvino/op/group_conv.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/transpose.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML_OP_SSM_CONV into a depthwise 1D convolution.
//
// Inputs (OV layout):
//   0: conv state + input, [1, n_s, d_inner, ncs]
//   1: conv1d weight,      [1, 1, d_inner, d_conv]
// Returns a single output of shape [1, n_s, n_t, d_inner], where n_t = ncs - d_conv + 1.
OutputVector translate_ssm_conv(const NodeContext & context) {
    num_inputs_check(context, 2, 2);

    auto state  = context.get_input(0);
    auto kernel = context.get_input(1);

    const auto state_dims  = context.get_input_shape(0).to_shape();
    const auto kernel_dims = context.get_input_shape(1).to_shape();

    const int64_t n_s     = state_dims[1];
    const int64_t d_inner = state_dims[2];
    const int64_t ncs     = state_dims[3];  // d_conv - 1 + n_t
    const int64_t d_conv  = kernel_dims[3];
    const int64_t n_t     = ncs - d_conv + 1;

    // Builds a rank-1 i64 constant holding the given values.
    const auto i64_vec = [](const std::vector<int64_t> & values) {
        return ov::op::v0::Constant::create(ov::element::i64, ov::Shape{values.size()}, values);
    };

    // Drop the leading unit axis so the data is [batch, channels, length] for the 1D convolution.
    auto state_3d = std::make_shared<ov::op::v1::Reshape>(state, i64_vec({n_s, d_inner, ncs}), false);

    // GroupConvolution filter layout: [groups, out_channels/groups, in_channels/groups, kernel_size].
    // One group per channel makes the convolution depthwise.
    auto kernel_grouped = std::make_shared<ov::op::v1::Reshape>(kernel, i64_vec({d_inner, 1, 1, d_conv}), false);

    // Stride 1, no padding, no dilation: [n_s, d_inner, ncs] -> [n_s, d_inner, n_t].
    const ov::Strides        unit_step{1};
    const ov::CoordinateDiff no_pad{0};
    auto conv = std::make_shared<ov::op::v1::GroupConvolution>(state_3d, kernel_grouped, unit_step, no_pad, no_pad,
                                                               unit_step);

    // Swap the channel and time axes: [n_s, d_inner, n_t] -> [n_s, n_t, d_inner].
    auto time_major = std::make_shared<ov::op::v1::Transpose>(conv, i64_vec({0, 2, 1}));

    // Restore the leading unit axis expected by the output layout.
    auto res = std::make_shared<ov::op::v1::Reshape>(time_major, i64_vec({1, n_s, n_t, d_inner}), false);

    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -0,0 +1,27 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <memory>
#include <openvino/op/constant.hpp>
#include <openvino/op/reduce_sum.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML_OP_SUM_ROWS into a ReduceSum over the last OV axis.
// The reduced axis is kept (keep_dims = true), so the output rank equals the input rank.
OutputVector translate_sum_rows(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto data      = process_view_input_new(context, 0);
    auto last_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1});

    const bool keep_dims = true;
    auto       res       = std::make_shared<ov::op::v1::ReduceSum>(data, last_axis, keep_dims);

    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -12,8 +12,39 @@ namespace op {
// Translates GGML_OP_TRANSPOSE into an OpenVINO Transpose.
// NOTE(review): this span reads like a rendered diff hunk without +/- markers. The Transpose
// arguments appear twice near the end (a hard-coded {0, 1, 3, 2} order and the computed
// permute_order); presumably the hard-coded line is the removed one. Confirm against the real file.
OutputVector translate_transpose(const NodeContext & context) {
num_inputs_check(context, 1, 1);
// Compute permute order from input/output shape and stride information
// so it adapts to different input and output layouts.
// NOTE(review): input_shape and output_shape are not read anywhere in the lines visible here.
auto input_shape = context.get_input_shape(0).to_shape();
auto input_stride = context.get_input_stride(0);
auto output_shape = context.get_output_shape().to_shape();
auto output_stride = context.get_output_stride();
// Compute permute order by matching output and input stride rankings.
// Build <stride, dim_index> pairs.
std::vector<std::pair<size_t, int>> output_stride_dims;
std::vector<std::pair<size_t, int>> input_stride_dims;
// The loops below index exactly four entries of each stride container.
for (int i = 0; i < 4; ++i) {
output_stride_dims.push_back({output_stride[i], i});
input_stride_dims.push_back({input_stride[i], i});
}
// Sort by stride in descending order.
// Sorting through reverse iterators yields descending order; pairs with equal strides are
// ordered by dim index (also descending).
std::sort(output_stride_dims.rbegin(), output_stride_dims.rend());
std::sort(input_stride_dims.rbegin(), input_stride_dims.rend());
// Build permute order.
// The output dim with the i-th largest stride takes the input dim with the i-th largest stride.
std::vector<int64_t> permute_order(4);
for (int i = 0; i < 4; ++i) {
int output_dim = output_stride_dims[i].second;
int input_dim = input_stride_dims[i].second;
permute_order[output_dim] = input_dim;
}
auto input = process_view_input_new(context, 0);
auto res = std::make_shared<ov::op::v1::Transpose>(
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 1, 3, 2}));
input, ov::op::v0::Constant::create(ov::element::i64, {4}, permute_order));
return rename_outputs_with_suffix({res}, context.get_name());
}
@@ -1,25 +0,0 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <openvino/core/node_output.hpp>
#include <openvino/op/gelu.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML_UNARY_OP_GELU by wrapping the single input in an opset-7 Gelu node.
OutputVector translate_unary_gelu(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto gelu = std::make_shared<ov::op::v7::Gelu>(context.get_input(0));
    return rename_outputs_with_suffix({gelu}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
@@ -14,7 +14,7 @@ namespace op {
OutputVector translate_unary_silu(const NodeContext & context) {
num_inputs_check(context, 1, 1);
auto input = context.get_input(0);
auto input = process_view_input_new(context, 0);
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(input);
auto res = std::make_shared<ov::op::v1::Multiply>(input, sigmoid);
@@ -0,0 +1,38 @@
#include "../node_context.h"
#include "../op_table.h"
#include "../utils.h"
#include <openvino/op/abs.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/exp.hpp>
#include <openvino/op/log.hpp>
#include <openvino/op/negative.hpp>
#include <openvino/op/relu.hpp>
namespace ov {
namespace frontend {
namespace ggml {
namespace op {
// Translates GGML_UNARY_OP_SOFTPLUS as relu(x) + log(1 + exp(-|x|)).
// The argument passed to Exp is -|x|, so it is never positive.
OutputVector translate_unary_softplus(const NodeContext & context) {
    num_inputs_check(context, 1, 1);

    auto x    = process_view_input_new(context, 0);
    auto unit = ov::op::v0::Constant::create(x.get_element_type(), ov::Shape{}, {1.0f});

    // relu(x)
    auto relu_x = std::make_shared<ov::op::v0::Relu>(x);

    // exp(-|x|), built step by step
    auto magnitude = std::make_shared<ov::op::v0::Abs>(x);
    auto negated   = std::make_shared<ov::op::v0::Negative>(magnitude);
    auto decay     = std::make_shared<ov::op::v0::Exp>(negated);

    // log(1 + exp(-|x|))
    auto one_plus_decay = std::make_shared<ov::op::v1::Add>(unit, decay);
    auto log_part       = std::make_shared<ov::op::v0::Log>(one_plus_decay);

    auto res = std::make_shared<ov::op::v1::Add>(relu_x, log_part);
    return rename_outputs_with_suffix({res}, context.get_name());
}
} // namespace op
} // namespace ggml
} // namespace frontend
} // namespace ov
+91 -26
View File
@@ -1,6 +1,11 @@
#include "../op_table.h"
#include "../utils.h"
#include <openvino/op/constant.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/slice.hpp>
#include <set>
namespace ov {
namespace frontend {
namespace ggml {
@@ -9,42 +14,102 @@ namespace op {
// Translates GGML_OP_VIEW. In the lines visible here the view is either passed through
// unchanged or turned into a Slice along a single axis.
// NOTE(review): this span reads like a rendered diff hunk without +/- markers, so statements of
// the previous implementation (op_case 2 / op_case 3 handling) and of the new implementation
// (stride/offset based slicing) are interleaved. Some names (input, dst_shape, slice_dim) are
// therefore declared more than once. Confirm which lines belong to the current file before editing.
OutputVector translate_view(const NodeContext & context) {
num_inputs_check(context, 1, 1);
if (context.get_op_case() == 2) {
auto dst_shape = context.get_output_shape().to_shape();
return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[2] * dst_shape[3])},
context.get_name());
// When the context is not static, the input is returned unchanged.
if (!context.is_static()) {
return {context.get_input(0)};
}
// op_case 3
if (context.get_op_case() == 3) {
auto input = context.get_input(0);
auto input_ov_shape = input.get_partial_shape();
auto input_llama_shape = context.get_input_shape(0).to_shape();
auto input = context.get_input(0);
auto src_shape = context.get_input_shape(0);
auto dst_shape = context.get_output_shape();
// if the input ov shape size is different from the input llama shape size, it means the input is already reshaped and we need to reshape it back to the original shape before slicing
if (input_ov_shape.size() != input_llama_shape.size()) {
input = std::make_shared<ov::op::v1::Reshape>(input, ov::op::v0::Constant::create(ov::element::i64, {input_llama_shape.size()}, input_llama_shape), false);
// Any dynamic rank or dynamic dimension makes the function return the input unchanged.
if (src_shape.rank().is_dynamic() || dst_shape.rank().is_dynamic()) {
return {input};
}
// Total element counts of the source and destination shapes.
int64_t src_elems = 1, dst_elems = 1;
for (int64_t i = 0; i < src_shape.rank().get_length(); ++i) {
if (src_shape[i].is_dynamic()) {
return {input};
}
src_elems *= src_shape[i].get_length();
}
for (int64_t i = 0; i < dst_shape.rank().get_length(); ++i) {
if (dst_shape[i].is_dynamic()) {
return {input};
}
dst_elems *= dst_shape[i].get_length();
}
auto dst_shape = context.get_output_shape().to_shape();
// Only a view with fewer elements than its source is sliced; otherwise pass the input through.
if (dst_elems >= src_elems) {
return {input};
}
// find the index of dst_shape that is different from input shape, and use that index to slice the input
int slice_dim = -1;
for (size_t i = 0; i < dst_shape.size(); ++i) {
if (dst_shape[i] != input_llama_shape[i]) {
slice_dim = i;
auto src_stride = context.get_input_stride(0);
auto dst_stride = context.get_output_stride();
size_t view_offset = context.get_output_op_offset();
// The slice path requires the source and destination strides to be identical element by element.
bool same_stride = (src_stride.size() == dst_stride.size());
if (same_stride) {
for (size_t i = 0; i < src_stride.size(); ++i) {
if (src_stride[i] != dst_stride[i]) {
same_stride = false;
break;
}
}
auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {dst_shape[slice_dim]});
auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim});
auto sliced = std::make_shared<ov::op::v8::Slice>(input, begin, end, stride, axes);
return {sliced};
}
return {context.get_input(0)};
if (!same_stride) {
return {input};
}
auto src_ov_shape = src_shape.to_shape();
auto dst_ov_shape = dst_shape.to_shape();
size_t ndims = src_ov_shape.size();
if (dst_ov_shape.size() != ndims) {
return {input};
}
// Collect the axes whose extent differs; exactly one differing axis is required to slice.
std::vector<int> diff_dims;
for (size_t i = 0; i < ndims; ++i) {
if (src_ov_shape[i] != dst_ov_shape[i]) {
diff_dims.push_back(static_cast<int>(i));
}
}
if (diff_dims.size() != 1) {
return {input};
}
int slice_dim = diff_dims[0];
int64_t dim_size = static_cast<int64_t>(src_ov_shape[slice_dim]);
// Product of the source extents after slice_dim, i.e. the element stride of that axis in a dense layout.
size_t ov_stride_for_dim = 1;
for (size_t i = slice_dim + 1; i < ndims; ++i) {
ov_stride_for_dim *= src_ov_shape[i];
}
// NOTE(review): presumably src_stride.back() is the per-element byte size and view_offset is a
// byte offset — confirm against NodeContext.
size_t elem_size = src_stride.back();
if (elem_size == 0) {
elem_size = 1;
}
// Start index along slice_dim derived from the view offset.
int64_t begin_val = 0;
if (ov_stride_for_dim > 0 && elem_size > 0) {
begin_val = static_cast<int64_t>((view_offset / elem_size) / ov_stride_for_dim);
}
int64_t end_val = begin_val + static_cast<int64_t>(dst_ov_shape[slice_dim]);
// A range that would leave the source extent falls back to returning the input unchanged.
if (begin_val < 0 || end_val > dim_size) {
return {input};
}
// Slice [begin_val, end_val) with step 1 along slice_dim.
auto sliced =
std::make_shared<ov::op::v8::Slice>(input, ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}),
ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}),
ov::op::v0::Constant::create(ov::element::i64, {1}, {1}),
ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim}));
sliced->set_friendly_name(context.get_output_name());
return {sliced->output(0)};
}
} // namespace op
+40 -23
View File
@@ -5,9 +5,11 @@
#include <openvino/op/add.hpp>
#include <openvino/op/divide.hpp>
#include <openvino/op/gather.hpp>
#include <openvino/op/gelu.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/tanh.hpp>
namespace ov {
namespace frontend {
@@ -16,29 +18,44 @@ namespace ggml {
// Returns the table that maps GGML op names (e.g. "GGML_OP_ADD") to the translator function
// used for that op.
// NOTE(review): this span reads like a rendered diff hunk without +/- markers. The table rows
// appear twice: first what looks like the previous, shorter list, then the extended list.
// Confirm against the real file before editing.
std::unordered_map<std::string, CreatorFunction> get_supported_ops() {
using namespace ov::op;
return {
{"GGML_OP_ADD", op::translate_1to1_match_2_inputs<v1::Add> },
{"GGML_OP_ADD1", op::translate_1to1_match_2_inputs<v1::Add> },
{"GGML_OP_CONT", op::translate_cont },
{"GGML_OP_DIV", op::translate_1to1_match_2_inputs<v1::Divide> },
{"GGML_OP_GET_ROWS", op::translate_get_rows },
{"GGML_OP_MUL", op::translate_1to1_match_2_inputs<v1::Multiply>},
{"GGML_OP_MUL_MAT", op::translate_mulmat },
{"GGML_OP_PERMUTE", op::translate_permute },
{"GGML_OP_RESHAPE", op::translate_reshape },
{"GGML_OP_RMS_NORM", op::translate_rms_norm },
{"GGML_OP_ROPE", op::translate_rope },
{"GGML_OP_SCALE", op::translate_scale },
{"GGML_OP_SOFT_MAX", op::translate_soft_max },
{"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
{"GGML_OP_TRANSPOSE", op::translate_transpose },
{"GGML_UNARY_OP_GELU", op::translate_unary_gelu },
{"GGML_UNARY_OP_SILU", op::translate_unary_silu },
{"GGML_OP_VIEW", op::translate_view },
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
{"GGML_GLU_OP_GEGLU", op::translate_glu_geglu },
{"GGML_OP_SET_ROWS", op::translate_set_rows },
{"GGML_OP_CPY", op::translate_cpy },
{"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext },
// Second copy of the table: DIV and GELU map to different translators here, and more ops are listed.
{"GGML_OP_ADD", op::translate_1to1_match_2_inputs<v1::Add> },
{"GGML_OP_ADD1", op::translate_1to1_match_2_inputs<v1::Add> },
{"GGML_OP_ADD_ID", op::translate_add_id },
{"GGML_OP_CONCAT", op::translate_concat },
{"GGML_OP_CONT", op::translate_cont },
{"GGML_OP_DIV", op::translate_div },
{"GGML_OP_GET_ROWS", op::translate_get_rows },
{"GGML_OP_IM2COL", op::translate_im2col },
{"GGML_OP_MUL", op::translate_1to1_match_2_inputs<v1::Multiply>},
{"GGML_OP_MUL_MAT", op::translate_mulmat },
{"GGML_OP_MUL_MAT_ID", op::translate_mul_mat_id },
{"GGML_OP_PERMUTE", op::translate_permute },
{"GGML_OP_RESHAPE", op::translate_reshape },
{"GGML_OP_RMS_NORM", op::translate_rms_norm },
{"GGML_OP_NORM", op::translate_norm },
{"GGML_OP_L2_NORM", op::translate_l2_norm },
{"GGML_OP_SUM_ROWS", op::translate_sum_rows },
{"GGML_OP_ROPE", op::translate_rope },
{"GGML_OP_SCALE", op::translate_scale },
{"GGML_OP_SOFT_MAX", op::translate_soft_max },
{"GGML_OP_ARGSORT", op::translate_argsort },
{"GGML_OP_SUB", op::translate_1to1_match_2_inputs<v1::Subtract>},
{"GGML_OP_TRANSPOSE", op::translate_transpose },
{"GGML_UNARY_OP_GELU", op::translate_1to1_match_1_input<v7::Gelu> },
{"GGML_UNARY_OP_SILU", op::translate_unary_silu },
{"GGML_UNARY_OP_SOFTPLUS", op::translate_unary_softplus },
{"GGML_UNARY_OP_TANH", op::translate_1to1_match_1_input<v0::Tanh> },
{"GGML_OP_VIEW", op::translate_view },
{"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu },
{"GGML_GLU_OP_GEGLU", op::translate_glu_geglu },
{"GGML_OP_SET_ROWS", op::translate_set_rows },
{"GGML_OP_CPY", op::translate_cpy },
{"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext },
{"GGML_OP_CLAMP", op::translate_clamp },
{"GGML_OP_PAD", op::translate_pad },
{"GGML_OP_SSM_CONV", op::translate_ssm_conv },
{"GGML_OP_GATED_DELTA_NET", op::translate_gated_delta_net },
{"GGML_OP_REPEAT", op::translate_repeat },
};
}
+17 -5
View File
@@ -8,20 +8,26 @@ namespace ggml {
namespace op {
#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context)
#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext & context)
GGML_OP_CONVERTER(translate_add);
GGML_OP_CONVERTER(translate_cont);
GGML_OP_CONVERTER(translate_concat);
GGML_OP_CONVERTER(translate_add_id);
GGML_OP_CONVERTER(translate_div);
GGML_OP_CONVERTER(translate_get_rows);
GGML_OP_CONVERTER(translate_mul);
GGML_OP_CONVERTER(translate_im2col);
GGML_OP_CONVERTER(translate_mulmat);
GGML_OP_CONVERTER(translate_mul_mat_id);
GGML_OP_CONVERTER(translate_permute);
GGML_OP_CONVERTER(translate_reshape);
GGML_OP_CONVERTER(translate_rms_norm);
GGML_OP_CONVERTER(translate_norm);
GGML_OP_CONVERTER(translate_l2_norm);
GGML_OP_CONVERTER(translate_sum_rows);
GGML_OP_CONVERTER(translate_rope);
GGML_OP_CONVERTER(translate_scale);
GGML_OP_CONVERTER(translate_unary_silu);
GGML_OP_CONVERTER(translate_unary_gelu);
GGML_OP_CONVERTER(translate_unary_softplus);
GGML_OP_CONVERTER(translate_soft_max);
GGML_OP_CONVERTER(translate_transpose);
GGML_OP_CONVERTER(translate_view);
@@ -29,9 +35,15 @@ GGML_OP_CONVERTER(translate_glu_swiglu);
GGML_OP_CONVERTER(translate_glu_geglu);
GGML_OP_CONVERTER(translate_set_rows);
GGML_OP_CONVERTER(translate_cpy);
GGML_OP_CONVERTER(translate_argsort);
GGML_OP_CONVERTER(translate_flash_attn_ext);
GGML_OP_CONVERTER(translate_clamp);
GGML_OP_CONVERTER(translate_pad);
GGML_OP_CONVERTER(translate_ssm_conv);
GGML_OP_CONVERTER(translate_gated_delta_net);
GGML_OP_CONVERTER(translate_repeat);
} // namespace op
} // namespace op
std::unordered_map<std::string, CreatorFunction> get_supported_ops();
@@ -1,8 +1,8 @@
#pragma once
#include "mark_decompression_convert_constant_folding.h"
#include "openvino/pass/matcher_pass.hpp"
#include "openvino/core/visibility.hpp"
#include "openvino/pass/matcher_pass.hpp"
#ifdef OPENVINO_STATIC_LIBRARY
# define TRANSFORMATIONS_API
@@ -13,6 +13,7 @@
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>
#include <openvino/core/type/element_type.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
@@ -77,49 +78,48 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
return pairs;
}
// Registers sliced attention masks in tensor_map under "KQ_mask_sliced" and "KQ_mask_swa_sliced",
// built from the "self_kq_mask" / "self_kq_mask_swa" entries when those exist.
// NOTE(review): this span reads like a rendered diff hunk without +/- markers. Two function
// signatures (add_sliced_mask and add_sliced_mask_stateful), two lambda headers and two pairs of
// create_sliced_mask calls appear together; presumably the first of each pair is the removed
// version. Confirm against the real file before editing.
void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) {
void add_sliced_mask_stateful(TensorMap & tensor_map) {
auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name) {
// Nothing is added unless both the mask and "token_len_per_seq" are present in tensor_map.
if ((tensor_map.find(mask_name) != tensor_map.end()) &&
(tensor_map.find("token_len_per_seq") != tensor_map.end())) {
auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr();
auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
std::shared_ptr<ov::Node> mask_sliced;
if (is_static) {
mask_sliced = mask;
} else if (ggml_model_decoder.is_stateful()) {
auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0});
auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1});
auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1});
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
auto gather_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one_1d, three_1d);
auto reshaped_inp_pos = std::make_shared<ov::op::v1::Reshape>(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
auto inp_pos_incremented = std::make_shared<ov::op::v1::Add>(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1}));
auto stop = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{token_len_per_seq, std::make_shared<v1::ConvertLike>(inp_pos_incremented, token_len_per_seq)}, 0);
mask_sliced =
std::make_shared<ov::op::v8::Slice>(mask, zero_2d, stop, one_2d, axes);
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
mask_sliced->set_friendly_name(sliced_name);
} else {
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len_per_seq, one, two);
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
mask_sliced->set_friendly_name(sliced_name);
}
std::shared_ptr<ov::Node> mask_sliced = mask;
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3});
auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1});
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
// Gather index -1 along axis 3 of inp_pos, i.e. its last entry on that axis, then reshape to one element.
auto last_inp_pos = std::make_shared<ov::op::v8::Gather>(inp_pos, neg_one, three);
auto last_inp_pos_1d = std::make_shared<ov::op::v1::Reshape>(
last_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false);
auto last_inp_pos_cvt = std::make_shared<ov::op::v0::Convert>(last_inp_pos_1d, ov::element::i64);
// Slice stop = last position + 1, applied to the last mask axis starting at 0 with step 1.
auto last_inp_pos_inc = std::make_shared<ov::op::v1::Add>(last_inp_pos_cvt, one);
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, last_inp_pos_inc, step, axes);
// The sliced mask is converted to f16 and named after its tensor_map key.
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
mask_sliced->set_friendly_name(sliced_name);
tensor_map.insert({sliced_name, mask_sliced->output(0)});
}
};
create_sliced_mask("self_kq_mask", "KQ_mask_sliced", ggml_model_decoder.is_static());
create_sliced_mask("self_kq_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static());
create_sliced_mask("self_kq_mask", "KQ_mask_sliced");
create_sliced_mask("self_kq_mask_swa", "KQ_mask_swa_sliced");
}
void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
// When ROPE ops in the graph have divergent op_params (e.g. gemma4's mixed
// SWA/non-SWA layers with different n_dims or freq_base), a shared sin/cos
// precompute cannot broadcast across every ROPE use. Skip it here and let
// translate_rope() build sin/cos per-op from its own op_params.
if (ggml_model_decoder.has_mixed_rope_params()) {
return;
}
int32_t * rope_params = ggml_model_decoder.get_rope_params();
if (tensor_map.find("inp_pos") == tensor_map.end() || rope_params == nullptr) {
return;
@@ -142,8 +142,11 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder)
// Create common patterns
// Adds shared, precomputed tensors to tensor_map.
// NOTE(review): this span reads like a rendered diff hunk without +/- markers; the first two
// calls below are presumably the removed lines and the guarded call the replacement. Confirm
// against the real file.
void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) {
add_sliced_mask(tensor_map, ggml_model_decoder);
add_rope_sin_cos(tensor_map, ggml_model_decoder);
// Sliced masks are only registered when the decoder reports itself as stateful.
if (ggml_model_decoder.is_stateful()) {
add_sliced_mask_stateful(tensor_map);
}
// This optimization is error-prone
// add_rope_sin_cos(tensor_map, ggml_model_decoder);
}
} // namespace
@@ -288,19 +291,19 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
if (ggml_model_decoder->is_stateful()) {
auto output_names = ggml_model_decoder->get_model_output_names();
std::map<std::string, int> model_output_indexes;
for (size_t i=0; i<output_names.size(); i++) {
for (size_t i = 0; i < output_names.size(); i++) {
model_output_indexes.insert(std::make_pair(output_names[i], i));
}
ov::preprocess::PrePostProcessor ppp(model);
for (size_t i=0; i<model->get_output_size(); i++) {
for (size_t i = 0; i < model->get_output_size(); i++) {
auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name();
auto output_id = model_output_indexes[output_friendly_name];
auto model_output_shape = model->output(i).get_partial_shape();
auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id);
if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static()
&& model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length()
&& decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) {
ppp.output(i).postprocess().custom([](const ov::Output<ov::Node>& node) {
if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static() &&
model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length() &&
decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) {
ppp.output(i).postprocess().custom([](const ov::Output<ov::Node> & node) {
auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0});
return std::make_shared<ov::op::v0::Unsqueeze>(node, axes);
});
@@ -9,16 +9,17 @@ namespace ggml {
// Holds an input model and a translator map, and exposes conversion to an ov::Model.
// NOTE(review): this span reads like a rendered diff hunk without +/- markers; the constructor,
// translate_graph and m_translator_map each appear twice, differing only in whitespace around
// '&'. Confirm against the real file before editing.
class TranslateSession {
public:
TranslateSession(const frontend::InputModel::Ptr& input_model,
const std::unordered_map<std::string, CreatorFunction>& translator_map, bool naive = false);
TranslateSession(const frontend::InputModel::Ptr & input_model,
const std::unordered_map<std::string, CreatorFunction> & translator_map,
bool naive = false);
std::shared_ptr<Model> get_converted_model();
std::shared_ptr<Model> translate_graph(const frontend::InputModel::Ptr& input_model);
std::shared_ptr<Model> translate_graph(const frontend::InputModel::Ptr & input_model);
private:
std::shared_ptr<Model> apply_transformations(std::shared_ptr<Model> model);
const frontend::InputModel::Ptr m_input_model;
// Stored as a reference, so the map passed to the constructor must outlive this object.
const std::unordered_map<std::string, CreatorFunction>& m_translator_map;
const std::unordered_map<std::string, CreatorFunction> & m_translator_map;
std::shared_ptr<Model> m_ov_model;
bool m_naive;
};
+548 -3
View File
@@ -17,6 +17,7 @@
#include <openvino/op/reshape.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/sin.hpp>
#include <openvino/op/split.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/op/transpose.hpp>
@@ -123,7 +124,8 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
bool imrope,
bool stateful) {
if (stateful) {
inp_pos = std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
inp_pos =
std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
inp_pos = std::make_shared<ov::op::v0::Convert>(inp_pos, ov::element::f32);
auto pos_perm =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{2, 1, 0});
@@ -212,8 +214,9 @@ std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params
}
auto one_minus_ramp = std::make_shared<ov::op::v1::Subtract>(one, ramp_mix);
theta = std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
theta =
std::make_shared<ov::op::v1::Add>(std::make_shared<ov::op::v1::Multiply>(theta_interp, one_minus_ramp),
std::make_shared<ov::op::v1::Multiply>(theta_extrap, ramp_mix));
mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale));
}
}
@@ -252,6 +255,548 @@ ov::Output<ov::Node> process_view_input(const NodeContext & context, int input_i
return sliced;
}
ov::Output<ov::Node> process_view_input_new(const NodeContext & context, int input_index) {
auto input = context.get_input(input_index);
// Check if this input has view inputs
size_t view_input_size = context.get_view_input_size(input_index);
if (view_input_size == 0) {
// No view inputs, return the input as is
return input;
}
// If translate_view already resolved this VIEW (produced a Slice), the input
// will already have the expected shape — skip re-slicing.
auto expected_ov_shape = context.get_view_input_ov_shape(input_index, 0);
auto actual_shape = input.get_partial_shape();
if (expected_ov_shape.rank().is_static() && actual_shape.rank().is_static() &&
expected_ov_shape.rank() == actual_shape.rank()) {
bool shapes_match = true;
for (int64_t i = 0; i < expected_ov_shape.rank().get_length(); ++i) {
if (!expected_ov_shape[i].is_static() || !actual_shape[i].is_static()) {
shapes_match = false;
break;
}
if (expected_ov_shape[i] != actual_shape[i]) {
shapes_match = false;
break;
}
}
if (shapes_match) {
return input;
}
}
// In static mode, use Split instead of Slice for single-dimension reductions.
// This ensures NPUW's FOLD doesn't parametrize per-layer slice indices (which
// would introduce dynamic shapes). A shared Split node sits outside the repeated
// subgraph boundary; each layer receives one of its output ports.
if (context.is_static() && view_input_size == 1) {
auto view_stride_v = context.get_view_input_stride(input_index, 0);
auto view_src_stride_v = context.get_view_input_src_stride(input_index, 0);
auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, 0);
auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, 0);
auto view_offset = context.get_view_input_offset(input_index, 0);
auto view_src_offset = context.get_view_input_src_offset(input_index, 0);
size_t ndims = view_ggml_shape.size();
std::vector<int> diff_dims;
if (view_src_ggml_shape.size() == ndims) {
for (size_t i = 0; i < ndims; ++i) {
if (view_ggml_shape[i] != view_src_ggml_shape[i]) {
diff_dims.push_back(static_cast<int>(i));
}
}
}
if (diff_dims.size() == 1) {
int split_dim = diff_dims[0];
int64_t num_splits = static_cast<int64_t>(view_src_ggml_shape[split_dim]);
int64_t chunk_size = static_cast<int64_t>(view_ggml_shape[split_dim]);
// Only apply when slicing exactly 1 element from a multi-element dimension
if (chunk_size == 1 && num_splits > 1) {
// Check suffix strides match (dimensions after split_dim)
bool suffix_ok = view_stride_v.size() == view_src_stride_v.size();
if (suffix_ok) {
for (size_t i = static_cast<size_t>(split_dim) + 1; i < ndims; ++i) {
if (view_stride_v[i] != view_src_stride_v[i]) {
suffix_ok = false;
break;
}
}
}
if (suffix_ok && view_src_stride_v[split_dim] > 0) {
size_t relative_offset = view_offset >= view_src_offset ? view_offset - view_src_offset : 0;
int64_t split_index = static_cast<int64_t>(relative_offset / view_src_stride_v[split_dim]);
if (split_index >= 0 && split_index < num_splits) {
auto src_node = input.get_node_shared_ptr();
std::string rt_key = "split_dim_" + std::to_string(split_dim);
auto & rt_info = src_node->get_rt_info();
if (rt_info.find(rt_key) == rt_info.end()) {
auto axis_const =
ov::op::v0::Constant::create(ov::element::i64, {}, {static_cast<int64_t>(split_dim)});
auto split_node =
std::make_shared<ov::op::v1::Split>(input, axis_const, static_cast<size_t>(num_splits));
split_node->set_friendly_name(src_node->get_friendly_name() + "_split");
rt_info[rt_key] = split_node;
}
auto split_node = rt_info[rt_key].as<std::shared_ptr<ov::op::v1::Split>>();
return split_node->output(static_cast<size_t>(split_index));
}
}
}
}
}
// Rebuilds ONE ggml view step as OpenVINO ops on top of `current`, the value that already
// represents the view's source tensor, and returns the value representing the view.
//
// Parameters:
//   current                     value built so far for the view source
//   view_offset / view_stride   offset and per-dimension strides of the view
//   view_ggml_shape             concrete shape of the view
//   view_ov_shape               (possibly dynamic) OpenVINO shape of the view
//   view_name                   friendly name given to the last node created for the view
//   view_src_*                  the same quantities for the view's source
//                               (view_src_ov_shape and view_src_name are currently unused)
//
// NOTE(review): offsets and strides look like byte counts laid out outermost-first (the last
// stride is used as the element stride) -- confirm against the NodeContext getters.
//
// Strategies, tried in order; if none applies `current` is returned unchanged:
//   same strides:
//     1. shapes also identical                      -> no-op
//     2. one differing dim, offset stride-aligned   -> Slice on that axis (+ Reshape if static)
//     3. one differing dim, unaligned offset        -> flatten the tail, Slice, (+ Reshape if static)
//     4. otherwise                                  -> multi-axis unit-step Slice
//   different strides:
//     5. one differing dim, matching suffix strides -> Slice on that axis
//     6. zero offset and same element count         -> Reshape
//     7. view strides are multiples of src strides  -> strided multi-axis Slice
//     8. contiguous suffix                          -> Reshape(prefix, tail) + Slice + Reshape
auto process_single_view =
    [](ov::Output<ov::Node> current, size_t view_offset, const std::vector<size_t> & view_stride,
       const ov::Shape & view_ggml_shape, const ov::PartialShape & view_ov_shape, const std::string & view_name,
       size_t view_src_offset, const std::vector<size_t> & view_src_stride, const ov::Shape & view_src_ggml_shape,
       const ov::PartialShape & view_src_ov_shape, const std::string & view_src_name) -> ov::Output<ov::Node> {
        // Reshape pattern for the full target shape: static OV dims are copied, a dynamic dim
        // becomes -1. If the OV rank is unknown/mismatched or more than one dim is dynamic, the
        // concrete ggml shape is used instead (only one -1 can be inferred by Reshape).
        auto build_reshape_pattern = [](const ov::PartialShape & target_ov_shape,
                                        const ov::Shape & target_ggml_shape) -> std::vector<int64_t> {
            const size_t ndims = target_ggml_shape.size();
            std::vector<int64_t> reshape_pattern(ndims);
            size_t dynamic_dims = 0;
            if (target_ov_shape.rank().is_static() &&
                target_ov_shape.rank().get_length() == static_cast<int64_t>(ndims)) {
                for (size_t i = 0; i < ndims; ++i) {
                    if (target_ov_shape[i].is_static()) {
                        reshape_pattern[i] = target_ov_shape[i].get_length();
                    } else {
                        reshape_pattern[i] = -1;
                        ++dynamic_dims;
                    }
                }
            } else {
                // Force the ggml-shape fallback below.
                dynamic_dims = 2;
            }
            if (dynamic_dims > 1) {
                for (size_t i = 0; i < ndims; ++i) {
                    reshape_pattern[i] = static_cast<int64_t>(target_ggml_shape[i]);
                }
            }
            return reshape_pattern;
        };
        // Same idea as build_reshape_pattern, but only for the first `prefix_dims` dims; the
        // remaining dims are collapsed into one trailing dim of size `tail_dim`.
        auto build_prefix_tail_reshape_pattern = [](const ov::PartialShape & target_ov_shape,
                                                    const ov::Shape & target_ggml_shape, size_t prefix_dims,
                                                    int64_t tail_dim) -> std::vector<int64_t> {
            std::vector<int64_t> reshape_pattern(prefix_dims + 1);
            size_t dynamic_dims = 0;
            if (target_ov_shape.rank().is_static() &&
                target_ov_shape.rank().get_length() == static_cast<int64_t>(target_ggml_shape.size())) {
                for (size_t i = 0; i < prefix_dims; ++i) {
                    if (target_ov_shape[i].is_static()) {
                        reshape_pattern[i] = target_ov_shape[i].get_length();
                    } else {
                        reshape_pattern[i] = -1;
                        ++dynamic_dims;
                    }
                }
            } else {
                dynamic_dims = 2;
            }
            if (dynamic_dims > 1) {
                for (size_t i = 0; i < prefix_dims; ++i) {
                    reshape_pattern[i] = static_cast<int64_t>(target_ggml_shape[i]);
                }
            }
            reshape_pattern[prefix_dims] = tail_dim;
            return reshape_pattern;
        };

        bool same_stride = view_stride.size() == view_src_stride.size();
        if (same_stride) {
            for (size_t i = 0; i < view_stride.size(); ++i) {
                if (view_stride[i] != view_src_stride[i]) {
                    same_stride = false;
                    break;
                }
            }
        }
        bool same_ggml_shape = view_ggml_shape.size() == view_src_ggml_shape.size();
        if (same_ggml_shape) {
            for (size_t i = 0; i < view_ggml_shape.size(); ++i) {
                if (view_ggml_shape[i] != view_src_ggml_shape[i]) {
                    same_ggml_shape = false;
                    break;
                }
            }
        }

        // (1) Identical layout: the view is the source.
        if (same_stride && same_ggml_shape) {
            return current;
        }

        if (same_stride) {
            // A view offset smaller than the source offset is clamped to 0.
            const size_t relative_offset = view_offset >= view_src_offset ? view_offset - view_src_offset : 0;
            const size_t ndims = view_stride.size();
            std::vector<int> diff_dims;
            if (view_ggml_shape.size() == ndims && view_src_ggml_shape.size() == ndims) {
                for (size_t i = 0; i < ndims; ++i) {
                    if (view_ggml_shape[i] != view_src_ggml_shape[i]) {
                        diff_dims.push_back(static_cast<int>(i));
                    }
                }
            }
            if (diff_dims.size() == 1) {
                const int slice_dim = diff_dims[0];
                const int64_t dim_size = static_cast<int64_t>(view_src_ggml_shape[slice_dim]);
                // (2) Offset is a whole number of steps along the differing axis.
                // dim_size > 0 guards the modulo below against a zero-extent source dimension.
                if (view_stride[slice_dim] > 0 && dim_size > 0 && relative_offset % view_stride[slice_dim] == 0) {
                    const int64_t begin_val = static_cast<int64_t>((relative_offset / view_stride[slice_dim]) %
                                                                   static_cast<size_t>(dim_size));
                    const int64_t end_val = begin_val + static_cast<int64_t>(view_ggml_shape[slice_dim]);
                    if (begin_val >= 0 && end_val <= dim_size) {
                        auto sliced = std::make_shared<ov::op::v8::Slice>(
                            current, ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}),
                            ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}),
                            ov::op::v0::Constant::create(ov::element::i64, {1}, {1}),
                            ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim}));
                        if (view_ov_shape.is_static()) {
                            auto reshaped = std::make_shared<ov::op::v1::Reshape>(
                                sliced,
                                ov::op::v0::Constant::create(ov::element::i64, {ndims}, view_ov_shape.to_shape()),
                                false);
                            reshaped->set_friendly_name(view_name);
                            return reshaped;
                        }
                        sliced->set_friendly_name(view_name);
                        return sliced;
                    }
                }
                // (3) Collapse dims [slice_dim, ndims) into one flat tail and slice a contiguous
                // element range out of it.
                int64_t tail_src_elems = 1;
                int64_t tail_dst_elems = 1;
                for (size_t i = static_cast<size_t>(slice_dim); i < ndims; ++i) {
                    tail_src_elems *= static_cast<int64_t>(view_src_ggml_shape[i]);
                    tail_dst_elems *= static_cast<int64_t>(view_ggml_shape[i]);
                }
                const size_t elem_stride = view_stride[ndims - 1];
                int64_t tail_begin = 0;
                // tail_src_elems > 0 guards the modulo against an empty source tail.
                if (elem_stride > 0 && tail_src_elems > 0) {
                    tail_begin =
                        static_cast<int64_t>((relative_offset / elem_stride) % static_cast<size_t>(tail_src_elems));
                }
                const int64_t tail_end = tail_begin + tail_dst_elems;
                if (tail_begin >= 0 && tail_end <= tail_src_elems) {
                    std::vector<int64_t> flat_shape;
                    for (int i = 0; i < slice_dim; ++i) {
                        flat_shape.push_back(static_cast<int64_t>(view_src_ggml_shape[i]));
                    }
                    flat_shape.push_back(tail_src_elems);
                    const size_t flat_ndims = flat_shape.size();
                    auto flat = std::make_shared<ov::op::v1::Reshape>(
                        current, ov::op::v0::Constant::create(ov::element::i64, {flat_ndims}, flat_shape), false);
                    // After flattening, the tail is the last axis, whose index equals slice_dim.
                    auto sliced = std::make_shared<ov::op::v8::Slice>(
                        flat, ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_begin}),
                        ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_end}),
                        ov::op::v0::Constant::create(ov::element::i64, {1}, {1}),
                        ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim}));
                    if (view_ov_shape.is_static()) {
                        auto reshaped = std::make_shared<ov::op::v1::Reshape>(
                            sliced, ov::op::v0::Constant::create(ov::element::i64, {ndims}, view_ov_shape.to_shape()),
                            false);
                        reshaped->set_friendly_name(view_name);
                        return reshaped;
                    }
                    // NOTE(review): with a non-static view_ov_shape the flattened rank is returned as-is.
                    sliced->set_friendly_name(view_name);
                    return sliced;
                }
            }
            // (4) Decompose the offset into one begin index per axis and slice all axes at once.
            std::vector<int64_t> begin(ndims, 0);
            std::vector<int64_t> end(ndims, 0);
            std::vector<int64_t> step(ndims, 1);
            std::vector<int64_t> axes(ndims, 0);
            size_t remaining_offset = relative_offset;
            // Validate the ranks BEFORE indexing the shapes with i < ndims; indexing first would
            // read past the end of a shape that has fewer than ndims entries.
            bool in_bounds = view_src_ggml_shape.size() == ndims && view_ggml_shape.size() == ndims;
            if (in_bounds) {
                for (size_t i = 0; i < ndims; ++i) {
                    axes[i] = static_cast<int64_t>(i);
                    if (view_stride[i] > 0) {
                        begin[i] = static_cast<int64_t>(remaining_offset / view_stride[i]);
                        remaining_offset %= view_stride[i];
                    }
                    end[i] = begin[i] + static_cast<int64_t>(view_ggml_shape[i]);
                }
                for (size_t i = 0; i < ndims; ++i) {
                    if (end[i] > static_cast<int64_t>(view_src_ggml_shape[i])) {
                        in_bounds = false;
                        break;
                    }
                }
            }
            // A non-zero remainder means the offset is not expressible as whole per-axis steps.
            if (in_bounds && remaining_offset == 0) {
                auto sliced = std::make_shared<ov::op::v8::Slice>(
                    current, ov::op::v0::Constant::create(ov::element::i64, {ndims}, begin),
                    ov::op::v0::Constant::create(ov::element::i64, {ndims}, end),
                    ov::op::v0::Constant::create(ov::element::i64, {ndims}, step),
                    ov::op::v0::Constant::create(ov::element::i64, {ndims}, axes));
                sliced->set_friendly_name(view_name);
                return sliced;
            }
        } else {
            bool same_rank = view_stride.size() == view_src_stride.size() &&
                             view_ggml_shape.size() == view_src_ggml_shape.size() &&
                             view_stride.size() == view_ggml_shape.size();
            const size_t relative_offset = view_offset >= view_src_offset ? view_offset - view_src_offset : 0;
            if (same_rank) {
                const size_t ndims = view_ggml_shape.size();
                std::vector<int> diff_dims;
                for (size_t i = 0; i < ndims; ++i) {
                    if (view_ggml_shape[i] != view_src_ggml_shape[i]) {
                        diff_dims.push_back(static_cast<int>(i));
                    }
                }
                // (5) Exactly one axis shrinks and every stride after it is unchanged.
                if (diff_dims.size() == 1) {
                    const size_t slice_dim = static_cast<size_t>(diff_dims[0]);
                    bool suffix_stride_match = true;
                    for (size_t i = slice_dim + 1; i < ndims; ++i) {
                        if (view_stride[i] != view_src_stride[i]) {
                            suffix_stride_match = false;
                            break;
                        }
                    }
                    if (suffix_stride_match && view_src_stride[slice_dim] > 0 &&
                        relative_offset % view_src_stride[slice_dim] == 0) {
                        const int64_t begin_val = static_cast<int64_t>(relative_offset / view_src_stride[slice_dim]);
                        const int64_t end_val = begin_val + static_cast<int64_t>(view_ggml_shape[slice_dim]);
                        const int64_t dim_size = static_cast<int64_t>(view_src_ggml_shape[slice_dim]);
                        if (begin_val >= 0 && end_val <= dim_size) {
                            auto sliced = std::make_shared<ov::op::v8::Slice>(
                                current, ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}),
                                ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}),
                                ov::op::v0::Constant::create(ov::element::i64, {1}, {1}),
                                ov::op::v0::Constant::create(ov::element::i64, {1}, {static_cast<int64_t>(slice_dim)}));
                            sliced->set_friendly_name(view_name);
                            return sliced;
                        }
                    }
                }
            }
            size_t view_elems = 1;
            size_t src_elems = 1;
            if (same_rank) {
                for (size_t i = 0; i < view_ggml_shape.size(); ++i) {
                    view_elems *= view_ggml_shape[i];
                    src_elems *= view_src_ggml_shape[i];
                }
            }
            bool same_num_elements = same_rank && view_elems == src_elems;
            // (6) Same data, different shape: a plain reshape.
            if (same_rank && relative_offset == 0 && same_num_elements) {
                auto reshape_pattern = build_reshape_pattern(view_ov_shape, view_ggml_shape);
                auto reshaped = std::make_shared<ov::op::v1::Reshape>(
                    current, ov::op::v0::Constant::create(ov::element::i64, {reshape_pattern.size()}, reshape_pattern),
                    false);
                reshaped->set_friendly_name(view_name);
                return reshaped;
            }
            if (same_rank) {
                const size_t ndims = view_ggml_shape.size();
                // (7) Match views that can be expressed as a regular strided slice over the
                // already reconstructed source tensor, e.g. offset on one axis plus step > 1
                // on another axis.
                bool is_regular_slice = view_src_ggml_shape.size() == ndims;
                std::vector<int64_t> begin(ndims, 0);
                std::vector<int64_t> end(ndims, 0);
                std::vector<int64_t> step(ndims, 1);
                std::vector<int64_t> axes(ndims, 0);
                size_t remaining_offset = relative_offset;
                if (is_regular_slice) {
                    for (size_t i = 0; i < ndims; ++i) {
                        axes[i] = static_cast<int64_t>(i);
                        // The per-axis step is the ratio of view stride to source stride, so it
                        // must be a positive whole number.
                        if (view_src_stride[i] == 0 || view_stride[i] == 0 ||
                            view_stride[i] % view_src_stride[i] != 0) {
                            is_regular_slice = false;
                            break;
                        }
                        step[i] = static_cast<int64_t>(view_stride[i] / view_src_stride[i]);
                        if (step[i] <= 0) {
                            is_regular_slice = false;
                            break;
                        }
                        begin[i] = static_cast<int64_t>(remaining_offset / view_src_stride[i]);
                        remaining_offset %= view_src_stride[i];
                        if (view_ggml_shape[i] == 0) {
                            // Empty extent: empty range, nothing to bounds-check.
                            end[i] = begin[i];
                            continue;
                        }
                        // Exclusive end that covers exactly view_ggml_shape[i] strided elements.
                        end[i] = begin[i] + step[i] * static_cast<int64_t>(view_ggml_shape[i] - 1) + 1;
                        if (begin[i] < 0 || end[i] > static_cast<int64_t>(view_src_ggml_shape[i])) {
                            is_regular_slice = false;
                            break;
                        }
                    }
                }
                if (is_regular_slice && remaining_offset == 0) {
                    auto sliced = std::make_shared<ov::op::v8::Slice>(
                        current, ov::op::v0::Constant::create(ov::element::i64, {ndims}, begin),
                        ov::op::v0::Constant::create(ov::element::i64, {ndims}, end),
                        ov::op::v0::Constant::create(ov::element::i64, {ndims}, step),
                        ov::op::v0::Constant::create(ov::element::i64, {ndims}, axes));
                    sliced->set_friendly_name(view_name);
                    return sliced;
                }
                // (8) Split the view into a prefix and a suffix whose strides are those of a
                // densely packed block, view the source as [prefix..., tail], cut the suffix's
                // element range out of the tail, then reshape to the view shape.
                const size_t elem_stride = view_src_stride.back();
                const bool aligned_offset = elem_stride > 0 && relative_offset % elem_stride == 0;
                if (aligned_offset) {
                    // Walk from the last axis outwards while the view strides match a packed layout.
                    size_t suffix_start = 0;
                    size_t expected_stride = elem_stride;
                    for (int i = static_cast<int>(ndims) - 1; i >= 0; --i) {
                        if (view_stride[i] != expected_stride) {
                            suffix_start = static_cast<size_t>(i + 1);
                            break;
                        }
                        expected_stride *= view_ggml_shape[i];
                    }
                    size_t prefix_elems = 1;
                    size_t suffix_elems = 1;
                    for (size_t i = 0; i < suffix_start; ++i) {
                        prefix_elems *= view_ggml_shape[i];
                    }
                    for (size_t i = suffix_start; i < ndims; ++i) {
                        suffix_elems *= view_ggml_shape[i];
                    }
                    if (prefix_elems > 0 && src_elems % prefix_elems == 0) {
                        const size_t src_tail_elems = src_elems / prefix_elems;
                        const int64_t tail_begin = static_cast<int64_t>(relative_offset / elem_stride);
                        const int64_t tail_end = tail_begin + static_cast<int64_t>(suffix_elems);
                        if (tail_begin >= 0 && tail_end <= static_cast<int64_t>(src_tail_elems)) {
                            auto prefix_tail_pattern = build_prefix_tail_reshape_pattern(
                                view_ov_shape, view_ggml_shape, suffix_start, static_cast<int64_t>(src_tail_elems));
                            auto prefix_tail = std::make_shared<ov::op::v1::Reshape>(
                                current,
                                ov::op::v0::Constant::create(ov::element::i64, {prefix_tail_pattern.size()},
                                                             prefix_tail_pattern),
                                false);
                            ov::Output<ov::Node> selected = prefix_tail;
                            // Skip the Slice when it would select the entire tail.
                            if (tail_begin != 0 || tail_end != static_cast<int64_t>(src_tail_elems)) {
                                selected = std::make_shared<ov::op::v8::Slice>(
                                    prefix_tail, ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_begin}),
                                    ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_end}),
                                    ov::op::v0::Constant::create(ov::element::i64, {1}, {1}),
                                    ov::op::v0::Constant::create(ov::element::i64, {1},
                                                                 {static_cast<int64_t>(suffix_start)}));
                            }
                            auto reshape_pattern = build_reshape_pattern(view_ov_shape, view_ggml_shape);
                            auto reshaped = std::make_shared<ov::op::v1::Reshape>(
                                selected,
                                ov::op::v0::Constant::create(ov::element::i64, {reshape_pattern.size()},
                                                             reshape_pattern),
                                false);
                            reshaped->set_friendly_name(view_name);
                            return reshaped;
                        }
                    }
                }
            }
            // No strategy matched for differing strides: pass the source through unchanged.
            return current;
        }
        // Same strides but no strategy matched: pass the source through unchanged.
        (void) view_name;
        (void) view_src_ov_shape;
        (void) view_src_name;
        return current;
    };
// Process views from the base tensor (last) to the current view (first)
// Start with the base tensor
ov::Output<ov::Node> current = input;
// Process each view in reverse order (from base to current)
for (int view_idx = view_input_size - 1; view_idx >= 0; view_idx--) {
auto view_offset = context.get_view_input_offset(input_index, view_idx);
auto view_stride = context.get_view_input_stride(input_index, view_idx);
auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, view_idx);
auto view_ov_shape = context.get_view_input_ov_shape(input_index, view_idx);
auto view_name = context.get_view_input_name(input_index, view_idx);
// print view info
// std::cout << "View " << view_idx << ": name = " << view_name << ", offset = " << view_offset << ", stride = ["
// << view_stride[0] << "," << view_stride[1] << "," << view_stride[2] << "," << view_stride[3]
// << "], ggml shape = [" << view_ggml_shape[0] << "," << view_ggml_shape[1] << ","
// << view_ggml_shape[2] << "," << view_ggml_shape[3] << "], ov shape = " << view_ov_shape << std::endl;
auto view_src_offset = context.get_view_input_src_offset(input_index, view_idx);
auto view_src_stride = context.get_view_input_src_stride(input_index, view_idx);
auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, view_idx);
auto view_src_ov_shape = context.get_view_input_src_ov_shape(input_index, view_idx);
auto view_src_name = context.get_view_input_src_name(input_index, view_idx);
// print source view info
// std::cout << "View " << view_idx << ": source name = " << view_src_name
// << ", source offset = " << view_src_offset << ", source stride = [" << view_src_stride[0] << ","
// << view_src_stride[1] << "," << view_src_stride[2] << "," << view_src_stride[3]
// << "], source ggml shape = [" << view_src_ggml_shape[0] << "," << view_src_ggml_shape[1] << ","
// << view_src_ggml_shape[2] << "," << view_src_ggml_shape[3]
// << "], source ov shape = " << view_src_ov_shape << std::endl;
current = process_single_view(current, view_offset, view_stride, view_ggml_shape, view_ov_shape, view_name,
view_src_offset, view_src_stride, view_src_ggml_shape, view_src_ov_shape,
view_src_name);
}
return current;
}
} // namespace ggml
} // namespace frontend
} // namespace ov
+28 -26
View File
@@ -1,13 +1,13 @@
#pragma once
#include "node_context.h"
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/op/shape_of.hpp>
#include <openvino/op/slice.hpp>
#include <utility>
#include "node_context.h"
namespace ov {
namespace frontend {
namespace ggml {
@@ -16,30 +16,23 @@ std::string getCurrentTime();
void dump_ov_model(std::shared_ptr<ov::Model> model);
void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs);
void num_inputs_check(const NodeContext & context, size_t min_inputs, size_t max_inputs);
int non_cont_dim(std::vector<size_t> ne, std::vector<size_t> nb);
template <typename T>
std::vector<int> argsort_descend(const std::vector<T>& v) {
template <typename T> std::vector<int> argsort_descend(const std::vector<T> & v) {
std::vector<int> idx(v.size());
std::iota(idx.begin(), idx.end(), 0);
std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) {
return v[i1] > v[i2];
});
std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) { return v[i1] > v[i2]; });
return idx;
}
template <typename T>
std::vector<T> sorted_descend(std::vector<T> v) {
std::sort(v.begin(), v.end(), [](T a, T b) {
return a > b;
});
template <typename T> std::vector<T> sorted_descend(std::vector<T> v) {
std::sort(v.begin(), v.end(), [](T a, T b) { return a > b; });
return v;
}
template <typename T>
bool is_permuted(const std::vector<T>& strides) {
template <typename T> bool is_permuted(const std::vector<T> & strides) {
for (size_t i = 0; i < strides.size() - 1; ++i) {
if (strides[i] < strides[i + 1]) {
return true;
@@ -48,8 +41,7 @@ bool is_permuted(const std::vector<T>& strides) {
return false;
}
template <typename T>
std::vector<T> permute(const std::vector<T>& x, const std::vector<int>& perm) {
template <typename T> std::vector<T> permute(const std::vector<T> & x, const std::vector<int> & perm) {
std::vector<T> result;
result.reserve(perm.size());
for (int i : perm) {
@@ -58,25 +50,35 @@ std::vector<T> permute(const std::vector<T>& x, const std::vector<int>& perm) {
return result;
}
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf>& shape,
const std::vector<int>& dims);
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node>& node, const std::vector<int>& dims);
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::op::v3::ShapeOf> & shape,
const std::vector<int> & dims);
std::shared_ptr<ov::Node> get_dimensions(const std::shared_ptr<ov::Node> & node, const std::vector<int> & dims);
OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix);
OutputVector rename_outputs_with_suffix(const OutputVector & outputs, const std::string & suffix);
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t* rope_params,
std::pair<ov::Output<Node>, ov::Output<Node>> make_sin_cos(int32_t * rope_params,
std::shared_ptr<ov::Node> inp_pos,
std::shared_ptr<ov::Node> rope_freqs_weight = nullptr,
bool imrope = false,
bool stateful = false);
ov::Output<ov::Node> process_view_input(const NodeContext& context, int input_index, int slice_len = 0);
ov::Output<ov::Node> process_view_input(const NodeContext & context, int input_index, int slice_len = 0);
ov::Output<ov::Node> process_view_input_new(const NodeContext & context, int input_index);
namespace op {
template <typename T>
OutputVector translate_1to1_match_2_inputs(const NodeContext& context) {
template <typename T> OutputVector translate_1to1_match_2_inputs(const NodeContext & context) {
num_inputs_check(context, 2, 2);
auto res = std::make_shared<T>(context.get_input(0), context.get_input(1));
auto input_0 = process_view_input_new(context, 0);
auto input_1 = process_view_input_new(context, 1);
auto res = std::make_shared<T>(input_0, input_1);
return rename_outputs_with_suffix({res}, context.get_name());
}
// Generic translator for single-input ops: checks that the node has exactly one input, resolves
// that input through process_view_input_new(), wraps it in an OpenVINO op of type T and returns
// the result renamed with the node's name as suffix.
template <typename T> OutputVector translate_1to1_match_1_input(const NodeContext & context) {
    num_inputs_check(context, 1, 1);
    auto translated = std::make_shared<T>(process_view_input_new(context, 0));
    return rename_outputs_with_suffix({translated}, context.get_name());
}
} // namespace op
+385 -96
View File
@@ -14,6 +14,7 @@
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <memory>
@@ -25,9 +26,11 @@
#include <openvino/openvino.hpp>
#include <openvino/runtime/compiled_model.hpp>
#include <openvino/runtime/infer_request.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/properties.hpp>
#include <openvino/runtime/properties.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>
@@ -39,7 +42,7 @@
enum ggml_status ov_graph_compute(ggml_cgraph * cgraph, ggml_backend_t backend) {
ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context;
try {
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_CGRAPH")) {
std::string filename = "cgraph_ov.txt";
GgmlOvDecoder::dump_cgraph(cgraph, filename);
}
@@ -62,10 +65,92 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph, ggml_backend_t backend)
}
}
// For a KV cache input, return an ov::Tensor sized to n_kv (== attention_size
// for that layer) instead of the fully-allocated ctx_per_seq. Pre-conditions:
// * non-static (CPU/GPU) backend, single sequence, seq_active_start == 0
// * ggml KV layout is a contiguous [1, 1, ctx_per_seq, n_heads_kv*head_size]
// so the first n_kv rows are the live prefix and shrinking the ctx axis
// gives a valid tensor over the same host storage
// * not an SWA layer (ring cache): once the window has wrapped the first
// n_kv rows no longer contain the live prefix
// On any unmet pre-condition returns std::nullopt; the caller falls back to
// the full-size tensor.
static std::optional<ov::Tensor> try_make_kv_sliced_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
const std::string & name,
const ggml_tensor * ggml_tensor) {
static const bool kv_slice_disabled = ggml_openvino_getenv_int("GGML_OPENVINO_DISABLE_KV_SLICE");
if (kv_slice_disabled) {
return std::nullopt;
}
if (ggml_decoder->is_static() || ggml_decoder->is_stateful()) {
return std::nullopt;
}
if (ggml_tensor->op != GGML_OP_NONE || ggml_tensor->view_src != nullptr) {
return std::nullopt;
}
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
if (!GgmlOvDecoder::is_kvcache(ggml_tensor, op)) {
return std::nullopt;
}
const auto & compute_params = ggml_decoder->get_compute_params();
if (compute_params.n_seq_active != 1 || compute_params.seq_active_start != 0) {
return std::nullopt;
}
int layer;
if (auto layer_opt = extract_layer_from_name(name); layer_opt.has_value()) {
layer = layer_opt.value();
} else {
return std::nullopt;
}
const bool is_swa = ggml_decoder->is_swa_layer(layer);
if (is_swa) {
return std::nullopt;
}
const int ctx_per_seq = ggml_decoder->get_ctx_per_seq();
const int n_kv = compute_params.attention_size;
if (ctx_per_seq <= 0 || n_kv <= 0 || n_kv >= ctx_per_seq) {
return std::nullopt;
}
ov::Shape full_shape = ggml_decoder->get_shape(ggml_tensor);
if (full_shape.size() != 4 || full_shape[0] != 1 || full_shape[1] != 1 ||
static_cast<int>(full_shape[2]) != ctx_per_seq) {
return std::nullopt;
}
ov::Shape sliced_shape = full_shape;
sliced_shape[2] = static_cast<size_t>(n_kv);
// Disabling for now as gpu has bug with in-place ScatterUpdate with remote tensors, can re-enable once CVS-186519 is fixed
// if (ggml_openvino_buffer_is_remote(ggml_tensor)) {
// auto remote_context = ggml_openvino_get_remote_context();
// auto gpu_context = remote_context->as<ov::intel_gpu::ocl::ClContext>();
// return gpu_context.create_tensor(ggml_decoder->get_ov_type(ggml_tensor), sliced_shape, ggml_tensor->data);
// }
return ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), sliced_shape, ggml_tensor->data);
}
ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
std::shared_ptr<ov::InferRequest> infer_request,
int output_index,
const ggml_tensor * ggml_tensor) {
if (auto sliced = try_make_kv_sliced_tensor(ggml_decoder, std::string(ggml_tensor->name), ggml_tensor)) {
return *sliced;
}
// Disabling for now as gpu has bug with in-place ScatterUpdate with remote tensors, can re-enable once CVS-186519 is fixed
// if (ggml_tensor->extra != nullptr && !ggml_decoder->is_splited_model()) {
// auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
// if (extra_base->type == ggml_openvino_extra_base::Type::TENSOR) {
// auto * tensor_extra = static_cast<ggml_openvino_tensor_extra *>(extra_base);
// return *tensor_extra->tensor;
// }
// }
auto output_type = ggml_decoder->get_ov_type(ggml_tensor);
ov::Shape output_shape;
if (ggml_decoder->is_static()) {
@@ -86,7 +171,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
static auto is_static = false;
if (is_naive(cgraph)) {
return naive_compute(cgraph, core, device, config);
if (!is_model_splitted(cgraph)) {
return naive_compute(cgraph, core, device, config);
}
}
auto start_time = ggml_time_us();
@@ -98,18 +185,20 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
graph_key key(cgraph);
bool cache_hit;
static const bool cache_enabled = !ggml_openvino_getenv_int("GGML_OPENVINO_DISABLE_CACHE");
bool cache_hit = false;
int64_t decoder_end_time;
int64_t conversion_end_time;
int64_t compile_end_time;
int64_t infer_end_time;
int64_t ov_raw_infer_start;
{
std::shared_ptr<decoder_runtime_ctx> entry;
ModelParams old_m_params;
{
if (cache_enabled) {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
@@ -120,6 +209,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
cache_hit = false;
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
@@ -127,9 +220,14 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
if (cache_hit) {
ggml_decoder = entry->ptr;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
if (!ggml_decoder->is_splited_model()) {
cache_hit = old_m_params.can_reuse_dynamically(m_params);
}
}
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
if (cache_hit) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder->set_compute_params(c_params);
@@ -141,6 +239,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request = r_ctx->infer_request_cache.at(key);
ov_input_names = r_ctx->ov_input_names_cache.at(key);
ov_output_names = r_ctx->ov_output_names_cache.at(key);
}
if (stateful) {
@@ -162,14 +262,15 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
try {
state_name = r_ctx->kv_state_input_name_map.at(state.get_name());
} catch (...) {
GGML_LOG_ERROR("GGML OpenVINO backend stateful inference failed: no input found for the state\n");
GGML_LOG_ERROR(
"GGML OpenVINO backend stateful inference failed: no input found for the state\n");
return GGML_STATUS_FAILED;
}
auto kv_tensor = get_ov_input_tensor(ggml_decoder, state_name);
kv_tensor.set_shape({state_tensor_shape[0], kv_tensor.get_shape()[2],
state_tensor_shape[2], state_tensor_shape[3]});
state_tensor = kv_tensor;
state_tensor_shape = state_tensor.get_shape();
kv_tensor.set_shape({state_tensor_shape[0], kv_tensor.get_shape()[2], state_tensor_shape[2],
state_tensor_shape[3]});
state_tensor = kv_tensor;
state_tensor_shape = state_tensor.get_shape();
}
ov::Coordinate begin = {0, 0, 0, 0};
ov::Coordinate end = {state_tensor_shape[0], static_cast<uint32_t>(pos_data[0]),
@@ -177,7 +278,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
ov::Tensor new_state_tensor(state_tensor, begin, end);
state.set_state(new_state_tensor);
}
r_ctx->stateful_kv_size = pos_data[0] + 1;
r_ctx->stateful_kv_size = pos_data[0] + pos_shape[3];
}
}
@@ -185,15 +286,17 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
if (cache_enabled) {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
}
bool model_is_splitted = is_model_splitted(cgraph);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
stateful, model_is_splitted);
decoder_end_time = ggml_time_us();
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
@@ -201,7 +304,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
ggml_decoder->clear_model_weights();
conversion_end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_DUMP_IR")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_IR")) {
char timestamped_filename[64];
auto timestamp = (long long) ggml_time_us();
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
@@ -219,8 +322,6 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
entry->ptr = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
for (const auto & ov_param : model->get_parameters()) {
ov_input_names.push_back(ov_param->get_friendly_name());
}
@@ -228,66 +329,64 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
ov_output_names.push_back(ov_output->get_friendly_name());
}
{
if (cache_enabled) {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache[key] = infer_request;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
r_ctx->ov_input_names_cache[key] = ov_input_names;
r_ctx->ov_output_names_cache[key] = ov_output_names;
}
if (stateful) {
if (stateful && cache_enabled) {
const auto * inp_pos = get_inp_pos_tensor(cgraph);
auto pos_shape = ggml_decoder->get_shape(inp_pos);
r_ctx->stateful_kv_size = pos_shape[3];
const auto kv_param_res_names = ggml_decoder->get_kv_param_res_names();
for (const auto& pair : kv_param_res_names) {
r_ctx->kv_state_input_name_map[pair.first+pair.second] = pair.first;
for (const auto & pair : kv_param_res_names) {
r_ctx->kv_state_input_name_map[pair.first + pair.second] = pair.first;
}
}
}
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names = r_ctx->ov_input_names_cache[key];
ov_output_names = r_ctx->ov_output_names_cache[key];
}
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
infer_request->set_input_tensor(i, input_tensor);
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_INPUT")) {
print_input_tensor_info(param_name, input_tensor);
}
}
for (size_t i = 0; i < ov_output_names.size(); i++) {
auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]);
if (ggml_nbytes(ggml_tensor) == 0) {
continue;
}
auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
ov_raw_infer_start = ggml_time_us();
infer_request->infer();
infer_end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_OUTPUT")) {
for (size_t i = 0; i < ov_output_names.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data());
}
}
if (getenv("GGML_OPENVINO_PROFILING")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_PROFILING")) {
GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
GGML_LOG_INFO(" - Graph decoder time: %.3f ms \n", (decoder_end_time - start_time) / 1000.0);
if (!cache_hit) {
GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
GGML_LOG_INFO(" - Graph conversion time: %.3f ms \n",
(conversion_end_time - decoder_end_time) / 1000.0);
GGML_LOG_INFO(" - Graph compile time: %.3f ms \n", (compile_end_time - conversion_end_time) / 1000.0);
}
GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
GGML_LOG_INFO(" - Graph inference time: %.3f ms \n", (infer_end_time - compile_end_time) / 1000.0);
GGML_LOG_INFO(" - OV raw infer time: %.3f ms \n", (infer_end_time - ov_raw_infer_start) / 1000.0);
}
}
@@ -298,17 +397,18 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
auto & core = ov_singleton_core();
auto get_prefill_chunk_size = [] {
const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
if (chunk_size_str && atoi(chunk_size_str) > 0) {
return atoi(chunk_size_str);
}
return 256;
static const int chunk_size = []() {
int env_prefill_chunk_size = ggml_openvino_getenv_int("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
return env_prefill_chunk_size > 0 ? env_prefill_chunk_size : 256;
}();
return chunk_size;
};
static std::string device = "NPU";
static auto is_static = true;
static auto stateful = false;
static auto prefill_chunk_size = get_prefill_chunk_size();
auto prefill_chunk_size = get_prefill_chunk_size();
const auto & config = ggml_openvino_get_compile_config();
if (is_naive(cgraph)) {
@@ -326,17 +426,20 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
const auto * inp_pos = get_inp_pos_tensor(cgraph);
const auto is_prefill = get_is_prefill(inp_pos);
graph_key key(cgraph);
bool cache_hit;
static const bool cache_enabled = !ggml_openvino_getenv_int("GGML_OPENVINO_DISABLE_CACHE");
bool cache_hit = false;
int64_t decoder_end_time;
int64_t conversion_end_time;
int64_t compile_end_time;
int64_t infer_end_time;
int64_t ov_raw_infer_start;
int64_t ov_raw_infer_total = 0;
std::shared_ptr<decoder_runtime_ctx> entry;
ModelParams old_m_params;
{
if (cache_enabled) {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
auto it = r_ctx->decoder_cache.find(key);
cache_hit = it != r_ctx->decoder_cache.end();
@@ -347,6 +450,10 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
entry = std::make_shared<decoder_runtime_ctx>(mutex);
r_ctx->decoder_cache[key] = entry;
}
} else {
auto mutex = std::make_shared<std::mutex>();
entry = std::make_shared<decoder_runtime_ctx>(mutex);
cache_hit = false;
}
std::lock_guard<std::mutex> lock(*(entry->mutex));
@@ -357,6 +464,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
cache_hit = old_m_params.can_reuse_statically(m_params);
}
std::vector<std::string> ov_input_names_local;
std::vector<std::string> ov_output_names_local;
if (cache_hit) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
ggml_decoder->m_is_prefill = is_prefill;
@@ -370,13 +480,15 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
infer_request =
is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key);
ov_input_names_local = r_ctx->ov_input_names_cache.at(key);
ov_output_names_local = r_ctx->ov_output_names_cache.at(key);
}
decoder_end_time = ggml_time_us();
conversion_end_time = decoder_end_time;
compile_end_time = decoder_end_time;
} else {
{
if (cache_enabled) {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache.erase(key);
r_ctx->infer_request_cache_prefill.erase(key);
@@ -385,10 +497,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
is_static, stateful, true, prefill_chunk_size);
if (m_params.n_heads_kv == -1) {
// graph is not a LLM, e.g. context-shift graph
prefill_chunk_size = inp_pos->ne[0];
}
auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(
cgraph, m_params, c_params, model_weights, is_static, stateful, false, true, prefill_chunk_size);
auto ggml_decoder_decode = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
stateful, false, prefill_chunk_size);
stateful, false, false, prefill_chunk_size);
decoder_end_time = ggml_time_us();
auto input_model_prefill = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_prefill);
@@ -400,7 +516,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
ggml_decoder_decode->clear_model_weights();
conversion_end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_DUMP_IR")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_IR")) {
char timestamped_filename[64];
auto timestamp = (long long) ggml_time_us();
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
@@ -429,32 +545,22 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_request = is_prefill ? infer_request_prefill : infer_request_decode;
entry->ptr = ggml_decoder;
std::vector<std::string> ov_input_names;
std::vector<std::string> ov_output_names;
for (const auto & ov_param : model->get_parameters()) {
ov_input_names.push_back(ov_param->get_friendly_name());
ov_input_names_local.push_back(ov_param->get_friendly_name());
}
for (const auto & ov_output : model->get_results()) {
ov_output_names.push_back(ov_output->get_friendly_name());
ov_output_names_local.push_back(ov_output->get_friendly_name());
}
{
if (cache_enabled) {
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
r_ctx->infer_request_cache_prefill[key] = infer_request_prefill;
r_ctx->infer_request_cache[key] = infer_request_decode;
r_ctx->ov_input_names_cache[key] = std::move(ov_input_names);
r_ctx->ov_output_names_cache[key] = std::move(ov_output_names);
r_ctx->ov_input_names_cache[key] = ov_input_names_local;
r_ctx->ov_output_names_cache[key] = ov_output_names_local;
}
}
std::vector<std::string> ov_input_names_local;
std::vector<std::string> ov_output_names_local;
{
std::lock_guard<std::mutex> map_lock(r_ctx->ctx_mutex);
ov_input_names_local = r_ctx->ov_input_names_cache[key];
ov_output_names_local = r_ctx->ov_output_names_cache[key];
}
if (is_prefill) {
auto inp_len = inp_pos->ne[0];
for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) {
@@ -463,7 +569,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index);
infer_request->set_input_tensor(i, input_tensor);
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_INPUT")) {
const auto input_tensor = infer_request->get_input_tensor(i);
print_input_tensor_info(param_name, input_tensor);
}
@@ -475,9 +581,11 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_request->set_output_tensor(i, output_tensor);
}
ov_raw_infer_start = ggml_time_us();
infer_request->infer();
ov_raw_infer_total += ggml_time_us() - ov_raw_infer_start;
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_OUTPUT")) {
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
@@ -491,7 +599,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name);
infer_request->set_input_tensor(i, input_tensor);
if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_INPUT")) {
const auto input_tensor = infer_request->get_input_tensor(i);
print_input_tensor_info(param_name, input_tensor);
}
@@ -503,10 +611,12 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
infer_request->set_output_tensor(i, output_tensor);
}
ov_raw_infer_start = ggml_time_us();
infer_request->infer();
infer_end_time = ggml_time_us();
ov_raw_infer_total = infer_end_time - ov_raw_infer_start;
if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_OUTPUT")) {
for (size_t i = 0; i < ov_output_names_local.size(); i++) {
const auto output_tensor = infer_request->get_output_tensor(i);
print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data());
@@ -514,19 +624,75 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
}
}
if (getenv("GGML_OPENVINO_PROFILING")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_PROFILING")) {
GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
GGML_LOG_INFO(" - Graph decoder time: %.3f ms \n", (decoder_end_time - start_time) / 1000.0);
if (!cache_hit) {
GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
GGML_LOG_INFO(" - Graph conversion time: %.3f ms \n", (conversion_end_time - decoder_end_time) / 1000.0);
GGML_LOG_INFO(" - Graph compile time: %.3f ms \n", (compile_end_time - conversion_end_time) / 1000.0);
}
GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
GGML_LOG_INFO(" - Graph inference time: %.3f ms \n", (infer_end_time - compile_end_time) / 1000.0);
GGML_LOG_INFO(" - OV raw infer time: %.3f ms \n", ov_raw_infer_total / 1000.0);
}
return GGML_STATUS_SUCCESS;
}
// Detect whether a cgraph is a split subgraph or not.
// Step 1 compares each node's recorded use_count with the number of src slots in this cgraph that
// reference the node; a mismatch is treated as a split.
// Step 2 verifies that node inputs come from model nodes/weights/leafs; external sources imply split.
// Returns true when the graph is judged to be split, false otherwise.
bool is_model_splitted(ggml_cgraph * cgraph) {
// Step 1: for every node, compare its recorded use count with the number of times it is used as an
// input by the nodes of this cgraph. If the two differ, return true.
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
int use_count = cgraph->use_counts[ggml_hash_find(&cgraph->visited_hash_set, node)];
// TODO: this is a workaround for the test cases from llama.cpp; the root cause should be fixed in the future.
// A graph with at most one node is reported as not split when that node is unused, or when it is a
// VIEW used exactly once whose source has op GGML_OP_NONE.
if ((cgraph->n_nodes <= 1 && use_count == 0) ||
(cgraph->n_nodes <= 1 && node->op == GGML_OP_VIEW && use_count == 1 && node->src[0] != nullptr &&
node->src[0]->op == GGML_OP_NONE)) {
return false;
}
// A single-node TRANSPOSE or PERMUTE graph is likewise reported as not split.
if (cgraph->n_nodes == 1 &&
(cgraph->nodes[0]->op == GGML_OP_TRANSPOSE || cgraph->nodes[0]->op == GGML_OP_PERMUTE)) {
return false;
}
// Count how many src slots across all nodes of this cgraph point at `node`.
int input_use_count = 0;
for (int j = 0; j < cgraph->n_nodes; j++) {
ggml_tensor * other_node = cgraph->nodes[j];
for (int k = 0; k < GGML_MAX_SRC; k++) {
if (other_node->src[k] == node) {
input_use_count++;
}
}
}
// Nodes whose op is GGML_OP_NONE are exempt from the comparison.
if (use_count != input_use_count && node->op != GGML_OP_NONE) {
return true;
}
}
// Step 2: complementary check. Step 1 cannot decide cases where both counts agree, e.g. an output
// node that no other node consumes has 0 for both. So also look for a node source that does not come
// from this graph: if such a source exists, the model is considered split.
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, true);
std::set<ggml_tensor *> model_nodes(cgraph->nodes, cgraph->nodes + cgraph->n_nodes);
// leaf nodes
std::set<ggml_tensor *> model_leafs(cgraph->leafs, cgraph->leafs + cgraph->n_leafs);
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
for (int j = 0; j < GGML_MAX_SRC; j++) {
ggml_tensor * src = node->src[j];
// A source is external when it is not a node of this cgraph, not a model weight (looked up by
// name) and not a leaf of this cgraph.
// NOTE(review): `!model_leafs.empty() == false` parses as `(!model_leafs.empty()) == false`, i.e.
// it is true only when model_leafs is empty, so this branch can never fire for a graph that has
// leafs and the following leaf lookup is then always a miss. Confirm that this is intended.
if (src != nullptr && model_nodes.find(src) == model_nodes.end() &&
model_weights.find(std::string(src->name)) == model_weights.end() && !model_leafs.empty() == false &&
model_leafs.find(src) == model_leafs.end()) {
// An external source accepted by GgmlOvDecoder::is_inp_tok ends the scan with "not split".
if (GgmlOvDecoder::is_inp_tok(src, node)) {
return false;
}
return true;
}
}
}
return false;
}
bool is_naive(ggml_cgraph * cgraph) {
constexpr int naive_graph_size_threshold = 20;
int count = 0;
@@ -551,7 +717,7 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
if (getenv("GGML_OPENVINO_DUMP_IR")) {
if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_IR")) {
ov::serialize(model, "IR_naive.xml");
}
@@ -578,40 +744,92 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
infer_request->set_input_tensor(i, input_tensor);
}
auto ov_results = model->get_results();
for (size_t i = 0; i < ov_results.size(); i++) {
auto * ggml_tensor = decoder->get_model_outputs().at(ov_results[i]->get_friendly_name());
auto output_tensor = create_ov_output_tensor(decoder, infer_request, i, ggml_tensor);
infer_request->set_output_tensor(i, output_tensor);
}
// Use get_output_tensor + memcpy instead of set_output_tensor to avoid memory overwritten
// when i/o buffer overlaps, e.g. the cgraph is a single PERMUTE
infer_request->infer();
auto ov_results = model->get_results();
for (size_t i = 0; i < ov_results.size(); i++) {
auto output_tensor = infer_request->get_output_tensor(i);
auto * ggml_tensor = decoder->get_model_outputs().at(ov_results[i]->get_friendly_name());
std::memcpy(ggml_tensor->data, output_tensor.data(), output_tensor.get_byte_size());
}
return GGML_STATUS_SUCCESS;
}
namespace {
// Writes `zero_value` to one entry per row of a row-major `rows` x `cols` matrix stored in `matrix`:
// row i gets column i, and rows past the last column are clamped to the last column.
// Does nothing when `cols` is 0 (previously `cols - 1` wrapped around and indexed out of bounds).
// The caller must provide at least rows * cols elements.
template <typename T> void set_zero_diagonal(std::vector<T> & matrix, size_t rows, size_t cols, T zero_value = T{}) {
    if (cols == 0) {
        return;
    }
    const size_t last_col = cols - 1;
    for (size_t i = 0; i < rows; ++i) {
        const size_t diag_col = std::min(i, last_col);
        matrix[i * cols + diag_col] = zero_value;
    }
}
// Builds a densely packed ov::Tensor of shape `input_shape` from a ggml tensor whose strides are not
// contiguous. The backing tensor (the view source if there is one, otherwise the tensor itself) is
// fetched in full through ggml_backend_tensor_get, then elements are gathered one by one using the
// tensor's nb[] byte strides, with dimension 0 varying fastest.
// Only types with a block size of 1 are accepted.
ov::Tensor make_contiguous_split_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                              const struct ggml_tensor * ggml_tensor,
                                              const ov::Shape & input_shape) {
    const size_t element_size = ggml_type_size(ggml_tensor->type);
    const size_t block_size = ggml_blck_size(ggml_tensor->type);
    GGML_ASSERT(block_size == 1 && "non-contiguous split inputs must be plain element types");

    // For a view, read from the view source and start at the view's byte offset.
    const bool is_view = ggml_tensor->view_src != nullptr;
    const struct ggml_tensor * backing = is_view ? ggml_tensor->view_src : ggml_tensor;
    const size_t base_offset = is_view ? ggml_tensor->view_offs : 0;

    // Stage the whole backing tensor in a local byte buffer.
    std::vector<uint8_t> staging(ggml_nbytes(backing));
    ggml_backend_tensor_get(backing, staging.data(), 0, staging.size());

    ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape);
    auto * write_ptr = static_cast<uint8_t *>(input_tensor.data());

    const size_t n0 = static_cast<size_t>(ggml_tensor->ne[0]);
    const size_t n1 = static_cast<size_t>(ggml_tensor->ne[1]);
    const size_t n2 = static_cast<size_t>(ggml_tensor->ne[2]);
    const size_t n3 = static_cast<size_t>(ggml_tensor->ne[3]);

    for (size_t i3 = 0; i3 < n3; ++i3) {
        const size_t off3 = base_offset + i3 * ggml_tensor->nb[3];
        for (size_t i2 = 0; i2 < n2; ++i2) {
            const size_t off2 = off3 + i2 * ggml_tensor->nb[2];
            for (size_t i1 = 0; i1 < n1; ++i1) {
                const size_t off1 = off2 + i1 * ggml_tensor->nb[1];
                for (size_t i0 = 0; i0 < n0; ++i0) {
                    const uint8_t * read_ptr = staging.data() + off1 + i0 * ggml_tensor->nb[0];
                    std::memcpy(write_ptr, read_ptr, element_size);
                    write_ptr += element_size;
                }
            }
        }
    }
    return input_tensor;
}
ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
if (ggml_tensor->extra != nullptr) {
// GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
if (auto sliced = try_make_kv_sliced_tensor(ggml_decoder, name, ggml_tensor)) {
return *sliced;
}
if (ggml_tensor->extra != nullptr && !ggml_decoder->is_splited_model()) {
auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name);
if (extra_base->type == ggml_openvino_extra_base::Type::TENSOR) {
// GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
auto * tensor_extra = static_cast<ggml_openvino_tensor_extra *>(extra_base);
return *tensor_extra->tensor;
}
auto * tensor_extra = static_cast<ggml_openvino_tensor_extra *>(extra_base);
return *tensor_extra->tensor;
}
// GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
auto * input_data = ggml_tensor->data;
ov::Shape input_shape;
if (ggml_tensor->op == GGML_OP_VIEW) {
if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_splited_model()) {
// This case is added to make test-backend-ops work
input_shape = ggml_decoder->get_shape(ggml_tensor->view_src);
} else {
input_shape = ggml_decoder->get_shape(ggml_tensor);
}
if (ggml_decoder->is_splited_model() && !ggml_is_contiguous(ggml_tensor)) {
return make_contiguous_split_input_tensor(ggml_decoder, ggml_tensor, input_shape);
}
auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data);
return input_tensor;
}
@@ -660,6 +878,14 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
size_t context_size = ggml_decoder->get_ctx_size();
if (ggml_tensor->type == GGML_TYPE_F16) {
std::vector<ggml_fp16_t> padded_data =
pad_input<ggml_fp16_t>(ggml_tensor, 1, context_size, GGML_FP32_TO_FP16(-INFINITY));
ov::Tensor input_tensor(ov::element::f16, ov::Shape{1, 1, 1, context_size});
std::memcpy(input_tensor.data(), padded_data.data(), padded_data.size() * sizeof(ggml_fp16_t));
return input_tensor;
}
std::vector<float> padded_data = pad_input<float>(ggml_tensor, 1, context_size, -INFINITY);
ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size});
auto * data_ptr = input_tensor.data<float>();
@@ -728,9 +954,20 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) {
size_t cols = ggml_tensor->ne[0];
size_t rows = ggml_tensor->ne[1];
float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols;
size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size);
size_t context_size = ggml_decoder->get_ctx_size();
if (ggml_tensor->type == GGML_TYPE_F16) {
const auto * ggml_data =
static_cast<const ggml_fp16_t *>(ggml_tensor->data) + chunk_index * chunk_size * cols;
std::vector<ggml_fp16_t> padded_data = pad_input<ggml_fp16_t>(ggml_data, chunk_valid_rows, cols, chunk_size,
context_size, GGML_FP32_TO_FP16(-INFINITY));
set_zero_diagonal(padded_data, chunk_size, context_size, GGML_FP32_TO_FP16(0.0f));
ov::Tensor input_tensor(ov::element::f16, ov::Shape{1, 1, chunk_size, context_size});
std::memcpy(input_tensor.data(), padded_data.data(), padded_data.size() * sizeof(ggml_fp16_t));
return input_tensor;
}
const auto * ggml_data = static_cast<const float *>(ggml_tensor->data) + chunk_index * chunk_size * cols;
std::vector<float> padded_data =
pad_input<float>(ggml_data, chunk_valid_rows, cols, chunk_size, context_size, -INFINITY);
set_zero_diagonal(padded_data, chunk_size, context_size);
@@ -753,6 +990,65 @@ size_t checksum(const void * data, size_t size) {
return sum;
}
bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path) {
if (tensor == nullptr || tensor->data == nullptr) {
return false;
}
std::ofstream out(file_path);
if (!out.is_open()) {
return false;
}
const size_t n = ggml_nelements(tensor);
out << "name: " << tensor->name << ", type: " << ggml_type_name(tensor->type) << ", shape: [" << tensor->ne[0]
<< ", " << tensor->ne[1] << ", " << tensor->ne[2] << ", " << tensor->ne[3] << "]" << ", elements: " << n
<< ", data:" << '\n';
switch (tensor->type) {
case GGML_TYPE_F32: {
const auto * data = static_cast<const float *>(tensor->data);
for (size_t i = 0; i < n; ++i) {
out << data[i] << '\n';
}
break;
}
case GGML_TYPE_F16: {
const auto * data = static_cast<const ggml_fp16_t *>(tensor->data);
for (size_t i = 0; i < n; ++i) {
out << ggml_fp16_to_fp32(data[i]) << '\n';
}
break;
}
case GGML_TYPE_BF16: {
const auto * data = static_cast<const ggml_bf16_t *>(tensor->data);
for (size_t i = 0; i < n; ++i) {
out << ggml_bf16_to_fp32(data[i]) << '\n';
}
break;
}
case GGML_TYPE_I32: {
const auto * data = static_cast<const int32_t *>(tensor->data);
for (size_t i = 0; i < n; ++i) {
out << data[i] << '\n';
}
break;
}
case GGML_TYPE_I64: {
const auto * data = static_cast<const int64_t *>(tensor->data);
for (size_t i = 0; i < n; ++i) {
out << data[i] << '\n';
}
break;
}
default:
out << "unsupported tensor type for text dump" << '\n';
return false;
}
return true;
}
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) {
std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
<< std::endl;
@@ -849,13 +1145,6 @@ void print_output_tensor_info(const std::string & name, const ov::Tensor & tenso
}
}
// Sets one entry per row of a row-major `rows` x `cols` float matrix to 0: row i gets column i, and
// rows past the last column are clamped to the last column.
// Does nothing when `cols` is 0 (previously `cols - 1` wrapped around and indexed out of bounds).
// The caller must provide at least rows * cols elements.
void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols) {
    if (cols == 0) {
        return;
    }
    const size_t last_col = cols - 1;
    for (size_t i = 0; i < rows; ++i) {
        const size_t diag_col = std::min(i, last_col);
        matrix[i * cols + diag_col] = 0.0f;
    }
}
const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) {
for (int i = 0; i < cgraph->n_nodes; ++i) {
auto * op = cgraph->nodes[i];
+11 -8
View File
@@ -1,4 +1,3 @@
#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
#include "ggml-impl.h"
@@ -45,6 +44,7 @@ struct graph_key_hash {
// One decoder-cache entry: a decoder together with the mutex that is locked while the entry is in use.
struct decoder_runtime_ctx {
    // Takes shared ownership of the mutex for this entry; `ptr` starts out empty.
    decoder_runtime_ctx(std::shared_ptr<std::mutex> entry_mutex) : mutex(std::move(entry_mutex)) {}

    // Mutex associated with this entry.
    std::shared_ptr<std::mutex> mutex;
    // Decoder stored in this entry; null until one is assigned.
    std::shared_ptr<GgmlOvDecoder> ptr;
};
@@ -64,11 +64,7 @@ struct ov_runtime_context {
std::map<std::string, std::string> kv_state_input_name_map;
std::atomic<int> backend_count;
ov_runtime_context() :
device("CPU"),
stateful(false),
stateful_kv_size(0),
backend_count(0) {}
ov_runtime_context() : device("CPU"), stateful(false), stateful_kv_size(0), backend_count(0) {}
void clear_caches() {
std::lock_guard<std::mutex> lock(ctx_mutex);
@@ -87,6 +83,8 @@ enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph, std::share
size_t checksum(const void * data, size_t size);
bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path);
void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor);
void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst);
@@ -117,8 +115,6 @@ std::vector<T> pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t
padded_rows, padded_cols, pad_value);
}
void set_zero_diagonal(std::vector<float> & matrix, size_t rows, size_t cols);
const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph);
bool get_is_prefill(const ggml_tensor * inp_pos);
@@ -137,6 +133,13 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
bool is_naive(struct ggml_cgraph * cgraph);
/**
* @brief Heuristically checks whether the given computation graph is a split-model fragment.
* @param cgraph Pointer to the GGML computation graph to analyze.
* @return true if the graph is identified as split; otherwise false.
*/
bool is_model_splitted(struct ggml_cgraph * cgraph);
enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
ov::Core & core,
const std::string & device,
+1
View File
@@ -17,6 +17,7 @@
#include "common.hpp"
#include "concat.hpp"
#include "conv.hpp"
#include "conv3d.hpp"
#include "convert.hpp"
#include "count-equal.hpp"
#include "cpy.hpp"
+7
View File
@@ -287,6 +287,13 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1),
main_stream);
#ifdef GGML_SYCL_HAS_BF16
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) {
op()((const sycl::ext::oneapi::bfloat16 *) src0->data, (const sycl::ext::oneapi::bfloat16 *) src1->data,
(sycl::ext::oneapi::bfloat16 *) dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2,
ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, ggml_is_contiguous(src0),
ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream);
#endif
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
ggml_type_name(src0->type), ggml_type_name(src1->type));
+8
View File
@@ -230,6 +230,7 @@ struct sycl_device_info {
size_t total_vram;
sycl_hw_info hw_info;
optimize_feature opt_feature;
bool usm_system_support; // support for USM system allocations
};
@@ -323,6 +324,11 @@ void ggml_sycl_free_device(void *ptr, sycl::queue &q);
void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
// A pair of 32-bit indices forming one row-mapping record; ggml_backend_sycl_context keeps a host-side
// vector of these (mmid_row_mapping_host).
// NOTE(review): the meaning of i1 and i2 is not visible here. Presumably they are indices along
// tensor dimensions 1 and 2; confirm against the code that fills and reads the mapping.
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
namespace sycl_ex = sycl::ext::oneapi::experimental;
struct ggml_backend_sycl_context {
int device;
@@ -420,6 +426,8 @@ struct ggml_backend_sycl_context {
std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
std::vector<mmid_row_mapping> mmid_row_mapping_host;
static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
+21 -1
View File
@@ -10,6 +10,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#include "ggml.h"
#include "concat.hpp"
static inline size_t elem_size(ggml_type t) {
@@ -192,11 +194,29 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
case GGML_TYPE_F32:
concat_impl_sycl<float>(ctx, dst);
break;
case GGML_TYPE_F16:
concat_impl_sycl<sycl::half>(ctx, dst);
break;
#ifdef GGML_SYCL_HAS_BF16
case GGML_TYPE_BF16:
concat_impl_sycl<sycl::ext::oneapi::bfloat16>(ctx, dst);
break;
#endif
case GGML_TYPE_I32:
concat_impl_sycl<int32_t>(ctx, dst);
break;
case GGML_TYPE_I16:
concat_impl_sycl<int16_t>(ctx, dst);
break;
case GGML_TYPE_I64:
concat_impl_sycl<int64_t>(ctx, dst);
break;
case GGML_TYPE_I8:
concat_impl_sycl<int8_t>(ctx, dst);
break;
default:
GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type");
fprintf(stderr, "%s: unsupported types: dst: %s\n", __func__, ggml_type_name(dst->type));
GGML_ASSERT(false);
break;
}
}
+218
View File
@@ -0,0 +1,218 @@
#include "conv3d.hpp"
// Number of output patches: n times the product of the first three dimensions of `dst`,
// accumulated in 64 bits.
static inline int64_t ggml_sycl_conv3d_calc_patch_total(const ggml_tensor * dst, int32_t n) {
    int64_t total = n;
    total *= dst->ne[0];
    total *= dst->ne[1];
    total *= dst->ne[2];
    return total;
}
// Number of kernel values per output channel: the product of the first three dimensions of `src0`
// times c, accumulated in 64 bits.
static inline int64_t ggml_sycl_conv3d_calc_knl_n_total(const ggml_tensor * src0, int32_t c) {
    int64_t total = src0->ne[0];
    total *= src0->ne[1];
    total *= src0->ne[2];
    total *= c;
    return total;
}
// Scatters the GEMM result into the convolution output.
//
// `src` is read as a column-major (patch_total x oc) matrix: element (patch, channel) is at
// `patch + channel * patch_total`. Each value is written to `dst_data` at the byte offset derived
// from the strides of `dst`, with the patch index decomposed into (x, y, z, batch) using
// dst_w / dst_h / dst_d and dimension 3 indexed by `batch * oc + channel`.
// One work-item handles one value; the launch size is rounded up to a multiple of 256.
static inline void ggml_sycl_conv3d_write_output(
        const ggml_tensor * dst,
        const float * src, float * dst_data,
        int64_t patch_total, int64_t oc,
        int64_t dst_w, int64_t dst_h, int64_t dst_d,
        dpct::queue_ptr stream) {
    // Copy the strides into locals so the kernel captures plain values, not the tensor pointer.
    const int64_t stride_x  = dst->nb[0];
    const int64_t stride_y  = dst->nb[1];
    const int64_t stride_z  = dst->nb[2];
    const int64_t stride_cn = dst->nb[3];

    const int64_t n_values    = patch_total * oc;
    const int64_t group_size  = 256;
    const int64_t n_groups    = (n_values + group_size - 1) / group_size;
    const int64_t launch_size = n_groups * group_size;

    stream->parallel_for(sycl::range<1>(launch_size), [=](sycl::id<1> id) {
        const int64_t flat = id[0];
        if (flat >= n_values) {
            return;  // padding work-item
        }

        const int64_t patch   = flat / oc;
        const int64_t channel = flat % oc;

        // Split the patch index into batch and position inside the output volume.
        const int64_t plane    = dst_w * dst_h;
        const int64_t volume   = plane * dst_d;
        const int64_t batch    = patch / volume;
        const int64_t in_batch = patch % volume;
        const int64_t z        = in_batch / plane;
        const int64_t y        = (in_batch % plane) / dst_w;
        const int64_t x        = in_batch % dst_w;

        const int64_t cn          = batch * oc + channel;
        const int64_t byte_offset = x * stride_x + y * stride_y + z * stride_z + cn * stride_cn;

        // Column-major GEMM output: (row, col) lives at row + col * rows.
        const float value = src[patch + channel * patch_total];

        float * out = reinterpret_cast<float *>(reinterpret_cast<char *>(dst_data) + byte_offset);
        *out = value;
    });
}
// CONV_3D for the SYCL backend, implemented as pack (im2col) + GEMM + scatter.
//
//   src0 : convolution kernel, F16 or F32, contiguous
//   src1 : input, F32, contiguous
//   dst  : output, F32
//   op_params (int32): strides s0..s2, paddings p0..p2, dilations d0..d2, then c, n, oc, where
//                      dimension 3 of src1 is indexed by `batch * c + ic` and dimension 3 of dst
//                      by `batch * oc + out_ch`.
//
// Steps:
//   1. A_packed (k x m, column-major): column = output patch, row = kernel tap (ic, kz, ky, kx).
//      Taps that fall outside the input are stored as 0 (padding).
//   2. B_packed (k x oc, column-major): the kernel converted to F32.
//   3. C (m x oc) = A^T * B through oneMKL.
//   4. ggml_sycl_conv3d_write_output scatters C into dst using the strides of dst.
//
// The ggml_tensor descriptors are read here on the host only. Everything the device kernels need
// (data pointers, extents, strides, type flag) is copied into locals first, so the kernels never
// dereference a ggml_tensor pointer.
void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));

    const int32_t * opts = (const int32_t *) dst->op_params;
    const int32_t   s0   = opts[0];
    const int32_t   s1   = opts[1];
    const int32_t   s2   = opts[2];
    const int32_t   p0   = opts[3];
    const int32_t   p1   = opts[4];
    const int32_t   p2   = opts[5];
    const int32_t   d0   = opts[6];
    const int32_t   d1   = opts[7];
    const int32_t   d2   = opts[8];
    const int32_t   c    = opts[9];
    const int32_t   n    = opts[10];
    const int32_t   oc   = opts[11];

    // Kernel extents (KW, KH, KD) and output extents (PW, PH, PD).
    const int64_t KW = src0->ne[0];
    const int64_t KH = src0->ne[1];
    const int64_t KD = src0->ne[2];
    const int64_t PW = dst->ne[0];
    const int64_t PH = dst->ne[1];
    const int64_t PD = dst->ne[2];

    const int64_t patch_total = ggml_sycl_conv3d_calc_patch_total(dst, n);
    const int64_t knl_n_total = ggml_sycl_conv3d_calc_knl_n_total(src0, c);

    // GEMM dimensions, kept in 64 bits so large problems are not truncated.
    const int64_t m      = patch_total;
    const int64_t n_gemm = oc;
    const int64_t k      = knl_n_total;

    dpct::queue_ptr stream = ctx.stream();

    // Pool allocations; alloc() takes an element count, not a byte count.
    ggml_sycl_pool_alloc<float> gemm_output(ctx.pool());
    gemm_output.alloc((size_t) m * (size_t) n_gemm);

    ggml_sycl_pool_alloc<float> A_packed_alloc(ctx.pool());
    ggml_sycl_pool_alloc<float> B_packed_alloc(ctx.pool());
    A_packed_alloc.alloc((size_t) k * (size_t) m);
    B_packed_alloc.alloc((size_t) k * (size_t) n_gemm);

    float * A_packed = A_packed_alloc.get();
    float * B_packed = B_packed_alloc.get();
    float * C_data   = gemm_output.get();

    // Values from src1 that the packing kernel needs, captured by value.
    const char *  src1_base = (const char *) src1->data;
    const int64_t src1_ne0  = src1->ne[0];
    const int64_t src1_ne1  = src1->ne[1];
    const int64_t src1_ne2  = src1->ne[2];
    const int64_t src1_nb0  = src1->nb[0];
    const int64_t src1_nb1  = src1->nb[1];
    const int64_t src1_nb2  = src1->nb[2];
    const int64_t src1_nb3  = src1->nb[3];

    // src0 is contiguous, so it is read as a (knl_n_total x oc) matrix with these byte strides.
    const size_t  kernel_type_size = ggml_element_size(src0);
    const char *  src0_base        = (const char *) src0->data;
    const bool    src0_is_f32      = src0->type == GGML_TYPE_F32;
    const int64_t src0_packed_nb0  = kernel_type_size;
    const int64_t src0_packed_nb1  = kernel_type_size * knl_n_total;

    // Pack A with inline im2col: one work-item per (row, col) of the k x m matrix.
    const int64_t A_total      = k * m;
    const int64_t A_block_size = 256;
    const int64_t A_num_work   = ((A_total + A_block_size - 1) / A_block_size) * A_block_size;

    stream->parallel_for(sycl::range<1>(A_num_work), [=](sycl::id<1> id) {
        const int64_t t = id[0];
        if (t >= A_total) {
            return;
        }

        const int64_t row = t % k;  // kernel tap index
        const int64_t col = t / k;  // output patch index

        // Decompose the tap index into (ic, kz, ky, kx).
        const int64_t ic   = row / (KD * KH * KW);
        const int64_t rem  = row - ic * (KD * KH * KW);
        const int64_t kz   = rem / (KH * KW);
        const int64_t rem2 = rem - kz * (KH * KW);
        const int64_t ky   = rem2 / KW;
        const int64_t kx   = rem2 % KW;

        // Decompose the patch index into (batch, z, y, x).
        const int64_t p_in_batch = col % (PW * PH * PD);
        const int64_t batch_idx  = col / (PW * PH * PD);
        const int64_t dst_z      = p_in_batch / (PW * PH);
        const int64_t dst_y      = (p_in_batch % (PW * PH)) / PW;
        const int64_t dst_x      = p_in_batch % PW;

        // Input coordinate sampled by this tap.
        const int64_t sx = dst_x * s0 + kx * d0 - p0;
        const int64_t sy = dst_y * s1 + ky * d1 - p1;
        const int64_t sz = dst_z * s2 + kz * d2 - p2;

        float val = 0.0f;
        if (sx >= 0 && sx < src1_ne0 && sy >= 0 && sy < src1_ne1 && sz >= 0 && sz < src1_ne2) {
            const int64_t channel_idx = batch_idx * c + ic;
            const char *  ptr =
                src1_base + sx * src1_nb0 + sy * src1_nb1 + sz * src1_nb2 + channel_idx * src1_nb3;
            val = *(const float *) ptr;
        }
        A_packed[row + col * k] = val;
    });

    // Pack B: one work-item per (row, col) of the k x n_gemm matrix, converting F16 to F32.
    const int64_t B_total      = k * n_gemm;
    const int64_t B_block_size = 256;
    const int64_t B_num_work   = ((B_total + B_block_size - 1) / B_block_size) * B_block_size;

    stream->parallel_for(sycl::range<1>(B_num_work), [=](sycl::id<1> id) {
        const int64_t t = id[0];
        if (t >= B_total) {
            return;
        }

        const int64_t row = t % k;
        const int64_t col = t / k;

        const char * src_ptr = src0_base + row * src0_packed_nb0 + col * src0_packed_nb1;
        float        v;
        if (src0_is_f32) {
            v = *(const float *) src_ptr;
        } else {
            v = sycl::vec<sycl::half, 1>(*(const sycl::half *) src_ptr).convert<float, sycl::rounding_mode::automatic>()[0];
        }
        B_packed[row + col * k] = v;
    });

    // GEMM: C = A^T * B where A is (k x m), B is (k x n), C is (m x n), all column-major.
    const float   alpha = 1.0f;
    const float   beta  = 0.0f;
    const int64_t lda   = k;
    const int64_t ldb   = k;
    const int64_t ldc   = m;

    SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
        *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans,
        m, n_gemm, k,
        dpct::get_value(&alpha, *stream),
        (const float *) A_packed, lda,
        (const float *) B_packed, ldb,
        dpct::get_value(&beta, *stream),
        C_data, ldc)));

    ggml_sycl_conv3d_write_output(dst, (const float *) C_data, (float *) dst->data, patch_total, oc,
                                  dst->ne[0], dst->ne[1], dst->ne[2], stream);
}
+8
View File
@@ -0,0 +1,8 @@
// Public entry point of the SYCL CONV_3D operator.
#ifndef GGML_SYCL_CONV3D_HPP
#define GGML_SYCL_CONV3D_HPP
#include "common.hpp"
// Runs the 3D convolution described by `dst` on the backend context `ctx`.
// NOTE(review): the operands are presumably taken from dst->src[] - confirm against the implementation.
void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_CONV3D_HPP
+133 -1
View File
@@ -1022,6 +1022,120 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
}
}
// Matrix-vector product for Q5_K weights stored in the reordered (struct-of-arrays) layout.
// The work-group with index `row` (get_group(2)) accumulates the dot product of one weight
// row with `yy`; the per-lane partial sums are then added across the sub-group and the lane
// with local id 0 stores the result in dst[row].
//   vx    - base of the reordered buffer, split into [qs][qh][scales][dm] sections
//   yy    - float activations, read per block starting at yy + i*QK_K
//   dst   - output vector, one float per row
//   ncols - number of columns; ncols / QK_K gives the number of blocks per row
//   nrows - number of rows, used to compute the byte size of each section
//   item_ct1 - nd_item of the launching kernel
static void dequantize_mul_mat_vec_q5_k_reorder(const void *__restrict__ vx,
const float *__restrict__ yy,
float *__restrict__ dst,
const int ncols, int nrows,
const sycl::nd_item<3> &item_ct1) {
const int row = item_ct1.get_group(2);
const int num_blocks_per_row = ncols / QK_K;
// index of the first block that belongs to this row
const int ib0 = row*num_blocks_per_row;
// SOA base pointers for the reordered layout:
// [qs: nb * QK_K/2] [qh: nb * QK_K/8] [scales: nb * K_SCALE_SIZE] [dm: nb * sizeof(half2)]
const int nb = nrows * num_blocks_per_row;
const uint8_t * qs_base = (const uint8_t *)vx;
const uint8_t * qh_base = qs_base + (size_t)nb * (QK_K / 2);
const uint8_t * scales_base = qh_base + (size_t)nb * (QK_K / 8);
const sycl::half2 * dm_base = (const sycl::half2 *)(scales_base + (size_t)nb * K_SCALE_SIZE);
float tmp = 0; // partial sum for thread in warp
#if QK_K == 256
// bit masks used below to unpack the packed scale bytes
const uint16_t kmask1 = 0x3f3f;
const uint16_t kmask2 = 0x0f0f;
const uint16_t kmask3 = 0xc0c0;
// two adjacent lanes share the same tid; ix selects which of the two a lane is
const int tid = item_ct1.get_local_id(2) / 2; // 0...15
const int ix = item_ct1.get_local_id(2) % 2;
const int il = tid/4; // 0...3
const int ir = tid - 4*il;// 0...3
// number of consecutive values handled per group in the inner loop
const int n = 2;
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const int in = il%2;
const int l0 = n*(2*ir + in);
const int q_offset = 32*im + l0;
const int y_offset = 64*im + l0;
// masks selecting the high-bit planes inside the qh bytes
const uint8_t hm1 = 1 << (2*im);
const uint8_t hm2 = hm1 << 4;
// aux holds the unpacked scale bytes; sc views them as 8 individual bytes
uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux;
// q16 holds the unpacked 4-bit values; q4 views them as 16 individual bytes
uint16_t q16[8];
const uint8_t * q4 = (const uint8_t *)q16;
// lanes with ix == 0 process the even blocks of the row, lanes with ix == 1 the odd ones
for (int i = ix; i < num_blocks_per_row; i += 2) {
// bi is the global block index used to address every SoA section
const int bi = ib0 + i;
const uint8_t * ql1 = qs_base + bi * (QK_K / 2) + q_offset;
const uint8_t * qh = qh_base + bi * (QK_K / 8) + l0;
const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128;
// dm holds two half values per block: dall scales the quant sums, dmin scales smin
const sycl::half2 dm_val = dm_base[bi];
const float dall = dm_val[0];
const float dmin = dm_val[1];
const uint16_t * a = (const uint16_t *)(scales_base + bi * K_SCALE_SIZE);
// after unpacking, sc[0], sc[1], sc[4], sc[5] weight the quant sums and
// sc[2], sc[3], sc[6], sc[7] weight the smin term (see the accumulation below)
aux[0] = a[im+0] & kmask1;
aux[1] = a[im+2] & kmask1;
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
float smin = 0;
const uint16_t * q1 = (const uint16_t *)ql1;
const uint16_t * q2 = q1 + 32;
// split every loaded 16-bit word into its low and high nibbles
q16[0] = q1[0] & 0x0f0f;
q16[1] = q1[8] & 0x0f0f;
q16[2] = (q1[0] >> 4) & 0x0f0f;
q16[3] = (q1[8] >> 4) & 0x0f0f;
q16[4] = q2[0] & 0x0f0f;
q16[5] = q2[8] & 0x0f0f;
q16[6] = (q2[0] >> 4) & 0x0f0f;
q16[7] = (q2[8] >> 4) & 0x0f0f;
// each quant value is its 4-bit nibble plus 16 when the matching qh bit is set
for (int l = 0; l < n; ++l) {
sum.x() +=
y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
sum.y() +=
y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
sum.z() +=
y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
sum.w() +=
y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
}
// block contribution: scaled quant sums minus the scaled minimum term
tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
sum.w() * sc[5]) -
dmin * smin;
}
#else
// The reordered Q5_K layout is only produced for QK_K == 256.
// Nothing is accumulated in this configuration, so tmp stays 0 and dst[row] is written as 0.
#endif
// sum up partial sums and write back result
#pragma unroll
for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
tmp +=
dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
}
// only the lane with local id 0 stores the row result
if (item_ct1.get_local_id(2) == 0) {
dst[row] = tmp;
}
}
static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
const sycl::nd_item<3> &item_ct1) {
@@ -1599,6 +1713,19 @@ static void dequantize_mul_mat_vec_q4_K_sycl_reorder(const void *vx, const float
});
}
static void dequantize_mul_mat_vec_q5_K_sycl_reorder(const void *vx, const float *y,
float *dst, const int ncols,
const int nrows,
dpct::queue_ptr stream) {
GGML_ASSERT(ncols % QK_K == 0);
const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
stream->parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
dequantize_mul_mat_vec_q5_k_reorder(vx, y, dst, ncols, nrows, item_ct1);
});
}
static void dequantize_mul_mat_vec_q6_K_sycl_reorder(const void *vx, const float *y,
float *dst, const int ncols,
const int nrows,
@@ -1695,7 +1822,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
}
break;
case GGML_TYPE_Q5_K:
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
dequantize_mul_mat_vec_q5_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
} else {
dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
}
break;
case GGML_TYPE_Q6_K:
if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+25 -51
View File
@@ -124,6 +124,11 @@ static __dpct_inline__ T op_exp(T x) {
return sycl::exp(x);
}
// Element-wise helper: returns sycl::expm1(x), i.e. exp(x) - 1, in the input type T.
template<typename T>
static __dpct_inline__ T op_expm1(T x) {
return sycl::expm1(x);
}
template<typename T>
static __dpct_inline__ T op_log(T x) {
if (x <= static_cast<T>(0)) {
@@ -266,13 +271,6 @@ static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl:
}
}
// Kernel body writing dst[i] = op_floor(x[i]).
// The index `i` and its bounds come from the SYCL_GLOBAL_ID_LOOP macro (defined elsewhere);
// presumably it visits the indices below k assigned to this work-item - confirm against the macro.
template<typename T>
static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_floor(x[i]);
}
}
template<typename T>
static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
@@ -280,20 +278,6 @@ static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::
}
}
// Kernel body writing dst[i] = op_round(x[i]).
// The index `i` and its bounds come from the SYCL_GLOBAL_ID_LOOP macro (defined elsewhere);
// presumably it visits the indices below k assigned to this work-item - confirm against the macro.
template<typename T>
static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_round(x[i]);
}
}
// Kernel body writing dst[i] = op_trunc(x[i]).
// The index `i` and its bounds come from the SYCL_GLOBAL_ID_LOOP macro (defined elsewhere);
// presumably it visits the indices below k assigned to this work-item - confirm against the macro.
template<typename T>
static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_trunc(x[i]);
}
}
template<typename T>
static void clamp(const T * x, T * dst, const float min, const float max, const int k,
const sycl::nd_item<1> &item_ct1) {
@@ -605,6 +589,12 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
});
}
// EXPM1 unary operator: hands a lambda that applies op_expm1 to a single value to the
// shared ggml_sycl_op_unary helper, which receives `ctx` and the destination tensor `dst`.
static inline void ggml_sycl_op_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_expm1(x);
});
}
static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
@@ -728,16 +718,9 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
}
static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, 256);
stream->parallel_for(
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
sycl::range<1>(256)),
[=](sycl::nd_item<1> item_ct1) {
unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_floor(x);
});
}
static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -747,29 +730,15 @@ static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tenso
}
static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, 256);
stream->parallel_for(
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
sycl::range<1>(256)),
[=](sycl::nd_item<1> item_ct1) {
unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_round(x);
});
}
static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, 256);
stream->parallel_for(
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
sycl::range<1>(256)),
[=](sycl::nd_item<1> item_ct1) {
unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_trunc(x);
});
}
static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
@@ -1018,6 +987,11 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_exp(ctx, dst);
}
// Public EXPM1 entry point: creates the scoped debug printer for this call (one source
// tensor) and forwards to ggml_sycl_op_expm1.
void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_expm1(ctx, dst);
}
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_log(ctx, dst);
+2
View File
@@ -59,6 +59,8 @@ void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+289 -50
View File
@@ -72,6 +72,9 @@
#include "ggml-sycl/gated_delta_net.hpp"
#include "ggml-sycl/pool.hpp"
// 2 MiB in bytes; used as the alignment of device buffer allocations
#define MEM_SIZE_2M 0x00200000
// 1 GiB in bytes; minimum buffer size considered for USM system allocations
#define MEM_SIZE_1G 0x40000000
static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
@@ -83,7 +86,7 @@ int g_ggml_sycl_use_async_mem_op = 0;
int g_ggml_sycl_use_async_mem_op_requested = 1;
int g_ggml_sycl_enable_level_zero = 0;
int g_ggml_sycl_enable_flash_attention = 1;
int g_ggml_sycl_usm_system = 0;
static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@@ -137,6 +140,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu);
info.devices[i].smpbo = prop.get_local_mem_size();
info.devices[i].warp_size = WARP_SIZE;
info.devices[i].usm_system_support = device.has(sycl::aspect::usm_system_allocations);
info.max_work_group_sizes[i] = prop.get_max_work_group_size();
info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
@@ -274,6 +278,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention = 0;
#endif
g_ggml_sycl_usm_system = ggml_sycl_get_env("GGML_SYCL_USM_SYSTEM", 0);
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("Build with Macros:\n");
@@ -342,6 +348,8 @@ static void ggml_check_sycl() try {
g_ggml_sycl_enable_flash_attention);
#endif
GGML_LOG_INFO(" GGML_SYCL_USM_SYSTEM: %d\n", g_ggml_sycl_usm_system);
/* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX)
fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
@@ -417,6 +425,14 @@ catch (sycl::exception const &exc) {
std::exit(1);
}
// Releases a host block: _aligned_free() on Windows, free() on every other platform.
inline void free_aligned_mem_host(void * memblock) {
#if defined(_WIN32)
    _aligned_free(memblock);
#else
    free(memblock);
#endif
}
// sycl buffer
struct ggml_backend_sycl_buffer_context {
@@ -426,9 +442,10 @@ struct ggml_backend_sycl_buffer_context {
std::string name;
optimize_feature opt_feature;
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
bool is_usm_system;
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) :
device(device), dev_ptr(dev_ptr), stream(stream) {
ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_usm_system) :
device(device), dev_ptr(dev_ptr), stream(stream), is_usm_system(is_usm_system) {
check_allow_gpu_index(device);
name = (GGML_SYCL_NAME + std::to_string(device));
opt_feature = ggml_sycl_info().devices[device].opt_feature;
@@ -437,7 +454,10 @@ struct ggml_backend_sycl_buffer_context {
~ggml_backend_sycl_buffer_context() {
if (dev_ptr != nullptr) {
ggml_sycl_set_device(device);
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
if (is_usm_system)
free_aligned_mem_host(dev_ptr);
else
SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream)));
}
//release extra used by tensors
@@ -759,21 +779,59 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t
return ctx->name.c_str();
}
// Decides whether a buffer of `size` bytes on `device` should use a USM system allocation.
// Returns true only when g_ggml_sycl_usm_system is set, the size is at least MEM_SIZE_1G and
// the device reports usm_system_support; an unsupported device is logged and yields false.
static bool check_usm_system(int device, size_t size) {
    // feature switched off or buffer too small: nothing to check, nothing to log
    if (!g_ggml_sycl_usm_system || size < MEM_SIZE_1G) {
        return false;
    }

    if (!ggml_sycl_info().devices[device].usm_system_support) {
        GGML_LOG_INFO("Device does not support USM system allocations\n");
        return false;
    }

    return true;
}
// Allocates at least `size` bytes of host memory aligned to `alignment` bytes.
// Returns nullptr on failure. Release the block with free_aligned_mem_host().
inline void * aligned_malloc_host(size_t alignment, size_t size) {
#ifdef _WIN32
    return _aligned_malloc(size, alignment);
#else
    // aligned_alloc() is only specified for sizes that are an integral multiple of the
    // alignment (some C libraries return NULL otherwise), so round the request up
    if (alignment != 0) {
        const size_t remainder = size % alignment;
        if (remainder != 0) {
            const size_t padding = alignment - remainder;
            if (size > (size_t) -1 - padding) {
                return nullptr; // rounding up would overflow size_t
            }
            size += padding;
        }
    }
    return aligned_alloc(alignment, size);
#endif
}
static ggml_backend_buffer_t
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
size_t size) try {
ggml_check_sycl();
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
ggml_sycl_set_device(buft_ctx->device);
const queue_ptr stream = buft_ctx->stream;
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
/*
Alignment below ensures best performance. While in theory it could lead to
wasting memory, this is acceptable because in practice only few buffers are
allocated and even less exceed the minimum size accepted here for USM system
allocations.
*/
size_t alignment = MEM_SIZE_2M;
size_t aligned_size = ((size + alignment - 1) / alignment) * alignment;
bool use_usm_system = check_usm_system(buft_ctx->device, aligned_size);
void * dev_ptr;
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
if (use_usm_system) {
dev_ptr = (void *)aligned_malloc_host(alignment, aligned_size);
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
return nullptr;
}
} else {
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream)));
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
}
}
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, use_usm_system);
return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
}
catch (sycl::exception const &exc) {
@@ -1300,22 +1358,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_
GGML_UNUSED(buft);
}
// Allocates `size` bytes of host memory aligned to `alignment` bytes:
// _aligned_malloc() on Windows, aligned_alloc() elsewhere.
// Note the swapped argument order between the two underlying calls.
inline void * aligned_malloc_host(size_t alignment, size_t size) {
#ifdef _WIN32
return _aligned_malloc(size, alignment);
#else
return aligned_alloc(alignment, size);
#endif
}
// Releases a host block: _aligned_free() on Windows, free() elsewhere.
inline void free_aligned_mem_host(void * memblock) {
#ifdef _WIN32
_aligned_free(memblock);
#else
free(memblock);
#endif
}
// Frees a host buffer: buffer->context is itself the pointer handed to free_aligned_mem_host().
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free_aligned_mem_host((void *)buffer->context);
}
@@ -3685,6 +3727,149 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
return true;
}
// Reorder each expert slice into a self-contained SoA layout.
// Q4_K variant: every slice of `expert_bytes` bytes is rewritten in place as [qs][scales][dm].
//   data_device  - device buffer holding n_expert consecutive slices
//   expert_bytes - byte size of one slice; must be a multiple of sizeof(block_q4_K)
//   n_expert     - number of slices
//   stream       - queue used for the copy and the reorder kernel
// Returns false (data untouched) when the temporary buffer cannot be allocated, true otherwise.
static bool reorder_qw_q4_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
GGML_ASSERT(expert_bytes % sizeof(block_q4_K) == 0);
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q4_K));
const size_t total_bytes = expert_bytes * (size_t) n_expert;
sycl_reorder_temp_buffer tmp(stream, total_bytes);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
// snapshot the original block layout so the kernel can read it while overwriting data_device
sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}
// one work-item per block across all experts
const int total_blocks = blocks_per_expert * (int) n_expert;
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
const int gb = gb_;
// e: expert index, ib: block index inside that expert
const int e = gb / blocks_per_expert;
const int ib = gb % blocks_per_expert;
// source blocks come from the snapshot, destination sections live in the same expert slice
const block_q4_K * x = (const block_q4_K *) (tmp_buf + (size_t) e * expert_bytes);
uint8_t * base = data_device + (size_t) e * expert_bytes;
auto * qs_ptr = base;
auto * scales_ptr = qs_ptr + QK_K / 2 * blocks_per_expert;
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert);
for (int j = 0; j < QK_K / 2; ++j) {
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
}
for (int j = 0; j < K_SCALE_SIZE; ++j) {
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
}
dm_ptr[ib] = x[ib].dm;
});
// the host only blocks on the kernel when async memory operations are disabled
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
return true;
}
// Reorder each Q5_K expert slice into [qs][qh][scales][dm].
// The rewrite happens in place, one slice of `expert_bytes` bytes per expert.
//   data_device  - device buffer holding n_expert consecutive slices
//   expert_bytes - byte size of one slice; must be a multiple of sizeof(block_q5_K)
//   n_expert     - number of slices
//   stream       - queue used for the copy and the reorder kernel
// Returns false (data untouched) when the temporary buffer cannot be allocated, true otherwise.
static bool reorder_qw_q5_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
GGML_ASSERT(expert_bytes % sizeof(block_q5_K) == 0);
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q5_K));
const size_t total_bytes = expert_bytes * (size_t) n_expert;
sycl_reorder_temp_buffer tmp(stream, total_bytes);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
// snapshot the original block layout so the kernel can read it while overwriting data_device
sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}
// one work-item per block across all experts
const int total_blocks = blocks_per_expert * (int) n_expert;
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
const int gb = gb_;
// e: expert index, ib: block index inside that expert
const int e = gb / blocks_per_expert;
const int ib = gb % blocks_per_expert;
// source blocks come from the snapshot, destination sections live in the same expert slice
const block_q5_K * x = (const block_q5_K *) (tmp_buf + (size_t) e * expert_bytes);
uint8_t * base = data_device + (size_t) e * expert_bytes;
auto * qs_ptr = base;
auto * qh_ptr = qs_ptr + (QK_K / 2) * blocks_per_expert;
auto * scales_ptr = qh_ptr + (QK_K / 8) * blocks_per_expert;
auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert);
for (int j = 0; j < QK_K / 2; ++j) {
qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
}
for (int j = 0; j < QK_K / 8; ++j) {
qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j];
}
for (int j = 0; j < K_SCALE_SIZE; ++j) {
scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
}
dm_ptr[ib] = x[ib].dm;
});
// the host only blocks on the kernel when async memory operations are disabled
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
return true;
}
// Reorder each Q6_K expert slice into [ql][qh][scales][d].
// The rewrite happens in place, one slice of `expert_bytes` bytes per expert.
//   data_device  - device buffer holding n_expert consecutive slices
//   expert_bytes - byte size of one slice; must be a multiple of sizeof(block_q6_K)
//   n_expert     - number of slices
//   stream       - queue used for the copy and the reorder kernel
// Returns false (data untouched) when the temporary buffer cannot be allocated, true otherwise.
static bool reorder_qw_q6_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) {
GGML_ASSERT(expert_bytes % sizeof(block_q6_K) == 0);
const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q6_K));
const size_t total_bytes = expert_bytes * (size_t) n_expert;
sycl_reorder_temp_buffer tmp(stream, total_bytes);
if (!tmp) {
GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes);
return false;
}
uint8_t * tmp_buf = static_cast<uint8_t *>(tmp.ptr);
// snapshot the original block layout so the kernel can read it while overwriting data_device
sycl::event copy_event;
SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes)));
if (!g_ggml_sycl_use_async_mem_op) {
copy_event.wait();
}
// one work-item per block across all experts
const int total_blocks = blocks_per_expert * (int) n_expert;
auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) {
const int gb = gb_;
// e: expert index, ib: block index inside that expert
const int e = gb / blocks_per_expert;
const int ib = gb % blocks_per_expert;
// source blocks come from the snapshot, destination sections live in the same expert slice
const block_q6_K * x = (const block_q6_K *) (tmp_buf + (size_t) e * expert_bytes);
uint8_t * base = data_device + (size_t) e * expert_bytes;
auto * ql_ptr = base;
auto * qh_ptr = ql_ptr + (QK_K / 2) * blocks_per_expert;
auto * scales_ptr = qh_ptr + (QK_K / 4) * blocks_per_expert;
auto * d_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * blocks_per_expert);
for (int j = 0; j < QK_K / 2; ++j) {
ql_ptr[ib * (QK_K / 2) + j] = x[ib].ql[j];
}
for (int j = 0; j < QK_K / 4; ++j) {
qh_ptr[ib * (QK_K / 4) + j] = x[ib].qh[j];
}
for (int j = 0; j < QK_K / 16; ++j) {
scales_ptr[ib * (QK_K / 16) + j] = x[ib].scales[j];
}
d_ptr[ib] = x[ib].d;
});
// the host only blocks on the kernel when async memory operations are disabled
if (!g_ggml_sycl_use_async_mem_op) {
reorder_event.wait_and_throw();
}
return true;
}
static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
GGML_ASSERT(size % sizeof(block_q3_K) == 0);
GGML_ASSERT(offset % sizeof(block_q3_K) == 0);
@@ -3840,6 +4025,22 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
size_t nrows = src0->ne[1];
size_t size = ggml_nbytes(src0);
// MoE expert weights are addressed per expert via nb[2], so each slice must
// remain self-contained after reorder.
if (src0->ne[2] > 1) {
GGML_ASSERT((size_t) size == (size_t) src0->ne[2] * src0->nb[2]);
switch (src0->type) {
case GGML_TYPE_Q4_K:
return reorder_qw_q4_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
case GGML_TYPE_Q5_K:
return reorder_qw_q5_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
case GGML_TYPE_Q6_K:
return reorder_qw_q6_k_moe(data_device, src0->nb[2], src0->ne[2], stream);
default:
return false;
}
}
switch (src0->type) {
case GGML_TYPE_Q4_0:
return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
@@ -3854,7 +4055,6 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
case GGML_TYPE_Q6_K:
return reorder_qw_q6_k(data_device, size, 0, stream);
default:
GGML_ABORT("reorder_qw() called with unsupported type");
return false;
}
}
@@ -3902,6 +4102,23 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
}
}
// One-shot, on-demand reorder of MoE expert weights.
// Does nothing when optimizations are disabled, the context lacks the reorder feature, the
// weight type is not Q4_K / Q5_K / Q6_K, the tensor has no extra, or it is already marked as
// reordered. The reorder flag is set only when reorder_qw() reports success.
static void opt_for_reorder_id(ggml_backend_sycl_context * ctx, const ggml_tensor * src0) {
    const bool reorder_enabled = !g_ggml_sycl_disable_optimize && ctx->opt_feature.reorder;
    if (!reorder_enabled) {
        return;
    }

    // only these quantization types are handled here
    switch (src0->type) {
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
            break;
        default:
            return;
    }

    auto * tensor_extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
    if (tensor_extra == nullptr || tensor_extra->optimized_feature.reorder) {
        return;
    }

    const bool reordered = reorder_qw(src0, ctx->stream());
    if (reordered) {
        tensor_extra->optimized_feature.reorder = true;
    }
}
static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
@@ -4007,11 +4224,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
}
// Pair of 32-bit indices describing where one routed row comes from.
// NOTE(review): presumably i1/i2 are the slot/token pair consumed by
// k_copy_src1_to_contiguous - confirm which field is which.
struct mmid_row_mapping {
int32_t i1;
int32_t i2;
};
__dpct_inline__ static void k_copy_src1_to_contiguous(
const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
const mmid_row_mapping *__restrict__ row_mapping,
@@ -4067,11 +4279,6 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false;
if (!ggml_is_contiguous(src1)) return false;
// Reorder layout not supported; fall back.
const ggml_tensor_extra_gpu * src0_extra =
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
if (src0_extra && src0_extra->optimized_feature.reorder) return false;
const int64_t n_ids_per_group = ids->ne[0];
if (ids->ne[1] != 1) return false;
if (ne11 != 1 && ne11 != n_ids_per_group) return false;
@@ -4081,16 +4288,37 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
const int n_experts_used = (int) n_ids_per_group;
const int nrows = (int) src0->ne[1];
// Lazily reorder the (Q4_K) expert weights into a per-expert SoA layout, then run the reorder
// GEMV. Placed after the bail checks so a non-dispatchable op does not pay the reorder cost.
opt_for_reorder_id(&ctx, src0);
const ggml_tensor_extra_gpu * src0_extra =
static_cast<const ggml_tensor_extra_gpu *>(src0->extra);
const bool use_reorder = src0_extra && src0_extra->optimized_feature.reorder;
ggml_sycl_pool_alloc<char> src1_q8_alloc(ctx.pool(),
(size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1);
char * src1_ddq = src1_q8_alloc.get();
quantize_row_q8_1_sycl<quantize_q8_1>(
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
src1_padded_cols, stream);
if (use_reorder) {
quantize_row_q8_1_sycl<quantize_and_reorder_q8_1_soa>(
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
src1_padded_cols, stream);
} else {
quantize_row_q8_1_sycl<quantize_q8_1>(
(const float *) src1->data, src1_ddq, (int) ne10, (int) ne11,
src1_padded_cols, stream);
}
const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1;
const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow;
if (use_reorder) {
return ggml_sycl_mul_mat_vec_q_id_reorder(
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
(float *) dst->data, (int) ne10, nrows, n_experts_used,
/*expert_weight_stride=*/ src0->nb[2],
/*dst_row_stride=*/ dst->nb[1],
src1_row_stride, stream);
}
return ggml_sycl_mul_mat_vec_q_id(
src0->type, src0->data, src1_ddq, (const int32_t *) ids->data,
(float *) dst->data, (int) ne10, nrows, n_experts_used,
@@ -4166,6 +4394,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
SYCL_CHECK(CHECK_TRY_ERROR(
stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
// also ensures ctx.mmid_row_mapping_host is drained before we use it again
SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
ggml_tensor src0_row = *src0;
@@ -4223,7 +4453,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
// where each expert's slice starts and the previous ends (row indices, right-exclusive)
std::vector<int64_t> expert_row_offsets;
// the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
std::vector<mmid_row_mapping> routed_row_src;
std::vector<mmid_row_mapping> & routed_row_src = ctx.mmid_row_mapping_host;
mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
expert_row_counts, expert_row_offsets, routed_row_src);
@@ -4342,6 +4572,11 @@ static void ggml_sycl_im2col_3d(ggml_backend_sycl_context & ctx, ggml_tensor * d
ggml_sycl_op_im2col_3d(ctx, dst);
}
// CONV_3D dispatch wrapper: creates the scoped debug printer for this call (two source
// tensors) and forwards to ggml_sycl_op_conv_3d.
static void ggml_sycl_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
ggml_sycl_op_conv_3d(ctx, dst);
}
static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
@@ -4408,6 +4643,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_sycl_op_conv_transpose_1d(ctx, dst);
break;
case GGML_OP_CONV_3D:
ggml_sycl_conv_3d(ctx, dst);
break;
case GGML_OP_REPEAT:
ggml_sycl_repeat(ctx, dst);
break;
@@ -4489,6 +4727,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_UNARY_OP_EXP:
ggml_sycl_exp(ctx, dst);
break;
case GGML_UNARY_OP_EXPM1:
ggml_sycl_expm1(ctx, dst);
break;
case GGML_UNARY_OP_SOFTPLUS:
ggml_sycl_softplus(ctx, dst);
break;
@@ -5138,6 +5379,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_EXPM1:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_CEIL:
@@ -5145,11 +5387,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_TRUNC:
#if defined (GGML_SYCL_F16)
return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
#else
return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
#endif
return true;
default:
return false;
}
@@ -5352,11 +5590,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_LOG:
#if defined (GGML_SYCL_F16)
return ((op->type == GGML_TYPE_F32 || op->type == GGML_SYCL_F16) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_SYCL_F16) && (op->type == op->src[0]->type));
#else
return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
#endif
case GGML_OP_NORM:
case GGML_OP_L2_NORM:
case GGML_OP_GROUP_NORM:
@@ -5390,6 +5623,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_IM2COL_3D:
case GGML_OP_UPSCALE:
return true;
case GGML_OP_CONV_3D:
return op->type == GGML_TYPE_F32 &&
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
op->src[1]->type == GGML_TYPE_F32 &&
ggml_is_contiguous(op->src[0]) &&
ggml_is_contiguous(op->src[1]);
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
+115
View File
@@ -2468,3 +2468,118 @@ bool ggml_sycl_mul_mat_vec_q_id(
return false;
}
}
// Reorder (SoA) MoE expert GEMV: MoE expert/row/lane indexing (from mul_mat_vec_q_moe) with the
// dense-reorder per-block reads (from mul_mat_vec_q_reorder). Each expert slice in vx_base is a
// self-contained SoA, so nblocks = nrows*(ncols/qk) per expert and the constant expert stride holds.
//
// Work decomposition, as implied by the index math below:
//   - group(1) picks the entry of ids_dev that this work-group handles
//   - group(2) * local_range(1) + local_id(1) picks the output row
//   - the lanes of one sub-group accumulate partial dot products for that row and are reduced at the end
//
// Parameters:
//   vx_base               base of the expert weights; the selected expert starts at
//                         vx_base + ids_dev[slot] * expert_weight_stride (stride in bytes)
//   vy_base               base of the quantized activations; slot s starts at vy_base + s * src1_row_stride (bytes)
//   dst_base              base of the float output; slot s starts at dst_base + s * dst_row_stride (bytes)
//   ids_dev               expert index for each slot
//   ncols, nrows          columns / rows of one expert matrix (ncols / block_traits::qk blocks per row)
//   item_ct1              nd_item of the calling work-item
template <typename reorder_vec_dot_q_sycl>
static void mul_mat_vec_q_moe_reorder(
const void * __restrict__ vx_base, const void * __restrict__ vy_base,
float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev,
const int ncols, const int nrows,
const size_t expert_weight_stride, const size_t dst_row_stride,
const size_t src1_row_stride,
const sycl::nd_item<3> & item_ct1) {
using block_type = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
using block_traits = typename block_type::traits;
// Slot within ids_dev handled by this work-group, and the expert that slot maps to.
const int expert_idx = item_ct1.get_group(1);
const int i02 = ids_dev[expert_idx];
// Weights are addressed by expert id; activations and output are addressed by slot.
const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride;
const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride;
float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride);
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1);
// Rows past the end of the matrix have nothing to compute.
if (row >= nrows) {
return;
}
const auto sg = item_ct1.get_sub_group();
const int blocks_per_row = ncols / block_traits::qk;
constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
// Total number of blocks in one expert slice; passed to get_block_offset below.
const int nblocks = nrows * (ncols / block_traits::qk);
static_assert(blocks_per_subgroup > 0);
static_assert(block_elements_per_subgroup > 0);
float partial_sum = 0.0f;
// Each group of block_elements_per_subgroup lanes starts on its own block and strides by blocks_per_subgroup.
for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
// Linear block index inside the expert slice (row-major over blocks).
const int ibx = row * blocks_per_row + i;
const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx);
// Index of the first q8_1 block of the activations that pairs with weight block i.
const int iby = i * block_type::block_to_q8_1_ratio();
const int8_t * q8_1_quant_ptr = (const int8_t *) vy + iby * QK8_1;
// The half2 values are read starting ncols bytes into vy, i.e. after ncols int8 quants.
const sycl::half2 * q8_1_ds_ptr = (const sycl::half2 *) ((const char *) vy + ncols + iby * sizeof(sycl::half2));
#pragma unroll
for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
// Position inside the block that this lane processes.
const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
}
}
// Combine the per-lane partial sums; only the sub-group leader stores the row result.
auto sum = sycl::reduce_over_group(sg, partial_sum, std::plus<>());
if (sg.leader()) {
dst[row] = sum;
}
}
template <typename reorder_vec_dot_q_sycl>
static void launch_mul_mat_vec_q_moe_reorder(
const void * vx_base, const void * vy, const int32_t * ids_dev,
float * dst_base, const int ncols, const int nrows, const int n_experts_used,
const size_t expert_weight_stride, const size_t dst_row_stride,
const size_t src1_row_stride,
dpct::queue_ptr stream) {
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y);
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
stream->submit([&](sycl::handler & cgh) {
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl>(
vx_base, vy, dst_base, ids_dev, ncols, nrows,
expert_weight_stride, dst_row_stride, src1_row_stride, item);
});
});
}
// Select the reorder (SoA) MoE GEMV kernel matching src0_type and enqueue it on `stream`.
// Q4_K, Q5_K and Q6_K are handled; for any other type nothing is launched.
// Returns true if a kernel was enqueued, false otherwise.
bool ggml_sycl_mul_mat_vec_q_id_reorder(
        enum ggml_type src0_type,
        const void * vx_base,
        const void * vy,
        const int32_t * ids_dev,
        float * dst_base,
        int ncols,
        int nrows,
        int n_experts_used,
        size_t expert_weight_stride,
        size_t dst_row_stride,
        size_t src1_row_stride,
        dpct::queue_ptr stream) {
    bool launched = true;

    switch (src0_type) {
        case GGML_TYPE_Q4_K:
            launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(
                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
            break;
        case GGML_TYPE_Q5_K:
            launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q5_K>>(
                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
            break;
        case GGML_TYPE_Q6_K:
            launch_mul_mat_vec_q_moe_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(
                vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used,
                expert_weight_stride, dst_row_stride, src1_row_stride, stream);
            break;
        default:
            // No reorder MoE kernel for this quantization type.
            launched = false;
            break;
    }

    return launched;
}
+17
View File
@@ -40,4 +40,21 @@ bool ggml_sycl_mul_mat_vec_q_id(
size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes
dpct::queue_ptr stream);
// Reorder (SoA) variant of the fused MoE expert GEMV.
// vx_base: each expert slice (stride expert_weight_stride == src0->nb[2]) is a self-contained reorder/SoA layout.
// vy: src1 quantized with quantize_and_reorder_q8_1_soa (per-row SoA). Returns false if src0_type isn't handled.
// Handled src0_type values: GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K. When false is returned no kernel
// has been enqueued.
bool ggml_sycl_mul_mat_vec_q_id_reorder(
enum ggml_type src0_type, // quantization type of the expert weights; selects the kernel
const void * vx_base, // base pointer of the expert weights
const void * vy, // base pointer of the quantized src1
const int32_t * ids_dev, // read inside the kernel: expert index for each used-expert slot
float * dst_base, // base pointer of the float output
int ncols, // columns of one expert matrix
int nrows, // rows of one expert matrix (= outputs written per slot)
int n_experts_used, // number of slots in ids_dev to process
size_t expert_weight_stride, // byte distance between consecutive experts in vx_base
size_t dst_row_stride, // byte distance between the outputs of consecutive slots
size_t src1_row_stride, // byte distance between the src1 data of consecutive slots
dpct::queue_ptr stream); // queue the kernel is submitted to
#endif // GGML_SYCL_MMVQ_HPP
+106 -15
View File
@@ -902,14 +902,17 @@ struct vk_device_struct {
vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
vk_pipeline pipeline_timestep_embedding_f32;
vk_pipeline pipeline_conv_transpose_1d_f32;
vk_pipeline pipeline_col2im_1d_f32;
vk_pipeline pipeline_col2im_1d_f16;
vk_pipeline pipeline_col2im_1d_bf16;
vk_pipeline pipeline_snake_f32;
vk_pipeline pipeline_snake_f16;
vk_pipeline pipeline_snake_bf16;
vk_pipeline pipeline_pool2d_f32;
vk_pipeline pipeline_rwkv_wkv6_f32;
vk_pipeline pipeline_rwkv_wkv7_f32;
// [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128
vk_pipeline pipeline_gated_delta_net[3][2];
// [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128
vk_pipeline pipeline_gated_delta_net[4][2];
vk_pipeline pipeline_ssm_scan_f32_d128;
vk_pipeline pipeline_ssm_scan_f32_d256;
vk_pipeline pipeline_ssm_conv_f32;
@@ -1552,6 +1555,16 @@ struct vk_op_timestep_embedding_push_constants {
uint32_t max_period;
};
// Push constants for the col2im_1d shader. Field order and types must stay in sync with the
// `push_constant` block declared in the col2im_1d compute shader.
struct vk_op_col2im_1d_push_constants {
uint32_t T_out; // dst->ne[0]: output length
uint32_t OC; // op_params[1]: number of output channels
uint32_t K_OC; // src0->ne[0]: length of one input column
uint32_t T_in; // src0->ne[1]: number of input columns
uint32_t K; // K_OC / OC
int32_t stride; // op_params[0]
int32_t p0; // op_params[2]: offset added to the output index to get the uncropped position
};
struct vk_op_conv_transpose_1d_push_constants {
uint32_t Cout;
uint32_t Cin;
@@ -2995,13 +3008,13 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
if (memory_type_indices.empty()) {
continue;
}
buf->memory_property_flags = req_flags;
bool done = false;
for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) {
try {
buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info });
buf->memory_property_flags = mem_props.memoryTypes[*mtype_it].propertyFlags;
done = true;
break;
} catch (const vk::SystemError& e) {
@@ -3067,8 +3080,10 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) {
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal});
} else if (device->uma) {
// Fall back to host memory type
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal,
// On UMA, prefer host-visible memory so direct tensor borrowing works.
// If unavailable, fall back to device-local memory.
buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent,
vk::MemoryPropertyFlagBits::eDeviceLocal,
vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent});
} else if (device->disable_host_visible_vidmem) {
if (device->allow_sysmem_fallback) {
@@ -5203,6 +5218,9 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f32, "col2im_1d_f32", col2im_1d_f32_len, col2im_1d_f32_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f16, "col2im_1d_f16", col2im_1d_f16_len, col2im_1d_f16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_bf16, "col2im_1d_bf16", col2im_1d_bf16_len, col2im_1d_bf16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
@@ -5215,14 +5233,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
{
const uint32_t gdn_sizes[] = {32, 64, 128};
const uint32_t gdn_sizes[] = {16, 32, 64, 128};
const char * gdn_names[][2] = {
{"gated_delta_net_f32_d16", "gated_delta_net_f32_d16_kda"},
{"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"},
{"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"},
{"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"},
};
const bool use_subgroup_reduce = device->subgroup_arithmetic;
for (uint32_t si = 0; si < 3; si++) {
for (uint32_t si = 0; si < 4; si++) {
const uint32_t S_V = gdn_sizes[si];
GGML_ASSERT(is_pow2(S_V));
@@ -5236,10 +5254,29 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
lanes_per_column = std::min(S_V, device->subgroup_size);
}
const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size);
// gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and
// S_V % LANES_PER_COLUMN == 0 to avoid bounds checks.
while (lanes_per_column > 1u) {
const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 &&
(S_V % lanes_per_column) == 0;
const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0;
if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) {
break;
}
lanes_per_column >>= 1u;
}
GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0);
GGML_ASSERT((S_V % lanes_per_column) == 0);
GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0);
const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size;
const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce;
const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce;
const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce;
size_t gdn_len;
const void * gdn_data;
if (use_subgroup_reduce && need_clustered_shader) {
if (use_clustered_reduce) {
gdn_len = gated_delta_net_f32_len;
gdn_data = (const void *)gated_delta_net_f32_data;
} else if (use_subgroup_reduce) {
@@ -5256,7 +5293,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) {
for (uint32_t kda = 0; kda < 2; kda++) {
ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda],
gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants),
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size);
wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size);
}
}
}
@@ -10702,6 +10739,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_conv_transpose_1d_f32;
}
return nullptr;
case GGML_OP_COL2IM_1D:
switch (src0->type) {
case GGML_TYPE_F32: return ctx->device->pipeline_col2im_1d_f32;
case GGML_TYPE_F16: return ctx->device->pipeline_col2im_1d_f16;
case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16;
default: return nullptr;
}
case GGML_OP_POOL_2D:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_pool2d_f32;
@@ -10723,9 +10767,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0;
uint32_t si;
switch (S_v) {
case 32: si = 0; break;
case 64: si = 1; break;
case 128: si = 2; break;
case 16: si = 0; break;
case 32: si = 1; break;
case 64: si = 2; break;
case 128: si = 3; break;
default: return nullptr;
}
return ctx->device->pipeline_gated_delta_net[si][kda];
@@ -11147,6 +11192,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
{
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
} break;
case GGML_OP_COL2IM_1D:
{
elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 };
} break;
case GGML_OP_POOL_2D:
{
const uint32_t N = dst->ne[3];
@@ -12936,6 +12985,32 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
}
// Record a GGML_OP_COL2IM_1D dispatch for `dst`.
//
// Reads stride / output-channel count / padding from dst->op_params[0..2], derives the
// remaining sizes from the tensor shapes, packs everything into
// vk_op_col2im_1d_push_constants and hands off to ggml_vk_op_f32.
//
//   src0: [K_OC, T_in] columns from matmul
//   dst:  [T_out, OC]
static void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    const int32_t stride = dst->op_params[0];
    const int32_t oc     = dst->op_params[1];
    const int32_t p0     = dst->op_params[2];

    // Both values are used as divisors (oc below, stride inside the shader); a zero or
    // negative value would be a division by zero / wrap to a huge unsigned count.
    GGML_ASSERT(oc > 0);
    GGML_ASSERT(stride > 0);

    const uint32_t K_OC  = static_cast<uint32_t>(src0->ne[0]);
    const uint32_t T_in  = static_cast<uint32_t>(src0->ne[1]);
    const uint32_t T_out = static_cast<uint32_t>(dst->ne[0]);
    const uint32_t OC    = static_cast<uint32_t>(oc);
    const uint32_t K     = K_OC / OC; // kernel taps per output channel

    vk_op_col2im_1d_push_constants p{};
    p.T_out  = T_out;
    p.OC     = OC;
    p.K_OC   = K_OC;
    p.T_in   = T_in;
    p.K      = K;
    p.stride = stride;
    p.p0     = p0;

    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p));
}
// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
// dedicated kernel directly. The pattern is validated by
@@ -14423,6 +14498,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_TIMESTEP_EMBEDDING:
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node);
break;
case GGML_OP_COL2IM_1D:
ggml_vk_col2im_1d(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node);
@@ -17136,7 +17215,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_GATED_DELTA_NET:
{
const uint32_t S_v = op->src[2]->ne[0];
if (S_v != 32 && S_v != 64 && S_v != 128) {
if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) {
return false;
}
for (int i = 0; i < 6; i++) {
@@ -17188,6 +17267,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_CONV_TRANSPOSE_1D:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_COL2IM_1D:
return (op->src[0]->type == GGML_TYPE_F32 ||
op->src[0]->type == GGML_TYPE_F16 ||
op->src[0]->type == GGML_TYPE_BF16) &&
op->type == op->src[0]->type &&
ggml_is_contiguous(op->src[0]) &&
ggml_is_contiguous(op);
case GGML_OP_CONV_2D:
case GGML_OP_CONV_TRANSPOSE_2D:
{
@@ -18019,6 +18105,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
const int32_t p0 = tensor->op_params[1];
const int32_t d0 = tensor->op_params[2];
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
} else if (tensor->op == GGML_OP_COL2IM_1D) {
const int32_t stride = tensor->op_params[0];
const int32_t oc = tensor->op_params[1];
const int32_t p0 = tensor->op_params[2];
tensor_clone = ggml_col2im_1d(ggml_ctx, src_clone[0], stride, oc, p0);
} else if (tensor->op == GGML_OP_POOL_2D) {
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
const int32_t k0 = tensor->op_params[1];
@@ -0,0 +1,61 @@
#version 450
#include "types.glsl"
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // columns: [K_OC, T_in]
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; // output: [T_out, OC]
layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
layout (push_constant) uniform parameter {
uint32_t T_out;
uint32_t OC;
uint32_t K_OC;
uint32_t T_in;
uint32_t K;
int32_t stride;
int32_t p0;
} p;
// Fetch element `idx` of the column buffer as a float.
// BF16 builds go through bf16_to_fp32; every other build uses a plain float() conversion.
#ifdef DATA_A_BF16
float load_col(uint32_t idx) {
    return bf16_to_fp32(uint32_t(data_a[idx]));
}
#else
float load_col(uint32_t idx) {
    return float(data_a[idx]);
}
#endif
// Write float `v` to element `idx` of the output buffer as D_TYPE.
// BF16 builds go through fp32_to_bf16; every other build uses a plain D_TYPE() conversion.
#ifdef DATA_A_BF16
void store_dst(uint32_t idx, float v) {
    data_d[idx] = D_TYPE(fp32_to_bf16(v));
}
#else
void store_dst(uint32_t idx, float v) {
    data_d[idx] = D_TYPE(v);
}
#endif
// One invocation per output element (t_out, oc): sum every input column entry that
// lands on this output position and store the total.
void main() {
    const uint32_t t_out = gl_GlobalInvocationID.x;
    const uint32_t oc    = gl_GlobalInvocationID.y;
    if (t_out >= p.T_out || oc >= p.OC) {
        return;
    }

    // Position in the uncropped signal.
    const int32_t t_abs = int32_t(t_out) + p.p0;

    // Gather: visit only the input columns that can reach t_abs, clamped to [0, T_in - 1].
    const int32_t first_col = max((t_abs - int32_t(p.K) + p.stride) / p.stride, int32_t(0));
    const int32_t last_col  = min(t_abs / p.stride, int32_t(p.T_in) - 1);

    // col layout: [K_OC, T_in]; entries of channel oc start at oc * K within a column.
    const uint32_t row_base = oc * p.K;

    float acc = 0.0;
    for (int32_t col = first_col; col <= last_col; col++) {
        // Kernel tap of this column that falls on t_abs.
        const uint32_t tap = uint32_t(t_abs - col * p.stride);
        acc += load_col((row_base + tap) + uint32_t(col) * p.K_OC);
    }

    // dst layout: [T_out, OC], element (t_out, oc) = t_out + oc * T_out
    store_dst(t_out + oc * p.T_out, acc);
}
@@ -407,7 +407,7 @@ std::map<std::string, std::string> merge_maps(const std::map<std::string, std::s
return result;
}
static std::vector<std::future<void>> compiles;
static std::deque<std::future<void>> compiles;
void string_to_spv(std::string name, const std::string& source, const std::map<std::string, std::string>& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false, const std::string& suffix = "") {
name = name + (f16acc ? "_f16acc" : "") + (coopmat ? "_cm1" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")) + suffix;
std::string out_path = join_paths(output_dir, name + ".spv");
@@ -426,6 +426,11 @@ void string_to_spv(std::string name, const std::string& source, const std::map<s
string_to_spv_func, name, input_filepath, out_path, defines, coopmat, generate_dep_file, std::move(slot)));
// Don't write the same dep file from multiple processes
generate_dep_file = false;
// Clean up completed futures - don't accumulate virtual memory for completed threads' stacks.
while (!compiles.empty() && compiles.front().wait_for(std::chrono::seconds(0)) == std::future_status::ready) {
compiles.pop_front();
}
}
void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool coopmat2, bool f16acc, bool dot2 = false) {
@@ -1003,6 +1008,9 @@ void process_shaders() {
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("col2im_1d_f32", "col2im_1d.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("col2im_1d_f16", "col2im_1d.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
string_to_spv("col2im_1d_bf16", "col2im_1d.comp", {{"DATA_A_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}});
string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}});
string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+58 -45
View File
@@ -1088,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
if (w_s) {
res = ggml_mul(ctx0, res, w_s);
}
for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
if (lw == nullptr) {
@@ -1106,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm(
res = ggml_add(ctx0, res, ab_cur);
}
if (w_s) {
res = ggml_mul(ctx0, res, w_s);
}
return res;
}
ggml_tensor * llm_graph_context::build_lora_mm_id(
ggml_tensor * w, // ggml_tensor * as
ggml_tensor * cur, // ggml_tensor * b
ggml_tensor * ids) const {
ggml_tensor * ids,
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
if (w_s) {
const int64_t n_expert = w_s->ne[0];
const int64_t n_tokens = cur->ne[2];
ggml_tensor * s = ggml_reshape_3d(ctx0, w_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, ids);
res = ggml_mul(ctx0, res, s);
}
for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
if (lw == nullptr) {
@@ -1269,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn(
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
int il) const {
// NVFP4 support is currently restricted to
// 1) LORA absence (*_s would be applied after LORA residual, which is incorrect)
// 2) bias absence (*_s would be applied after bias addition, which is incorrect)
// TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently)
auto has_lora = [this](ggml_tensor * w) {
if (!w) {
return false;
}
for (const auto & lora : *loras) {
if (lora.first->get_weight(w) != nullptr) {
return true;
}
}
return false;
};
GGML_ASSERT(!up_s || !up_b || !up || up->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!up_s || !up || up->type != GGML_TYPE_NVFP4 || !has_lora(up));
GGML_ASSERT(!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora(gate));
GGML_ASSERT(!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora(down));
ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
cb(tmp, "ffn_up", il);
@@ -1627,23 +1660,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
if (gate_up_exps) {
// merged gate_up path: one mul_mat_id, then split into gate and up views
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts, up_exps_s); // [n_ff*2, n_expert_used, n_tokens]
cb(gate_up, "ffn_moe_gate_up", il);
if (up_exps_s) {
cb(gate_up, "ffn_moe_gate_up_scaled", il);
}
if (gate_up_exps_b) {
gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts);
cb(gate_up, "ffn_moe_gate_up_biased", il);
}
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
gate_up = ggml_mul(ctx0, gate_up, s);
cb(gate_up, "ffn_moe_gate_up_scaled", il);
}
const int64_t n_ff = gate_up->ne[0] / 2;
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
@@ -1651,43 +1679,33 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(up, "ffn_moe_up", il);
} else {
// separate gate and up path
up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
up = build_lora_mm_id(up_exps, cur, selected_experts, up_exps_s); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);
if (up_exps_s) {
cb(up, "ffn_moe_up_scaled", il);
}
if (up_exps_b) {
up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
cb(up, "ffn_moe_up_biased", il);
}
// apply per-expert scale2 to up
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
up = ggml_mul(ctx0, up, s);
cb(up, "ffn_moe_up_scaled", il);
}
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cur = build_lora_mm_id(gate_exps, cur, selected_experts, gate_exps_s); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
} else {
cur = up;
}
if (gate_exps_s) {
cb(cur, "ffn_moe_gate_scaled", il);
}
if (gate_exps_b) {
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
// apply per-expert scale2 to gate
if (gate_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
cur = ggml_mul(ctx0, cur, s);
cb(cur, "ffn_moe_gate_scaled", il);
}
}
const bool has_gate = gate_exps || gate_up_exps;
@@ -1759,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error");
}
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
experts = build_lora_mm_id(down_exps, cur, selected_experts, down_exps_s); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);
if (down_exps_s) {
cb(experts, "ffn_moe_down_scaled", il);
}
if (down_exps_b) {
experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
cb(experts, "ffn_moe_down_biased", il);
}
// apply per-expert scale2 to down
if (down_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
experts = ggml_mul(ctx0, experts, s);
cb(experts, "ffn_moe_down_scaled", il);
}
if (!weight_before_ffn) {
experts = ggml_mul(ctx0, experts, weights);
cb(experts, "ffn_moe_weighted", il);
+3 -2
View File
@@ -853,11 +853,12 @@ struct llm_graph_context {
ggml_tensor * cur,
ggml_tensor * w_s = nullptr) const;
// do mat_mul_id, while optionally apply lora
// do mat_mul_id, while optionally apply lora and per-expert scale
ggml_tensor * build_lora_mm_id(
ggml_tensor * w, // ggml_tensor * as
ggml_tensor * cur, // ggml_tensor * b
ggml_tensor * ids) const;
ggml_tensor * ids,
ggml_tensor * w_s = nullptr) const;
ggml_tensor * build_norm(
ggml_tensor * cur,
+20 -2
View File
@@ -1882,11 +1882,29 @@ static void test_lfm2_parser(const std::string & template_path, bool detailed_de
.expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org)."))
.run();
// Python tool with multiline code in string
// Python tool with multiline code in string: the \n in the literal decodes to a real
// newline, emitted as a JSON \n escape (not a doubled backslash).
tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "def hello():\\n print('hey')"})#", "" }
{ "python", R"#({"code": "def hello():\n print('hey')"})#", "" }
})
.run();
// String escape sequences decode to their actual characters (newline + tab here),
// so a "write a two line file" style call produces real line breaks, not literal "\n".
tst.test("<|tool_call_start|>[python(code=\"First line\\nSecond line\\tindented\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "First line\nSecond line\tindented"})#", "" }
})
.run();
// Escaped quotes inside a string argument survive the round-trip.
tst.test("<|tool_call_start|>[python(code=\"print(\\\"hi\\\")\")]<|tool_call_end|>")
.tools({ python_tool })
.expect_tool_calls({
{ "python", R"#({"code": "print(\"hi\")"})#", "" }
})
.run();
+7
View File
@@ -323,6 +323,7 @@ struct cmd_params {
std::vector<std::string> hf_repo;
std::vector<std::string> hf_file;
std::string hf_token;
bool offline;
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<std::pair<int, int>> n_pg;
@@ -367,6 +368,7 @@ static const cmd_params cmd_params_defaults = {
/* hf_repo */ {},
/* hf_file */ {},
/* hf_token */ "",
/* offline */ false,
/* n_prompt */ { 512 },
/* n_gen */ { 128 },
/* n_pg */ {},
@@ -437,6 +439,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" (default: unused)\n");
printf(" -hft, --hf-token <token> Hugging Face access token\n");
printf(" (default: value from HF_TOKEN environment variable)\n");
printf(" --offline Offline mode: forces use of cache, prevents network access\n");
printf(" (default: disabled)\n");
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
@@ -558,6 +562,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.hf_token = argv[i];
} else if (arg == "--offline") {
params.offline = true;
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -1040,6 +1046,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
common_download_opts opts;
opts.bearer_token = params.hf_token;
opts.offline = params.offline;
auto download_result = common_download_model(model, opts);
if (download_result.model_path.empty()) {
fprintf(stderr, "error: failed to download model from HuggingFace\n");
+3
View File
@@ -40,6 +40,7 @@ def main(args_in: list[str] | None = None) -> None:
required=True)
parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
parser.add_argument("--offline", action="store_true", default=False, help="Offline mode: forces use of cache, prevents network access")
parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
@@ -268,6 +269,8 @@ def start_server_background(args):
]
server_args.extend(['--hf-repo', args.hf_repo])
server_args.extend(['--hf-file', args.hf_file])
if args.offline:
server_args.append('--offline')
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
server_args.extend(['--ctx-size', args.ctx_size])
server_args.extend(['--parallel', args.parallel])
+27 -3
View File
@@ -201,6 +201,8 @@ struct server_slot {
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model
std::vector<int32_t> n_accepted_per_pos; // Accepted tokens per draft position
void reset() {
SLT_DBG(*this, "%s", "\n");
@@ -227,6 +229,8 @@ struct server_slot {
// clear speculative decoding stats
n_draft_total = 0;
n_draft_accepted = 0;
n_draft_verif_steps = 0;
n_accepted_per_pos.clear();
task_prev = std::move(task);
task.reset();
@@ -509,10 +513,22 @@ struct server_slot {
llama_perf_context(ctx_tgt).n_reused);
if (n_draft_total > 0) {
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0;
std::string acceptance_rates_per_pos;
if (n_draft_verif_steps > 0) {
for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) {
if (i > 0) {
acceptance_rates_per_pos += ", ";
}
acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps);
}
}
SLT_INF(*this,
"draft acceptance = %0.5f (%5d accepted / %5d generated)\n",
draft_ratio, n_draft_accepted, n_draft_total);
"draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n",
draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str());
}
common_speculative_print_stats(spec);
@@ -3543,6 +3559,14 @@ private:
// update how many tokens out of those tested were accepted
slot.n_draft_accepted += ids.size() - 1;
slot.n_draft_verif_steps += 1;
if (slot.n_accepted_per_pos.empty()) {
slot.n_accepted_per_pos.resize(common_speculative_n_max(&params_base.speculative), 0);
}
for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) {
slot.n_accepted_per_pos[i]++;
}
// add accepted tokens to the prompt
slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft);
+8
View File
@@ -40,6 +40,7 @@
"eslint-config-prettier": "10.1.8",
"eslint-plugin-storybook": "10.4.2",
"eslint-plugin-svelte": "3.19.0",
"fflate": "0.8.3",
"globals": "16.5.0",
"highlight.js": "11.11.1",
"http-server": "14.1.1",
@@ -9454,6 +9455,13 @@
}
}
},
"node_modules/fflate": {
"version": "0.8.3",
"resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.3.tgz",
"integrity": "sha512-tbZNuJrLwGUp3zshBtdy4W+ORxZuIh8a5ilyIEQDC5rY1f3U20JMry0Ll3WBzU58EZKsEuJFXhb5gwv8CsPvgA==",
"dev": true,
"license": "MIT"
},
"node_modules/file-entry-cache": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz",
+1
View File
@@ -59,6 +59,7 @@
"eslint-config-prettier": "10.1.8",
"eslint-plugin-storybook": "10.4.2",
"eslint-plugin-svelte": "3.19.0",
"fflate": "0.8.3",
"globals": "16.5.0",
"highlight.js": "11.11.1",
"http-server": "14.1.1",
@@ -41,6 +41,7 @@
DATA_ERROR_HANDLED_ATTR,
BOOL_TRUE_STRING,
SETTINGS_KEYS,
CODE_BLOCK_HEADER_CLASS,
MERMAID_WRAPPER_CLASS,
MERMAID_BLOCK_CLASS,
MERMAID_LANGUAGE,
@@ -53,7 +54,11 @@
SVG_TAG_PREFIX,
SVG_SOURCE_ATTR,
SVG_RENDERED_ATTR,
SVG_INLINE_SHADOW_STYLE
SVG_INLINE_SHADOW_STYLE,
TOGGLE_SOURCE_BTN_CLASS,
DIAGRAM_VIEW_MODE_ATTR,
DIAGRAM_VIEW_RENDERED,
DIAGRAM_VIEW_SOURCE
} from '$lib/constants';
import { ColorMode, UrlProtocol } from '$lib/enums';
import { FileTypeText } from '$lib/enums/files.enums';
@@ -501,6 +506,23 @@
async function handleMermaidClick(event: MouseEvent) {
const target = event.target as HTMLElement;
// Toggle a diagram block between its rendered view and its source view.
// Shared by mermaid and svg, css drives the visibility from the wrapper mode.
const toggleBtn = target.closest(`.${TOGGLE_SOURCE_BTN_CLASS}`);
if (toggleBtn) {
event.preventDefault();
event.stopPropagation();
const wrapper = toggleBtn.closest(`.${MERMAID_WRAPPER_CLASS}, .${SVG_WRAPPER_CLASS}`);
if (!wrapper) return;
const isSource = wrapper.getAttribute(DIAGRAM_VIEW_MODE_ATTR) === DIAGRAM_VIEW_SOURCE;
const next = isSource ? DIAGRAM_VIEW_RENDERED : DIAGRAM_VIEW_SOURCE;
wrapper.setAttribute(DIAGRAM_VIEW_MODE_ATTR, next);
toggleBtn.setAttribute('aria-pressed', String(!isSource));
return;
}
// Check if clicking on copy or preview button in mermaid block
const copyBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .copy-code-btn`);
const previewBtn = target.closest(`.${MERMAID_WRAPPER_CLASS} .preview-code-btn`);
@@ -573,6 +595,11 @@
}
}
// A click on the header chrome targets the action buttons, never the
// diagram. Guard so a header click cannot fall through to the
// click-to-zoom branches below, whatever the scroll position or stacking.
if (target.closest(`.${CODE_BLOCK_HEADER_CLASS}`)) return;
// Open preview when clicking the svg block itself. A final block carries its
// source, a streaming block does not and is mirrored live into the dialog.
const svgEl = target.closest(`.${SVG_BLOCK_CLASS}`);
@@ -300,7 +300,8 @@ div.markdown-user-content :global(.table-wrapper) {
}
.markdown-content :global(.copy-code-btn),
.markdown-content :global(.preview-code-btn) {
.markdown-content :global(.preview-code-btn),
.markdown-content :global(.toggle-source-btn) {
display: flex;
align-items: center;
justify-content: center;
@@ -312,15 +313,22 @@ div.markdown-user-content :global(.table-wrapper) {
}
.markdown-content :global(.copy-code-btn:hover),
.markdown-content :global(.preview-code-btn:hover) {
.markdown-content :global(.preview-code-btn:hover),
.markdown-content :global(.toggle-source-btn:hover) {
transform: scale(1.05);
}
.markdown-content :global(.copy-code-btn:active),
.markdown-content :global(.preview-code-btn:active) {
.markdown-content :global(.preview-code-btn:active),
.markdown-content :global(.toggle-source-btn:active) {
transform: scale(0.95);
}
/* Pressed state marks the source view as active */
.markdown-content :global(.toggle-source-btn[aria-pressed='true']) {
color: var(--primary);
}
.markdown-content :global(.code-block-wrapper pre) {
background: transparent;
margin: 0;
@@ -629,8 +637,8 @@ div.markdown-user-content :global(.table-wrapper) {
overflow-y: auto;
overflow-x: auto;
display: flex;
align-items: center;
justify-content: center;
align-items: safe center;
justify-content: safe center;
padding: 3rem 1rem 1rem;
}
@@ -645,7 +653,9 @@ div.markdown-user-content :global(.table-wrapper) {
overflow-y: visible;
}
/* Diagram block uses same header styling as code blocks */
/* Diagram block uses same header styling as code blocks. The header floats over
scrollable diagram content and stays transparent, so the overflow shows up to
the box edge. It keeps a z-index so it stays the click target above content. */
.markdown-content :global(.mermaid-block-wrapper .code-block-header),
.markdown-content :global(.svg-block-wrapper .code-block-header) {
display: flex;
@@ -657,6 +667,7 @@ div.markdown-user-content :global(.table-wrapper) {
top: 0;
left: 0;
right: 0;
z-index: 2;
}
.markdown-content :global(.mermaid-block-wrapper .code-block-actions),
@@ -683,6 +694,31 @@ div.markdown-user-content :global(.table-wrapper) {
padding: 3rem 1rem;
}
/* The source view stays hidden while the block shows its rendered output. CSS
swaps the two views based on the wrapper's view mode, so the click handler only
flips one attribute. The view reuses the code block scroll container, so it
matches the app code blocks. */
.markdown-content :global(.diagram-source) {
display: none;
text-align: left;
}
.markdown-content :global(.diagram-source pre) {
background: transparent;
margin: 0;
border-radius: 0;
border: none;
font-size: 0.875rem;
}
.markdown-content :global([data-view-mode='source'] .mermaid-scroll-container),
.markdown-content :global([data-view-mode='source'] .svg-scroll-container) {
display: none;
}
.markdown-content :global([data-view-mode='source'] .diagram-source) {
display: block;
}
/* Streaming mermaid block - empty preview box */
.mermaid-streaming-block {
min-height: 300px;
@@ -7,12 +7,16 @@ import type { Element, ElementContent } from 'hast';
import {
CODE_BLOCK_HEADER_CLASS,
CODE_BLOCK_ACTIONS_CLASS,
CODE_BLOCK_SCROLL_CONTAINER_CLASS,
CODE_LANGUAGE_CLASS,
COPY_CODE_BTN_CLASS,
PREVIEW_CODE_BTN_CLASS,
TOGGLE_SOURCE_BTN_CLASS,
DIAGRAM_SOURCE_CLASS,
RELATIVE_CLASS,
COPY_ICON_SVG,
PREVIEW_ICON_SVG
PREVIEW_ICON_SVG,
CODE_ICON_SVG
} from '$lib/constants';
export interface BlockIdGenerator {
@@ -32,14 +36,16 @@ export function createIconElement(svg: string): Element {
}
/**
* Creates a button element with icon.
* Creates a button element with icon. Extra properties merge onto the button,
* which lets a stateful button carry attributes like aria-pressed.
*/
export function createButton(
className: string,
title: string,
iconSvg: string,
id: string,
idAttribute: string
idAttribute: string,
extraProperties: Record<string, string> = {}
): Element {
return {
type: 'element',
@@ -48,7 +54,8 @@ export function createButton(
className: [className],
[idAttribute]: id,
title,
type: 'button'
type: 'button',
...extraProperties
},
children: [createIconElement(iconSvg)]
};
@@ -72,6 +79,52 @@ export function createPreviewButton(
return createButton(PREVIEW_CODE_BTN_CLASS, title, PREVIEW_ICON_SVG, id, idAttribute);
}
/**
 * Builds the header button that switches a diagram block between its rendered
 * output and its source listing.
 *
 * The button is emitted with `aria-pressed` set to `'false'`, because a block
 * starts out in the rendered view.
 *
 * @param id - Block identifier stored on the button.
 * @param idAttribute - Name of the attribute that carries `id`.
 * @param title - Tooltip text for the button, defaults to `'Toggle source'`.
 * @returns The hast element describing the toggle button.
 */
export function createToggleSourceButton(
	id: string,
	idAttribute: string,
	title: string = 'Toggle source'
): Element {
	// Initial pressed state travels as an extra property on the generic button.
	const initialState: Record<string, string> = { 'aria-pressed': 'false' };

	return createButton(TOGGLE_SOURCE_BTN_CLASS, title, CODE_ICON_SVG, id, idAttribute, initialState);
}
/**
 * Builds the source listing shown when a diagram block is switched to its
 * source view.
 *
 * The result is a `div` carrying both the diagram source class and the code
 * block scroll container class, holding a single `pre` that wraps the code
 * element.
 *
 * @param codeElement - Highlighted code element captured at transform time.
 *   When it is missing, a plain `code` node is built from `source` instead.
 * @param source - Raw diagram source, used only for the fallback node.
 * @param language - Language name, used only for the fallback node's
 *   `language-*` class.
 * @returns The hast element for the source view.
 */
export function createSourceView(
	codeElement: Element | undefined,
	source: string,
	language: string
): Element {
	// Fallback is built lazily, only when no highlighted element was preserved.
	const buildPlainCode = (): Element => ({
		type: 'element',
		tagName: 'code',
		properties: { className: ['hljs', `language-${language}`] },
		children: [{ type: 'text', value: source }]
	});

	const codeNode: Element = codeElement ?? buildPlainCode();

	const preNode: Element = {
		type: 'element',
		tagName: 'pre',
		properties: {},
		children: [codeNode]
	};

	const viewClasses = [DIAGRAM_SOURCE_CLASS, CODE_BLOCK_SCROLL_CONTAINER_CLASS];

	return {
		type: 'element',
		tagName: 'div',
		properties: { className: viewClasses },
		children: [preNode]
	};
}
/**
* Creates a block header with language label and action buttons.
*/
@@ -116,14 +169,17 @@ export function createScrollContainer(preElement: Element, scrollContainerClass:
}
/**
* Creates a wrapper element with header and scroll container.
* Creates a wrapper element with header and scroll container. Extra children
* append after the scroll container, which lets a block carry a source view
* alongside its rendered output.
*/
export function createWrapper(
header: Element,
preElement: Element,
wrapperClass: string,
scrollContainerClass: string,
additionalAttributes?: Record<string, string>
additionalAttributes?: Record<string, string>,
extraChildren: Element[] = []
): Element {
return {
type: 'element',
@@ -132,7 +188,7 @@ export function createWrapper(
className: [wrapperClass, RELATIVE_CLASS],
...additionalAttributes
} as Element['properties'],
children: [header, createScrollContainer(preElement, scrollContainerClass)]
children: [header, createScrollContainer(preElement, scrollContainerClass), ...extraChildren]
};
}
@@ -19,12 +19,17 @@ import {
MERMAID_BLOCK_CLASS,
MERMAID_LANGUAGE,
MERMAID_SYNTAX_ATTR,
MERMAID_ID_ATTR
MERMAID_ID_ATTR,
DIAGRAM_VIEW_MODE_ATTR,
DIAGRAM_VIEW_RENDERED
} from '$lib/constants';
import type { DiagramPreData } from './pre-transform';
import {
createBlockHeader,
createCopyButton,
createPreviewButton,
createToggleSourceButton,
createSourceView,
createWrapper,
generateBlockId
} from './code-block-utils';
@@ -75,16 +80,23 @@ export const rehypeEnhanceMermaidBlocks: Plugin<[], Root> = () => {
const actions = [
createCopyButton(mermaidId, MERMAID_ID_ATTR, 'Copy mermaid syntax'),
createToggleSourceButton(mermaidId, MERMAID_ID_ATTR, 'Toggle mermaid source'),
createPreviewButton(mermaidId, MERMAID_ID_ATTR, 'Preview diagram')
];
const header = createBlockHeader(MERMAID_LANGUAGE, mermaidId, MERMAID_ID_ATTR, actions);
const preservedCode = (node.data as DiagramPreData | undefined)?.sourceCode;
const sourceView = createSourceView(preservedCode, diagramText, MERMAID_LANGUAGE);
const wrapper = createWrapper(
header,
node,
MERMAID_WRAPPER_CLASS,
MERMAID_SCROLL_CONTAINER_CLASS,
{ [MERMAID_ID_ATTR]: mermaidId }
{
[MERMAID_ID_ATTR]: mermaidId,
[DIAGRAM_VIEW_MODE_ATTR]: DIAGRAM_VIEW_RENDERED
},
[sourceView]
);
// Replace pre with wrapper in parent
@@ -18,12 +18,17 @@ import {
SVG_BLOCK_CLASS,
SVG_LANGUAGE,
SVG_SOURCE_ATTR,
SVG_ID_ATTR
SVG_ID_ATTR,
DIAGRAM_VIEW_MODE_ATTR,
DIAGRAM_VIEW_RENDERED
} from '$lib/constants';
import type { DiagramPreData } from './pre-transform';
import {
createBlockHeader,
createCopyButton,
createPreviewButton,
createToggleSourceButton,
createSourceView,
createWrapper,
generateBlockId
} from './code-block-utils';
@@ -65,13 +70,24 @@ export const rehypeEnhanceSvgBlocks: Plugin<[], Root> = () => {
const actions = [
createCopyButton(svgId, SVG_ID_ATTR, 'Copy svg source'),
createToggleSourceButton(svgId, SVG_ID_ATTR, 'Toggle svg source'),
createPreviewButton(svgId, SVG_ID_ATTR, 'Preview svg')
];
const header = createBlockHeader(SVG_LANGUAGE, svgId, SVG_ID_ATTR, actions);
const wrapper = createWrapper(header, node, SVG_WRAPPER_CLASS, SVG_SCROLL_CONTAINER_CLASS, {
[SVG_ID_ATTR]: svgId
});
const preservedCode = (node.data as DiagramPreData | undefined)?.sourceCode;
const sourceView = createSourceView(preservedCode, svgSource, SVG_LANGUAGE);
const wrapper = createWrapper(
header,
node,
SVG_WRAPPER_CLASS,
SVG_SCROLL_CONTAINER_CLASS,
{
[SVG_ID_ATTR]: svgId,
[DIAGRAM_VIEW_MODE_ATTR]: DIAGRAM_VIEW_RENDERED
},
[sourceView]
);
// Replace pre with wrapper in parent
(parent.children as ElementContent[])[index] = wrapper;

Some files were not shown because too many files have changed in this diff Show More