mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-21 21:27:37 +02:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5d3eb999b6 |
@@ -13,20 +13,6 @@ ARG APP_REVISION=N/A
|
||||
# BUILD STAGE
|
||||
# Compile all binary files and libraries
|
||||
# ==============================================================================
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
FROM ${CANN_BASE_IMAGE} AS build
|
||||
|
||||
# -- Install build dependencies --
|
||||
@@ -40,8 +26,6 @@ WORKDIR /app
|
||||
# -- Copy project files --
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
# -- Set CANN environment variables (required for compilation) --
|
||||
# Using ENV instead of `source` allows environment variables to persist across the entire image layer
|
||||
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
|
||||
|
||||
@@ -3,20 +3,6 @@ ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
ARG TARGETARCH
|
||||
@@ -30,8 +16,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
|
||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||
else \
|
||||
|
||||
@@ -11,20 +11,6 @@ ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
ARG GCC_VERSION
|
||||
@@ -40,8 +26,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
|
||||
@@ -5,20 +5,6 @@ ARG APP_REVISION=N/A
|
||||
|
||||
## Build Image
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=ON
|
||||
@@ -36,8 +22,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||
echo "GGML_SYCL_F16 is set" \
|
||||
&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \
|
||||
|
||||
@@ -10,20 +10,6 @@ ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
|
||||
|
||||
# MUSA architecture to build for (defaults to all supported archs)
|
||||
@@ -43,8 +29,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
|
||||
export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
|
||||
fi && \
|
||||
|
||||
@@ -22,20 +22,6 @@ ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
## Build Image
|
||||
FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build
|
||||
|
||||
@@ -83,8 +69,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
# Build Stage
|
||||
RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
|
||||
cmake -B build/ReleaseOV -G Ninja \
|
||||
|
||||
@@ -11,20 +11,6 @@ ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
### Build image
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
@@ -52,8 +38,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||
cmake -S . -B build \
|
||||
-DGGML_HIP=ON \
|
||||
|
||||
@@ -3,20 +3,6 @@ ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Install build tools
|
||||
@@ -31,8 +17,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
|
||||
|
||||
@@ -3,20 +3,6 @@ ARG BUILD_DATE=N/A
|
||||
ARG APP_VERSION=N/A
|
||||
ARG APP_REVISION=N/A
|
||||
|
||||
ARG NODE_VERSION=24
|
||||
|
||||
FROM docker.io/node:$NODE_VERSION AS web
|
||||
|
||||
ARG APP_VERSION
|
||||
|
||||
WORKDIR /app/tools/ui
|
||||
|
||||
COPY tools/ui/package.json tools/ui/package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY tools/ui/ ./
|
||||
RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build
|
||||
|
||||
FROM docker.io/ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
@@ -28,8 +14,6 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
COPY --from=web /app/tools/ui/dist tools/ui/dist
|
||||
|
||||
RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
|
||||
cmake --build build -j $(nproc)
|
||||
|
||||
|
||||
@@ -10,8 +10,6 @@
|
||||
|
||||
build*/
|
||||
|
||||
tools/ui/node_modules/
|
||||
|
||||
models/*
|
||||
|
||||
/llama-cli
|
||||
|
||||
@@ -58,13 +58,6 @@ jobs:
|
||||
git tag ${{ steps.srctag.outputs.name }} || exit 0
|
||||
git push origin ${{ steps.srctag.outputs.name }} || exit 0
|
||||
|
||||
build_ui:
|
||||
name: Build UI
|
||||
needs: create_tag
|
||||
uses: ./.github/workflows/ui-build.yml
|
||||
with:
|
||||
hf_ui_version: ${{ needs.create_tag.outputs.source_tag }}
|
||||
|
||||
prepare_matrices:
|
||||
name: Prepare Docker matrices
|
||||
runs-on: ubuntu-24.04
|
||||
@@ -86,7 +79,7 @@ jobs:
|
||||
[
|
||||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true },
|
||||
{ "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
{ "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
|
||||
{ "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
|
||||
@@ -142,7 +135,7 @@ jobs:
|
||||
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Registry
|
||||
needs: [prepare_matrices, create_tag, build_ui]
|
||||
needs: [prepare_matrices, create_tag]
|
||||
|
||||
runs-on: ${{ matrix.config.runs_on }}
|
||||
strategy:
|
||||
@@ -157,13 +150,6 @@ jobs:
|
||||
fetch-depth: 0
|
||||
ref: ${{ needs.create_tag.outputs.source_tag }}
|
||||
|
||||
- name: Download prebuilt UI
|
||||
if: ${{ matrix.config.prebuilt_ui == true }}
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
|
||||
with:
|
||||
name: ui-build
|
||||
path: tools/ui/dist
|
||||
|
||||
- name: Set up QEMU
|
||||
if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
|
||||
uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
|
||||
|
||||
@@ -1627,7 +1627,6 @@ jobs:
|
||||
**Windows:**
|
||||
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
|
||||
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
|
||||
- [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip)
|
||||
- [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip)
|
||||
- [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip)
|
||||
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
|
||||
|
||||
+55
-57
@@ -17,7 +17,6 @@
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <shellapi.h>
|
||||
#endif
|
||||
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
@@ -303,6 +302,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
|
||||
if (!model.docker_repo.empty()) {
|
||||
model.path = common_docker_resolve_model(model.docker_repo);
|
||||
model.name = model.docker_repo;
|
||||
} else if (!model.hf_repo.empty()) {
|
||||
// If -m was used with -hf, treat the model "path" as the hf_file to download
|
||||
if (model.hf_file.empty() && !model.path.empty()) {
|
||||
@@ -322,6 +322,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
|
||||
throw std::runtime_error("failed to download model from Hugging Face");
|
||||
}
|
||||
|
||||
model.name = model.hf_repo;
|
||||
model.path = download_result.model_path;
|
||||
|
||||
if (!download_result.mmproj_path.empty()) {
|
||||
@@ -892,44 +893,7 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
|
||||
return true;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
struct utf8_argv {
|
||||
std::vector<std::string> buf;
|
||||
std::vector<char*> ptrs;
|
||||
};
|
||||
|
||||
static utf8_argv make_utf8_argv() {
|
||||
utf8_argv out;
|
||||
int wargc = 0;
|
||||
LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc);
|
||||
if (!wargv) return out;
|
||||
|
||||
out.buf.reserve(wargc);
|
||||
for (int i = 0; i < wargc; ++i) {
|
||||
int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr);
|
||||
if (n <= 0) { out.buf.emplace_back(); continue; }
|
||||
auto& s = out.buf.emplace_back();
|
||||
s.resize(static_cast<size_t>(n - 1));
|
||||
(void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr);
|
||||
}
|
||||
LocalFree(wargv);
|
||||
|
||||
out.ptrs.reserve(out.buf.size() + 1);
|
||||
for (auto& s : out.buf) out.ptrs.push_back(s.data());
|
||||
out.ptrs.push_back(nullptr);
|
||||
return out;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
||||
#ifdef _WIN32
|
||||
auto utf8 = make_utf8_argv();
|
||||
// repair argv only when it matches the process command line
|
||||
if (static_cast<int>(utf8.buf.size()) == argc) {
|
||||
argv = utf8.ptrs.data();
|
||||
}
|
||||
#endif
|
||||
|
||||
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
|
||||
const common_params params_org = ctx_arg.params; // the example can modify the default params
|
||||
|
||||
@@ -2866,26 +2830,62 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.api_prefix = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
|
||||
// Deprecated: use --ui-config instead (kept for backward compat)
|
||||
add_opt(common_arg(
|
||||
{"--ui-config", "--webui-config"}, "JSON",
|
||||
{"--webui-config"}, "JSON",
|
||||
"[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.ui_config_json = value;
|
||||
params.webui_config_json = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--ui-config"}, "JSON",
|
||||
"JSON that provides default UI settings (overrides UI defaults)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.ui_config_json = value;
|
||||
params.webui_config_json = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
|
||||
|
||||
// Deprecated: use --ui-config-file instead (kept for backward compat)
|
||||
add_opt(common_arg(
|
||||
{"--ui-config-file", "--webui-config-file"}, "PATH",
|
||||
{"--webui-config-file"}, "PATH",
|
||||
"[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.ui_config_json = read_file(value);
|
||||
params.webui_config_json = params.ui_config_json;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--ui-config-file"}, "PATH",
|
||||
"JSON file that provides default UI settings (overrides UI defaults)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.ui_config_json = read_file(value);
|
||||
params.webui_config_json = params.ui_config_json;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
|
||||
|
||||
// Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
|
||||
add_opt(common_arg(
|
||||
{"--ui-mcp-proxy", "--webui-mcp-proxy"},
|
||||
{"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"},
|
||||
{"--webui-mcp-proxy"},
|
||||
{"--no-webui-mcp-proxy"},
|
||||
"[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
|
||||
[](common_params & params, bool value) {
|
||||
params.ui_mcp_proxy = value;
|
||||
params.webui_mcp_proxy = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--ui-mcp-proxy"},
|
||||
{"--no-ui-mcp-proxy"},
|
||||
"experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
|
||||
[](common_params & params, bool value) {
|
||||
params.ui_mcp_proxy = value;
|
||||
params.webui_mcp_proxy = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
|
||||
add_opt(common_arg(
|
||||
@@ -2897,26 +2897,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.server_tools = parse_csv_row(value);
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
|
||||
// Deprecated: use --ui/--no-ui instead (kept for backward compat)
|
||||
add_opt(common_arg(
|
||||
{"-ag", "--agent"},
|
||||
{"-no-ag", "--no-agent"},
|
||||
"whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)",
|
||||
{"--webui"},
|
||||
{"--no-webui"},
|
||||
"[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
|
||||
[](common_params & params, bool value) {
|
||||
if (value) {
|
||||
params.server_tools = {"all"};
|
||||
params.ui_mcp_proxy = true;
|
||||
} else {
|
||||
params.server_tools.clear();
|
||||
params.ui_mcp_proxy = false;
|
||||
}
|
||||
params.ui = value;
|
||||
params.webui = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT"));
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
|
||||
|
||||
add_opt(common_arg(
|
||||
{"--ui", "--webui"},
|
||||
{"--no-ui", "--no-webui"},
|
||||
{"--ui"},
|
||||
{"--no-ui"},
|
||||
string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
|
||||
[](common_params & params, bool value) {
|
||||
params.ui = value;
|
||||
params.webui = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
|
||||
add_opt(common_arg(
|
||||
@@ -2947,7 +2945,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
|
||||
add_opt(common_arg(
|
||||
{"--api-key-file"}, "FNAME",
|
||||
"path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)",
|
||||
"path to file containing API keys (default: none)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
std::ifstream key_file(value);
|
||||
if (!key_file) {
|
||||
@@ -2955,7 +2953,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
}
|
||||
std::string key;
|
||||
while (std::getline(key_file, key)) {
|
||||
if (!key.empty() && key[0] != '#') {
|
||||
if (!key.empty()) {
|
||||
params.api_keys.push_back(key);
|
||||
}
|
||||
}
|
||||
|
||||
+1
-15
@@ -1074,18 +1074,6 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
|
||||
return files;
|
||||
}
|
||||
|
||||
std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) {
|
||||
#ifdef _WIN32
|
||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
|
||||
if (!wlen) { return std::ifstream(); }
|
||||
std::vector<wchar_t> wfname(wlen);
|
||||
(void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
|
||||
return std::ifstream(wfname.data(), mode);
|
||||
#else
|
||||
return std::ifstream(fname, mode);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
@@ -2046,7 +2034,7 @@ bool common_prompt_batch_decode(
|
||||
}
|
||||
|
||||
size_t common_prompt_checkpoint::size() const {
|
||||
return data_tgt.size() + data_dft.size() + data_spec.size();
|
||||
return data_tgt.size() + data_dft.size();
|
||||
}
|
||||
|
||||
bool common_prompt_checkpoint::empty() const {
|
||||
@@ -2061,7 +2049,6 @@ void common_prompt_checkpoint::clear() {
|
||||
|
||||
data_tgt.clear();
|
||||
data_dft.clear();
|
||||
data_spec.clear();
|
||||
}
|
||||
|
||||
void common_prompt_checkpoint::update_pos(
|
||||
@@ -2151,5 +2138,4 @@ void common_prompt_checkpoint::clear_tgt() {
|
||||
|
||||
void common_prompt_checkpoint::clear_dft() {
|
||||
data_dft.clear();
|
||||
data_spec.clear();
|
||||
}
|
||||
|
||||
+8
-18
@@ -295,16 +295,7 @@ struct common_params_model {
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string docker_repo = ""; // Docker repo // NOLINT
|
||||
|
||||
std::string get_name() {
|
||||
if (!hf_repo.empty()) {
|
||||
return hf_repo;
|
||||
}
|
||||
if (!docker_repo.empty()) {
|
||||
return docker_repo;
|
||||
}
|
||||
return path;
|
||||
}
|
||||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
};
|
||||
|
||||
// draft-model-based speculative decoding parameters
|
||||
@@ -372,7 +363,7 @@ struct common_params_speculative {
|
||||
|
||||
uint32_t need_n_rs_seq() const {
|
||||
bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
|
||||
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3;
|
||||
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
|
||||
});
|
||||
|
||||
return needs_rs_seq ? draft.n_max : 0u;
|
||||
@@ -633,6 +624,12 @@ struct common_params {
|
||||
|
||||
// UI configs
|
||||
bool ui = true;
|
||||
|
||||
// Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
|
||||
bool webui = ui;
|
||||
bool webui_mcp_proxy = false;
|
||||
std::string webui_config_json;
|
||||
|
||||
bool ui_mcp_proxy = false;
|
||||
std::string ui_config_json;
|
||||
|
||||
@@ -851,9 +848,6 @@ struct common_file_info {
|
||||
};
|
||||
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
|
||||
|
||||
// fs open, also handle UTF8 on Windows
|
||||
std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode);
|
||||
|
||||
//
|
||||
// TTY utils
|
||||
//
|
||||
@@ -1071,10 +1065,6 @@ struct common_prompt_checkpoint {
|
||||
std::vector<uint8_t> data_tgt;
|
||||
std::vector<uint8_t> data_dft;
|
||||
|
||||
// (optional) speculative-decoding implementation state stashed with the checkpoint
|
||||
// (e.g. eagle3's deferred-boundary g_embd row)
|
||||
std::vector<uint8_t> data_spec;
|
||||
|
||||
size_t size() const;
|
||||
|
||||
bool empty() const;
|
||||
|
||||
+46
-89
@@ -686,62 +686,59 @@ value set_statement::execute_impl(context & ctx) {
|
||||
return mk_val<value_undefined>();
|
||||
}
|
||||
|
||||
static inline void bind_parameters(const std::string & name, const statements & this_args, const func_args & args, context & ctx) {
|
||||
const size_t expected_count = this_args.size();
|
||||
const size_t input_count = args.count();
|
||||
|
||||
JJ_DEBUG("Invoking '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
|
||||
for (size_t i = 0; i < expected_count; ++i) {
|
||||
if (i < input_count) {
|
||||
if (is_stmt<identifier>(this_args[i])) {
|
||||
// normal parameter
|
||||
std::string param_name = cast_stmt<identifier>(this_args[i])->val;
|
||||
value param_value = args.get_kwarg_or_pos(param_name, i);
|
||||
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
|
||||
ctx.set_val(param_name, param_value);
|
||||
} else if (is_stmt<keyword_argument_expression>(this_args[i])) {
|
||||
// default argument used as normal parameter
|
||||
auto kwarg = cast_stmt<keyword_argument_expression>(this_args[i]);
|
||||
if (!is_stmt<identifier>(kwarg->key)) {
|
||||
throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
|
||||
}
|
||||
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
|
||||
value param_value = args.get_kwarg_or_pos(param_name, i);
|
||||
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
|
||||
ctx.set_val(param_name, param_value);
|
||||
} else {
|
||||
throw std::runtime_error("Invalid parameter type in '" + name + "'");
|
||||
}
|
||||
} else {
|
||||
auto & default_arg = this_args[i];
|
||||
if (is_stmt<keyword_argument_expression>(default_arg)) {
|
||||
auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
|
||||
if (!is_stmt<identifier>(kwarg->key)) {
|
||||
throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'");
|
||||
}
|
||||
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
|
||||
JJ_DEBUG(" Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
|
||||
ctx.set_val(param_name, kwarg->val->execute(args.ctx));
|
||||
} else {
|
||||
throw std::runtime_error("Not enough arguments provided to '" + name + "'");
|
||||
}
|
||||
//std::string param_name = cast_stmt<identifier>(default_args[i])->val;
|
||||
//JJ_DEBUG(" Binding parameter '%s' to default", param_name.c_str());
|
||||
//ctx.var[param_name] = default_args[i]->execute(ctx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
value macro_statement::execute_impl(context & ctx) {
|
||||
if (!is_stmt<identifier>(this->name)) {
|
||||
throw std::runtime_error("Macro name must be an identifier");
|
||||
}
|
||||
std::string name = cast_stmt<identifier>(this->name)->val;
|
||||
|
||||
const func_handler func = [this, name](const func_args & args) -> value {
|
||||
context macro_ctx(args.ctx); // new scope for macro execution
|
||||
const func_handler func = [this, name, &ctx](const func_args & args) -> value {
|
||||
size_t expected_count = this->args.size();
|
||||
size_t input_count = args.count();
|
||||
|
||||
bind_parameters(name, this->args, args, macro_ctx);
|
||||
JJ_DEBUG("Invoking macro '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count);
|
||||
context macro_ctx(ctx); // new scope for macro execution
|
||||
|
||||
// bind parameters
|
||||
for (size_t i = 0; i < expected_count; ++i) {
|
||||
if (i < input_count) {
|
||||
if (is_stmt<identifier>(this->args[i])) {
|
||||
// normal parameter
|
||||
std::string param_name = cast_stmt<identifier>(this->args[i])->val;
|
||||
value param_value = args.get_kwarg_or_pos(param_name, i);
|
||||
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
|
||||
macro_ctx.set_val(param_name, param_value);
|
||||
} else if (is_stmt<keyword_argument_expression>(this->args[i])) {
|
||||
// default argument used as normal parameter
|
||||
auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
|
||||
if (!is_stmt<identifier>(kwarg->key)) {
|
||||
throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
|
||||
}
|
||||
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
|
||||
value param_value = args.get_kwarg_or_pos(param_name, i);
|
||||
JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
|
||||
macro_ctx.set_val(param_name, param_value);
|
||||
} else {
|
||||
throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
|
||||
}
|
||||
} else {
|
||||
auto & default_arg = this->args[i];
|
||||
if (is_stmt<keyword_argument_expression>(default_arg)) {
|
||||
auto kwarg = cast_stmt<keyword_argument_expression>(default_arg);
|
||||
if (!is_stmt<identifier>(kwarg->key)) {
|
||||
throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
|
||||
}
|
||||
std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
|
||||
JJ_DEBUG(" Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str());
|
||||
macro_ctx.set_val(param_name, kwarg->val->execute(ctx));
|
||||
} else {
|
||||
throw std::runtime_error("Not enough arguments provided to macro '" + name + "'");
|
||||
}
|
||||
//std::string param_name = cast_stmt<identifier>(default_args[i])->val;
|
||||
//JJ_DEBUG(" Binding parameter '%s' to default", param_name.c_str());
|
||||
//macro_ctx.var[param_name] = default_args[i]->execute(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
// execute macro body
|
||||
JJ_DEBUG("Executing macro '%s' body with %zu statements", name.c_str(), this->body.size());
|
||||
@@ -755,46 +752,6 @@ value macro_statement::execute_impl(context & ctx) {
|
||||
return mk_val<value_undefined>();
|
||||
}
|
||||
|
||||
value call_statement::execute_impl(context & ctx) {
|
||||
auto call_expr = cast_stmt<call_expression>(this->call);
|
||||
if (!call_expr) {
|
||||
throw std::runtime_error("Call statement requires a valid call expression");
|
||||
}
|
||||
|
||||
value callee_val = call_expr->callee->execute(ctx);
|
||||
if (!is_val<value_func>(callee_val)) {
|
||||
throw std::runtime_error("Callee is not a function: got " + callee_val->type());
|
||||
}
|
||||
auto * callee_func = cast_val<value_func>(callee_val);
|
||||
|
||||
context caller_ctx(ctx); // new scope for caller execution
|
||||
|
||||
const func_handler func = [this, caller_ctx = std::move(caller_ctx)](const func_args & args) -> value {
|
||||
context block_ctx(caller_ctx); // new scope for block execution
|
||||
|
||||
bind_parameters("caller", this->caller_args, args, block_ctx);
|
||||
|
||||
JJ_DEBUG("Executing call body with %zu statements", this->body.size());
|
||||
auto res = exec_statements(this->body, block_ctx);
|
||||
JJ_DEBUG("Call body execution complete, result: %s", res->val_str.str().c_str());
|
||||
return res;
|
||||
};
|
||||
|
||||
context call_ctx(ctx);
|
||||
call_ctx.set_val("caller", mk_val<value_func>("caller", func));
|
||||
|
||||
func_args args(call_ctx);
|
||||
|
||||
for (const auto & arg_expr : call_expr->args) {
|
||||
auto arg_val = arg_expr->execute(ctx);
|
||||
JJ_DEBUG(" Argument type: %s", arg_val->type().c_str());
|
||||
args.push_back(arg_val);
|
||||
}
|
||||
|
||||
JJ_DEBUG("Calling macro '%s' with %zu arguments", callee_func->name.c_str(), args.count());
|
||||
return callee_func->invoke(args);
|
||||
}
|
||||
|
||||
value member_expression::execute_impl(context & ctx) {
|
||||
value object = this->object->execute(ctx);
|
||||
|
||||
|
||||
@@ -552,7 +552,6 @@ struct call_statement : public statement {
|
||||
for (const auto & arg : this->caller_args) chk_type<expression>(arg);
|
||||
}
|
||||
std::string type() const override { return "CallStatement"; }
|
||||
value execute_impl(context & ctx) override;
|
||||
};
|
||||
|
||||
struct ternary_expression : public expression {
|
||||
|
||||
@@ -233,27 +233,27 @@ struct BuiltinRule {
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
|
||||
{"boolean", {"(\"true\" | \"false\")", {}}},
|
||||
{"boolean", {"(\"true\" | \"false\") space", {}}},
|
||||
{"decimal-part", {"[0-9]{1,16}", {}}},
|
||||
{"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
|
||||
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}},
|
||||
{"integer", {"(\"-\"? integral-part)", {"integral-part"}}},
|
||||
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
|
||||
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
|
||||
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
|
||||
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}},
|
||||
{"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}},
|
||||
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}},
|
||||
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
|
||||
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
|
||||
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
|
||||
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
|
||||
{"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}},
|
||||
{"null", {"\"null\"", {}}},
|
||||
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
|
||||
{"null", {"\"null\" space", {}}},
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
|
||||
{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
|
||||
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
|
||||
{"date-time", {"date \"T\" time", {"date", "time"}}},
|
||||
{"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}},
|
||||
{"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}},
|
||||
{"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}}
|
||||
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
|
||||
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
|
||||
{"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
|
||||
};
|
||||
|
||||
static bool is_reserved_name(const std::string & name) {
|
||||
@@ -551,16 +551,16 @@ private:
|
||||
}
|
||||
return join_seq();
|
||||
};
|
||||
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"");
|
||||
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
|
||||
}
|
||||
|
||||
/*
|
||||
Returns a rule that matches a JSON string that is none of the provided strings
|
||||
|
||||
not_strings({"a"})
|
||||
-> ["] ( [a] char+ | [^"a] char* )? ["]
|
||||
-> ["] ( [a] char+ | [^"a] char* )? ["] space
|
||||
not_strings({"and", "also"})
|
||||
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
|
||||
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
|
||||
*/
|
||||
std::string _not_strings(const std::vector<std::string> & strings) {
|
||||
|
||||
@@ -619,7 +619,7 @@ private:
|
||||
if (!trie.is_end_of_string) {
|
||||
out << "?";
|
||||
}
|
||||
out << " [\"]";
|
||||
out << " [\"] space";
|
||||
return out.str();
|
||||
}
|
||||
|
||||
@@ -725,7 +725,7 @@ private:
|
||||
rule += " )?";
|
||||
}
|
||||
|
||||
rule += " space \"}\"";
|
||||
rule += " \"}\" space";
|
||||
|
||||
return rule;
|
||||
}
|
||||
@@ -858,14 +858,14 @@ public:
|
||||
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
||||
}
|
||||
if (schema.contains("const")) {
|
||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
|
||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
|
||||
}
|
||||
if (schema.contains("enum")) {
|
||||
std::vector<std::string> enum_values;
|
||||
for (const auto & v : schema["enum"]) {
|
||||
enum_values.push_back(_generate_constant_rule(v));
|
||||
}
|
||||
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")");
|
||||
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "object")
|
||||
&& (schema.contains("properties") ||
|
||||
@@ -933,7 +933,7 @@ public:
|
||||
}
|
||||
}
|
||||
if (!enum_intersection.empty()) {
|
||||
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")");
|
||||
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
|
||||
}
|
||||
}
|
||||
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
|
||||
@@ -948,7 +948,7 @@ public:
|
||||
}
|
||||
rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
|
||||
}
|
||||
rule += " space \"]\"";
|
||||
rule += " \"]\" space";
|
||||
return _add_rule(rule_name, rule);
|
||||
}
|
||||
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
|
||||
@@ -956,7 +956,7 @@ public:
|
||||
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
|
||||
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
|
||||
|
||||
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\"");
|
||||
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
|
||||
return _visit_pattern(schema["pattern"], rule_name);
|
||||
@@ -972,7 +972,7 @@ public:
|
||||
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
|
||||
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
||||
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
||||
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\"");
|
||||
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
||||
}
|
||||
if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
|
||||
int64_t min_value = std::numeric_limits<int64_t>::min();
|
||||
@@ -990,7 +990,7 @@ public:
|
||||
std::stringstream out;
|
||||
out << "(";
|
||||
build_min_max_int(min_value, max_value, out);
|
||||
out << ")";
|
||||
out << ") space";
|
||||
return _add_rule(rule_name, out.str());
|
||||
}
|
||||
if (schema.empty() || schema_type == "object") {
|
||||
|
||||
+78
-118
@@ -6,14 +6,13 @@
|
||||
#include "unicode.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <deque>
|
||||
#include <initializer_list>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
#include <stdexcept>
|
||||
#include <unordered_set>
|
||||
|
||||
// Trick to catch missing branches
|
||||
template <typename T>
|
||||
@@ -89,7 +88,40 @@ struct trie {
|
||||
return match_result{match_result::NO_MATCH};
|
||||
}
|
||||
|
||||
struct prefix_and_next {
|
||||
std::vector<uint32_t> prefix;
|
||||
std::vector<uint32_t> next_chars;
|
||||
};
|
||||
|
||||
std::vector<prefix_and_next> collect_prefix_and_next() {
|
||||
std::vector<uint32_t> prefix;
|
||||
std::vector<prefix_and_next> result;
|
||||
collect_prefix_and_next(0, prefix, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {
|
||||
if (!nodes[index].is_word) {
|
||||
if (!nodes[index].children.empty()) {
|
||||
std::vector<uint32_t> chars;
|
||||
chars.reserve(nodes[index].children.size());
|
||||
for (const auto & p : nodes[index].children) {
|
||||
chars.push_back(p.first);
|
||||
}
|
||||
out.emplace_back(prefix_and_next{prefix, chars});
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & p : nodes[index].children) {
|
||||
uint32_t ch = p.first;
|
||||
auto child = p.second;
|
||||
prefix.push_back(ch);
|
||||
collect_prefix_and_next(child, prefix, out);
|
||||
prefix.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
size_t create_node() {
|
||||
size_t index = nodes.size();
|
||||
nodes.emplace_back();
|
||||
@@ -121,65 +153,6 @@ struct trie {
|
||||
}
|
||||
};
|
||||
|
||||
// Aho-Corasick automaton
|
||||
struct aho_corasick {
|
||||
trie t;
|
||||
std::vector<size_t> fail; // failure links
|
||||
std::vector<size_t> order; // states in BFS order
|
||||
std::vector<bool> terminal; // match states (directly or via a suffix link)
|
||||
std::set<uint32_t> alphabet; // every character with a transition
|
||||
|
||||
aho_corasick(const std::vector<std::string> & strings) : t(strings) {
|
||||
const auto & nodes = t.nodes;
|
||||
const size_t n = nodes.size();
|
||||
|
||||
fail.assign(n, 0);
|
||||
order.reserve(n);
|
||||
|
||||
std::deque<size_t> queue{ 0 };
|
||||
while (!queue.empty()) {
|
||||
size_t u = queue.front();
|
||||
queue.pop_front();
|
||||
order.push_back(u);
|
||||
for (const auto & [ch, v] : nodes[u].children) {
|
||||
if (u != 0) {
|
||||
size_t f = fail[u];
|
||||
while (f && nodes[f].children.find(ch) == nodes[f].children.end()) {
|
||||
f = fail[f];
|
||||
}
|
||||
auto it = nodes[f].children.find(ch);
|
||||
fail[v] = (it != nodes[f].children.end() && it->second != v) ? it->second : 0;
|
||||
}
|
||||
queue.push_back(v);
|
||||
}
|
||||
}
|
||||
|
||||
terminal.assign(n, false);
|
||||
for (size_t u : order) {
|
||||
terminal[u] = nodes[u].is_word || (u != 0 && terminal[fail[u]]);
|
||||
}
|
||||
|
||||
for (const auto & node : nodes) {
|
||||
for (const auto & [ch, v] : node.children) {
|
||||
alphabet.insert(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t num_states() const { return t.nodes.size(); }
|
||||
bool is_terminal(size_t s) const { return terminal[s]; }
|
||||
|
||||
// follow failure links until a transition on `ch` exists.
|
||||
size_t next(size_t state, uint32_t ch) const {
|
||||
const auto & nodes = t.nodes;
|
||||
while (state && nodes[state].children.find(ch) == nodes[state].children.end()) {
|
||||
state = fail[state];
|
||||
}
|
||||
auto it = nodes[state].children.find(ch);
|
||||
return it != nodes[state].children.end() ? it->second : 0;
|
||||
}
|
||||
};
|
||||
|
||||
static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
|
||||
if (pos + hex_count > str.length()) {
|
||||
return {0, 0};
|
||||
@@ -1019,12 +992,12 @@ void common_peg_arena::resolve_refs() {
|
||||
}
|
||||
|
||||
std::string common_peg_arena::dump(common_peg_parser_id id) const {
|
||||
std::set<common_peg_parser_id> visited;
|
||||
std::unordered_set<common_peg_parser_id> visited;
|
||||
return dump_impl(id, visited);
|
||||
}
|
||||
|
||||
std::string common_peg_arena::dump_impl(common_peg_parser_id id,
|
||||
std::set<common_peg_parser_id> & visited) const {
|
||||
std::unordered_set<common_peg_parser_id> & visited) const {
|
||||
// Check for cycles
|
||||
if (visited.count(id)) {
|
||||
return "[cycle]";
|
||||
@@ -1369,7 +1342,7 @@ common_peg_parser common_peg_parser_builder::json_object() {
|
||||
common_peg_parser common_peg_parser_builder::json_array() {
|
||||
return rule("json-array", [this]() {
|
||||
auto ws = space();
|
||||
auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))});
|
||||
auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
|
||||
return sequence({
|
||||
literal("["),
|
||||
ws,
|
||||
@@ -1529,74 +1502,61 @@ static std::string gbnf_escape_char_class(uint32_t c) {
|
||||
return std::string(buf);
|
||||
}
|
||||
|
||||
// GBNF grammar matching strings that contain no string in `strings` as a
|
||||
// substring. Emits the complement of an Aho-Corasick automaton DFA and returns
|
||||
// the start state rule name.
|
||||
//
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/24839
|
||||
static std::string gbnf_excluding_grammar(const common_grammar_builder & builder,
|
||||
const std::string & prefix,
|
||||
const std::vector<std::string> & strings) {
|
||||
aho_corasick ac(strings);
|
||||
static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
|
||||
trie matcher(strings);
|
||||
auto pieces = matcher.collect_prefix_and_next();
|
||||
|
||||
auto state_name = [&](size_t s) -> std::string {
|
||||
if (s == 0) {
|
||||
return prefix;
|
||||
std::string pattern;
|
||||
std::string trailing; // optional proper-prefix of a delimiter, allowed only at the very end
|
||||
for (size_t i = 0; i < pieces.size(); ++i) {
|
||||
if (i > 0) {
|
||||
pattern += " | ";
|
||||
}
|
||||
std::string num = std::to_string(s);
|
||||
num = num.size() == 1 ? ("0" + num) : num;
|
||||
return prefix + "-" + num;
|
||||
};
|
||||
|
||||
auto char_class = [](const std::vector<uint32_t> & chars, bool negate) {
|
||||
std::string s = negate ? "[^" : "[";
|
||||
const auto & pre = pieces[i].prefix;
|
||||
const auto & chars = pieces[i].next_chars;
|
||||
|
||||
std::string cls;
|
||||
cls.reserve(chars.size());
|
||||
for (uint32_t ch : chars) {
|
||||
s += gbnf_escape_char_class(ch);
|
||||
}
|
||||
return s + "]";
|
||||
};
|
||||
|
||||
for (size_t q = 0; q < ac.num_states(); q++) {
|
||||
if (ac.is_terminal(q)) {
|
||||
continue; // match states are dropped
|
||||
cls += gbnf_escape_char_class(ch);
|
||||
}
|
||||
|
||||
std::map<size_t, std::vector<uint32_t>> buckets;
|
||||
std::vector<uint32_t> excluded;
|
||||
for (uint32_t c : ac.alphabet) {
|
||||
size_t d = ac.next(q, c);
|
||||
if (ac.is_terminal(d)) {
|
||||
excluded.push_back(c); // completes a forbidden string -> omit
|
||||
} else if (d != 0) {
|
||||
buckets[d].push_back(c); // specific non-root destination
|
||||
excluded.push_back(c);
|
||||
if (!pre.empty()) {
|
||||
std::string pre_literal = gbnf_format_literal(common_unicode_cpts_to_utf8(pre));
|
||||
pattern += pre_literal + " [^" + cls + "]";
|
||||
// Each interior alternative consumes a delimiter-prefix plus a disambiguating
|
||||
// char, so the repetition alone cannot match a value that *ends* on a proper
|
||||
// prefix of a delimiter (e.g. a trailing "\n" when the delimiter is
|
||||
// "\n</parameter>\n"). The runtime until() (greedy first-match) accepts such
|
||||
// values, so without this the grammar would reject input the parser accepts.
|
||||
// Allow the value to terminate on any proper prefix as an optional tail.
|
||||
// This makes the grammar a slight superset of the runtime language (a value
|
||||
// may end on the longest prefix, which greedy first-match would not itself
|
||||
// produce); harmless for constrained generation, which only needs to admit
|
||||
// every runtime-valid string.
|
||||
if (!trailing.empty()) {
|
||||
trailing += " | ";
|
||||
}
|
||||
trailing += pre_literal;
|
||||
} else {
|
||||
pattern += "[^" + cls + "]";
|
||||
}
|
||||
|
||||
std::string rhs = "|"; // every state is accepting
|
||||
for (const auto & [d, chars] : buckets) {
|
||||
rhs += " " + char_class(chars, false) + " " + state_name(d) + " |";
|
||||
}
|
||||
rhs += " " + char_class(excluded, true) + " " + state_name(0);
|
||||
|
||||
builder.add_rule(state_name(q), rhs);
|
||||
}
|
||||
|
||||
// An empty delimiter makes the start state terminal. Emit an entry rule
|
||||
// that matches nothing so the returned reference stays valid.
|
||||
if (ac.is_terminal(0)) {
|
||||
builder.add_rule(prefix, "|");
|
||||
std::string result = "(" + pattern + ")*";
|
||||
if (!trailing.empty()) {
|
||||
result += " (" + trailing + ")?";
|
||||
}
|
||||
|
||||
return state_name(0);
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::set<std::string> collect_reachable_rules(
|
||||
static std::unordered_set<std::string> collect_reachable_rules(
|
||||
const common_peg_arena & arena,
|
||||
const common_peg_parser_id & rule
|
||||
) {
|
||||
std::set<std::string> reachable;
|
||||
std::set<std::string> visited;
|
||||
std::unordered_set<std::string> reachable;
|
||||
std::unordered_set<std::string> visited;
|
||||
|
||||
std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
|
||||
const auto & parser = arena.get(id);
|
||||
@@ -1805,7 +1765,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
||||
if (p.delimiters.empty()) {
|
||||
return ".*";
|
||||
}
|
||||
return gbnf_excluding_grammar(builder, "until-" + std::to_string(id), p.delimiters);
|
||||
return gbnf_excluding_pattern(p.delimiters);
|
||||
} else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
|
||||
if (schema_delegates(p)) {
|
||||
return to_gbnf(p.child);
|
||||
@@ -1829,7 +1789,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
|
||||
};
|
||||
|
||||
// Collect reachable rules
|
||||
std::set<std::string> reachable_rules;
|
||||
std::unordered_set<std::string> reachable_rules;
|
||||
|
||||
if (lazy) {
|
||||
// Collect rules reachable from trigger rules
|
||||
|
||||
+2
-2
@@ -3,8 +3,8 @@
|
||||
#include <nlohmann/json_fwd.hpp>
|
||||
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <functional>
|
||||
@@ -335,7 +335,7 @@ class common_peg_arena {
|
||||
friend class common_peg_parser_builder;
|
||||
|
||||
private:
|
||||
std::string dump_impl(common_peg_parser_id id, std::set<common_peg_parser_id> & visited) const;
|
||||
std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;
|
||||
|
||||
common_peg_parser_id add_parser(common_peg_parser_variant parser);
|
||||
void add_rule(const std::string & name, common_peg_parser_id id);
|
||||
|
||||
+35
-174
@@ -161,10 +161,6 @@ struct common_speculative_impl {
|
||||
|
||||
virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
|
||||
|
||||
// (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary).
|
||||
virtual bool get_state(llama_seq_id /*seq_id*/, std::vector<uint8_t> & /*data*/) const { return false; }
|
||||
virtual void set_state(llama_seq_id /*seq_id*/, const std::vector<uint8_t> & /*data*/) {}
|
||||
|
||||
// true if this implementation requires the target context to extract post-norm embeddings
|
||||
virtual bool need_embd() const = 0;
|
||||
|
||||
@@ -845,49 +841,6 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
|
||||
(size_t) n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
// we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets:
|
||||
// their single-position checkpoints drop it on restore
|
||||
bool need_boundary_stash() const {
|
||||
const llama_model * model_tgt = llama_get_model(params.ctx_tgt);
|
||||
return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt);
|
||||
}
|
||||
|
||||
bool get_state(llama_seq_id seq_id, std::vector<uint8_t> & data) const override {
|
||||
if (!need_boundary_stash()) {
|
||||
return false;
|
||||
}
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const llama_pos pos = pending_pos_last[seq_id];
|
||||
const std::vector<float> & g = pending_g_last[seq_id];
|
||||
|
||||
data.resize(sizeof(llama_pos) + g.size() * sizeof(float));
|
||||
std::memcpy(data.data(), &pos, sizeof(llama_pos));
|
||||
std::memcpy(data.data() + sizeof(llama_pos), g.data(), g.size() * sizeof(float));
|
||||
return true;
|
||||
}
|
||||
|
||||
void set_state(llama_seq_id seq_id, const std::vector<uint8_t> & data) override {
|
||||
if (!need_boundary_stash()) {
|
||||
return;
|
||||
}
|
||||
if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
|
||||
return;
|
||||
}
|
||||
if (data.size() != sizeof(llama_pos) + (size_t) n_embd_dec * sizeof(float)) {
|
||||
return;
|
||||
}
|
||||
|
||||
llama_pos pos = -1;
|
||||
std::memcpy(&pos, data.data(), sizeof(llama_pos));
|
||||
|
||||
pending_pos_last[seq_id] = pos;
|
||||
pending_g_last[seq_id].resize(n_embd_dec);
|
||||
std::memcpy(pending_g_last[seq_id].data(), data.data() + sizeof(llama_pos), (size_t) n_embd_dec * sizeof(float));
|
||||
}
|
||||
|
||||
bool need_embd() const override {
|
||||
return false;
|
||||
}
|
||||
@@ -905,13 +858,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
|
||||
int32_t n_embd = 0;
|
||||
|
||||
// One MTP draft driver, three modes (set once in the ctor):
|
||||
// is_mem_shared (gemma4): shares the target KV, runs all heads in one graph.
|
||||
// chain_heads (step35): n_mtp_layers trained heads, one per draft step.
|
||||
// neither (qwen35 / qwen35moe): a single trained MTP head.
|
||||
int32_t n_mtp_layers = 1;
|
||||
bool is_mem_shared = false; // gemma4
|
||||
bool chain_heads = false; // derived in the ctor: n_mtp_layers > 1 && !is_mem_shared
|
||||
bool is_mem_shared = false;
|
||||
|
||||
// Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
|
||||
// The last h-row of one process() call needs the first token of the NEXT
|
||||
@@ -926,8 +873,10 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
std::vector<std::vector<float>> verify_h;
|
||||
std::vector<int32_t> verify_h_rows;
|
||||
|
||||
std::vector<int> i_last;
|
||||
std::vector<std::vector<float>> chain_h;
|
||||
// Per-seq draft length from the last draft() call, used in accept() to
|
||||
// roll back ctx_dft's recurrent state past the AR draft's redundant
|
||||
// pre-advancement before process() mirrored the verify batch.
|
||||
std::vector<uint16_t> last_n_drafted;
|
||||
|
||||
common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
|
||||
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq)
|
||||
@@ -940,7 +889,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft));
|
||||
GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) &&
|
||||
"MTP input row width must match the target h_nextn width");
|
||||
n_mtp_layers = std::max(1, (int) llama_model_n_layer_nextn(llama_get_model(ctx_dft)));
|
||||
|
||||
LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
|
||||
LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
|
||||
@@ -987,25 +935,16 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
|
||||
|
||||
is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt;
|
||||
chain_heads = n_mtp_layers > 1 && !is_mem_shared;
|
||||
|
||||
if (chain_heads) {
|
||||
this->params.n_max = std::min(this->params.n_max, n_mtp_layers);
|
||||
|
||||
chain_h.assign(n_seq, {});
|
||||
for (auto & c : chain_h) {
|
||||
c.reserve((size_t) (this->params.n_max + 1) * n_embd);
|
||||
}
|
||||
}
|
||||
|
||||
pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
|
||||
|
||||
i_last.assign(n_seq, -1);
|
||||
i_batch_beg.assign(n_seq, -1);
|
||||
i_batch_end.assign(n_seq, -1);
|
||||
|
||||
verify_h.assign(n_seq, {});
|
||||
verify_h_rows.assign(n_seq, 0);
|
||||
|
||||
last_n_drafted.assign(n_seq, 0);
|
||||
}
|
||||
|
||||
~common_speculative_impl_draft_mtp() override {
|
||||
@@ -1111,34 +1050,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
|
||||
}
|
||||
|
||||
auto * mem_dft = llama_get_memory(ctx_dft);
|
||||
|
||||
bool ok = true;
|
||||
for (int head = 0; head < n_mtp_layers; ++head) {
|
||||
if (chain_heads) {
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/24340/changes#r3413498544
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
if (i_batch_beg[seq_id] < 0) {
|
||||
continue;
|
||||
}
|
||||
llama_memory_seq_rm(mem_dft, seq_id, batch_in.pos[i_batch_beg[seq_id]], -1);
|
||||
}
|
||||
llama_set_nextn_layer_offset(ctx_dft, head);
|
||||
}
|
||||
|
||||
const int32_t rc = llama_decode(ctx_dft, batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_decode(ctx_dft) head=%d failed rc=%d (pos=%d)\n",
|
||||
__func__, head, (int) rc, (int) batch_in.pos[0]);
|
||||
ok = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (chain_heads) {
|
||||
llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
|
||||
}
|
||||
if (!ok) {
|
||||
const int32_t rc = llama_decode(ctx_dft, batch);
|
||||
if (rc != 0) {
|
||||
LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -1173,6 +1087,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
int n_drafting = 0;
|
||||
std::vector<bool> drafting(n_seq);
|
||||
|
||||
const float * h_row = nullptr;
|
||||
const size_t row_bytes = (size_t) n_embd * sizeof(float);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
@@ -1187,43 +1102,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
common_sampler_reset(smpls[seq_id].get());
|
||||
|
||||
common_batch_add(batch, dp.id_last, dp.n_past, { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, pending_h[seq_id].data(), row_bytes);
|
||||
|
||||
i_last[seq_id] = batch.n_tokens - 1;
|
||||
h_row = pending_h[seq_id].data();
|
||||
std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
|
||||
}
|
||||
|
||||
if (chain_heads) {
|
||||
chain_h[seq_id].assign(pending_h[seq_id].begin(), pending_h[seq_id].end());
|
||||
}
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
|
||||
return;
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
|
||||
while (n_drafting > 0) {
|
||||
// each step decodes under a different head, i.e. a different decoder layer, and
|
||||
// KV is per layer. process() filled this layer's KV only for positions < n_past
|
||||
// (prompt + accepted prefix) — nothing in the draft region yet. so reset the
|
||||
// draft region (the seq_rm lower bound is n_past, leaving the prompt KV intact)
|
||||
// and select head i so it rebuilds its own layer's KV there; decoding just the
|
||||
// latest token would leave its attention reading cells only another head wrote.
|
||||
if (chain_heads) {
|
||||
auto * mem_dft = llama_get_memory(ctx_dft);
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
if (drafting[seq_id]) {
|
||||
llama_memory_seq_rm(mem_dft, seq_id, dparams[seq_id].n_past, -1);
|
||||
}
|
||||
}
|
||||
llama_set_nextn_layer_offset(ctx_dft, i);
|
||||
}
|
||||
int i_batch = 0;
|
||||
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
// rebuild the batch for the next step: the growing-KV paths re-add only the
|
||||
// new token (the KV already holds the prefix), while chained heads re-add the
|
||||
// whole prefix at the next head. dropped sequences are simply not re-added.
|
||||
common_batch_clear(batch);
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
@@ -1233,8 +1127,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
|
||||
auto * smpl = smpls[seq_id].get();
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, i_last[seq_id], true);
|
||||
const float * h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_last[seq_id]);
|
||||
common_sampler_sample(smpl, ctx_dft, i_batch, true);
|
||||
h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
|
||||
++i_batch;
|
||||
|
||||
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||
|
||||
@@ -1268,39 +1163,28 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (chain_heads) {
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448031546
|
||||
chain_h[seq_id].insert(chain_h[seq_id].end(), h_row, h_row + n_embd);
|
||||
|
||||
const int n_rows = (int) result.size() + 1; // id_last + tokens drafted so far
|
||||
for (int t = 0; t < n_rows; ++t) {
|
||||
const llama_token tok = (t == 0) ? dp.id_last : result[t - 1];
|
||||
common_batch_add(batch, tok, dp.n_past + t, { seq_id }, t == n_rows - 1);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd,
|
||||
chain_h[seq_id].data() + (size_t) t * n_embd, row_bytes);
|
||||
}
|
||||
} else if (is_mem_shared) {
|
||||
if (is_mem_shared) {
|
||||
// note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens
|
||||
// ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37
|
||||
common_batch_add(batch, id, dp.n_past, { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
|
||||
} else {
|
||||
common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
|
||||
std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes);
|
||||
}
|
||||
|
||||
i_last[seq_id] = batch.n_tokens - 1;
|
||||
std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
++i;
|
||||
}
|
||||
// evaluate the drafted tokens on the draft model
|
||||
ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0) {
|
||||
LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
if (chain_heads) {
|
||||
llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes
|
||||
++i;
|
||||
}
|
||||
|
||||
for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
|
||||
@@ -1312,6 +1196,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl {
|
||||
if (dp.result->size() < (size_t) params.n_min) {
|
||||
dp.result->clear();
|
||||
}
|
||||
|
||||
last_n_drafted[seq_id] = (uint16_t) dp.result->size();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1924,7 +1810,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
|
||||
bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
|
||||
bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
|
||||
bool has_draft_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
|
||||
bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
|
||||
|
||||
|
||||
|
||||
@@ -1962,7 +1848,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
|
||||
if (has_draft_eagle3) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params));
|
||||
}
|
||||
if (has_draft_mtp) {
|
||||
if (has_mtp) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params));
|
||||
}
|
||||
}
|
||||
@@ -2232,31 +2118,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: support the case of more than one speculative implementations having a state
|
||||
bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data) {
|
||||
if (spec == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (auto & impl : spec->impls) {
|
||||
if (impl->get_state(seq_id, data)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data) {
|
||||
if (spec == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto & impl : spec->impls) {
|
||||
impl->set_state(seq_id, data);
|
||||
}
|
||||
}
|
||||
|
||||
void common_speculative_print_stats(const common_speculative * spec) {
|
||||
if (spec == nullptr) {
|
||||
return;
|
||||
|
||||
@@ -68,10 +68,6 @@ void common_speculative_draft(common_speculative * spec);
|
||||
// informs the speculative context that n_accepted tokens were accepted by the target model
|
||||
void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
|
||||
|
||||
// (optional) get/set internal state
|
||||
bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector<uint8_t> & data);
|
||||
void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector<uint8_t> & data);
|
||||
|
||||
// print statistics about the speculative decoding
|
||||
void common_speculative_print_stats(const common_speculative * spec);
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ class BailingMoeV2Model(TextModel):
|
||||
if (rope_dim := hparams.get("head_dim")) is None:
|
||||
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
||||
|
||||
+1
-7
@@ -1119,10 +1119,8 @@ class TextModel(ModelBase):
|
||||
|
||||
rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True)
|
||||
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True)
|
||||
partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True)
|
||||
original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True)
|
||||
|
||||
# Ensure global params are mirrored in rope_parameters
|
||||
# Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters
|
||||
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
|
||||
if local_rope_theta is not None:
|
||||
self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
|
||||
@@ -1130,10 +1128,6 @@ class TextModel(ModelBase):
|
||||
self.rope_parameters["rope_theta"] = rope_theta
|
||||
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
|
||||
self.rope_parameters["rope_type"] = rope_type
|
||||
if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None:
|
||||
self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
|
||||
if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None:
|
||||
self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings
|
||||
|
||||
@classmethod
|
||||
def __init_subclass__(cls):
|
||||
|
||||
@@ -148,7 +148,7 @@ class ChatGLMModel(TextModel):
|
||||
rope_dim = self.hparams["attention_dim"]
|
||||
else:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
rope_freq = 10000
|
||||
if "rope_ratio" in self.hparams:
|
||||
|
||||
+1
-1
@@ -161,7 +161,7 @@ class DeciModel(TextModel):
|
||||
factor = rope_params.get("factor", 8.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
@@ -24,7 +24,7 @@ class ExaoneModel(TextModel):
|
||||
|
||||
assert (hparams["activation_function"] == "silu")
|
||||
|
||||
rotary_factor = self.rope_parameters.get("partial_rotary_factor")
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
|
||||
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
|
||||
@@ -39,7 +39,7 @@ class ExaoneModel(TextModel):
|
||||
factor = rope_params.get("factor", 8.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
@@ -104,7 +104,7 @@ class Exaone4Model(TextModel):
|
||||
factor = rope_params.get("factor", 16.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
+1
-1
@@ -693,7 +693,7 @@ class Gemma4Model(Gemma3Model):
|
||||
self.gguf_writer.add_head_count_kv(value_arr)
|
||||
|
||||
# handle n_rot differently for global vs swa layers
|
||||
partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors
|
||||
n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa)
|
||||
self.gguf_writer.add_rope_dimension_count(n_rot_full)
|
||||
|
||||
+2
-2
@@ -124,7 +124,7 @@ class Glm4MoeModel(TextModel):
|
||||
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
)
|
||||
self.gguf_writer.add_rope_dimension_count(
|
||||
int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))
|
||||
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
|
||||
)
|
||||
|
||||
# MoE parameters - Use only routed expert count (shared experts handled separately)
|
||||
@@ -226,7 +226,7 @@ class GlmMoeDsaModel(DeepseekV2Model):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
rope_dim = self.hparams["qk_rope_head_dim"]
|
||||
partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor))
|
||||
|
||||
# NextN/MTP prediction layers
|
||||
|
||||
+1
-1
@@ -289,7 +289,7 @@ class LlamaModel(TextModel):
|
||||
factor = rope_params.get("factor", 8.0)
|
||||
low_freq_factor = rope_params.get("low_freq_factor", 1.0)
|
||||
high_freq_factor = rope_params.get("high_freq_factor", 4.0)
|
||||
old_context_len = rope_params.get("original_max_position_embeddings", 8192)
|
||||
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
+1
-1
@@ -154,7 +154,7 @@ class MimoV2Model(TextModel):
|
||||
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
|
||||
|
||||
rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"])
|
||||
rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"])
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
|
||||
|
||||
+10
-6
@@ -32,9 +32,11 @@ class MiniCPMModel(TextModel):
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
|
||||
long_factors = self.rope_parameters.get('long_factor')
|
||||
short_factors = self.rope_parameters.get('short_factor')
|
||||
if long_factors or short_factors:
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if rope_scaling is not None:
|
||||
long_factors = rope_scaling.get('long_factor', None)
|
||||
short_factors = rope_scaling.get('short_factor', None)
|
||||
|
||||
if long_factors is None or short_factors is None:
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
@@ -83,11 +85,13 @@ class MiniCPM3Model(TextModel):
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
|
||||
long_factors = self.rope_parameters.get('long_factor')
|
||||
short_factors = self.rope_parameters.get('short_factor')
|
||||
if long_factors or short_factors:
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if rope_scaling is not None:
|
||||
rope_dims = self.hparams["qk_rope_head_dim"]
|
||||
|
||||
long_factors = rope_scaling.get('long_factor', None)
|
||||
short_factors = rope_scaling.get('short_factor', None)
|
||||
|
||||
if long_factors is None or short_factors is None:
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
|
||||
@@ -125,18 +125,17 @@ class NemotronModel(TextModel):
|
||||
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
|
||||
|
||||
# * Partial RoPE
|
||||
rot_pct = self.rope_parameters["partial_rotary_factor"]
|
||||
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||
|
||||
# * RopeScaling for Nemotron
|
||||
factor = self.hparams.get("factor") or self.rope_parameters.get("factor")
|
||||
if factor is None:
|
||||
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
else:
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(factor)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
|
||||
|
||||
+11
-9
@@ -18,7 +18,7 @@ class Phi2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.PHI2
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
rot_pct = self.rope_parameters["partial_rotary_factor"]
|
||||
rot_pct = self.find_hparam(["partial_rotary_factor"])
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
|
||||
@@ -149,8 +149,8 @@ class Phi3MiniModel(TextModel):
|
||||
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
|
||||
rms_eps = self.find_hparam(["rms_norm_eps"])
|
||||
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
|
||||
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
rope_dims = int(rot_pct * n_embd) // n_head
|
||||
|
||||
self.gguf_writer.add_context_length(max_pos_embds)
|
||||
@@ -174,19 +174,18 @@ class Phi3MiniModel(TextModel):
|
||||
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
|
||||
orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"]
|
||||
rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0)
|
||||
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
|
||||
rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
|
||||
rope_dims = int(rot_pct * n_embd) // n_head
|
||||
|
||||
# write rope scaling for long context (128k) model
|
||||
long_factors = self.rope_parameters.get('long_factor')
|
||||
short_factors = self.rope_parameters.get('short_factor')
|
||||
if not long_factors:
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if rope_scaling is None:
|
||||
return
|
||||
|
||||
scale = max_pos_embds / orig_max_pos_embds
|
||||
|
||||
rope_scaling_type = self.rope_parameters.get('rope_type', '').lower()
|
||||
rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower()
|
||||
if len(rope_scaling_type) == 0:
|
||||
raise KeyError('Missing the required key rope_scaling.type')
|
||||
|
||||
@@ -199,6 +198,9 @@ class Phi3MiniModel(TextModel):
|
||||
|
||||
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
|
||||
|
||||
long_factors = rope_scaling.get('long_factor', None)
|
||||
short_factors = rope_scaling.get('short_factor', None)
|
||||
|
||||
if long_factors is None or short_factors is None:
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
|
||||
+1
-1
@@ -280,7 +280,7 @@ class Qwen3NextModel(Qwen2MoeModel):
|
||||
self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4))
|
||||
if (rope_dim := self.hparams.get("head_dim")) is None:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25)))
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
|
||||
@@ -28,7 +28,7 @@ class StableLMModel(TextModel):
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
rotary_factor = self.rope_parameters["partial_rotary_factor"]
|
||||
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
|
||||
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
||||
|
||||
+1
-1
@@ -314,7 +314,7 @@ class Step35Model(TextModel):
|
||||
factor = float(rope_params.get("factor", 8.0))
|
||||
low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
|
||||
high_freq_factor = float(rope_params.get("high_freq_factor", 4.0))
|
||||
old_context_len = int(rope_params.get("original_max_position_embeddings", 8192))
|
||||
old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192)))
|
||||
|
||||
low_freq_wavelen = old_context_len / low_freq_factor
|
||||
high_freq_wavelen = old_context_len / high_freq_factor
|
||||
|
||||
@@ -198,18 +198,18 @@ class BuiltinRule:
|
||||
SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'
|
||||
|
||||
PRIMITIVE_RULES = {
|
||||
'boolean' : BuiltinRule('("true" | "false")', []),
|
||||
'boolean' : BuiltinRule('("true" | "false") space', []),
|
||||
'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
|
||||
'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
|
||||
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']),
|
||||
'integer' : BuiltinRule('("-"? integral-part)', ['integral-part']),
|
||||
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
|
||||
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
|
||||
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
|
||||
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']),
|
||||
'array' : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']),
|
||||
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []),
|
||||
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
|
||||
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
|
||||
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
|
||||
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
|
||||
'string' : BuiltinRule(r'"\"" char* "\""', ['char']),
|
||||
'null' : BuiltinRule('"null"', []),
|
||||
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
|
||||
'null' : BuiltinRule('"null" space', []),
|
||||
}
|
||||
|
||||
# TODO: support "uri", "email" string formats
|
||||
@@ -217,9 +217,9 @@ STRING_FORMAT_RULES = {
|
||||
'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
|
||||
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
|
||||
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
|
||||
'date-string' : BuiltinRule('"\\"" date "\\""', ['date']),
|
||||
'time-string' : BuiltinRule('"\\"" time "\\""', ['time']),
|
||||
'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']),
|
||||
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
|
||||
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
|
||||
'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
|
||||
}
|
||||
|
||||
DOTALL = '[\\U00000000-\\U0010FFFF]'
|
||||
@@ -319,7 +319,7 @@ class SchemaConverter:
|
||||
out.append(f'[^"{"".join(rejects)}] {char_rule}*')
|
||||
visit(trie)
|
||||
|
||||
out.append(f' ){"" if trie.is_end_of_string else "?"} ["]')
|
||||
out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
|
||||
return ''.join(out)
|
||||
|
||||
def _add_rule(self, name, rule):
|
||||
@@ -549,7 +549,7 @@ class SchemaConverter:
|
||||
return self._add_rule(
|
||||
name,
|
||||
to_rule(transform()) if self._raw_pattern \
|
||||
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"")
|
||||
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
|
||||
|
||||
|
||||
def _resolve_ref(self, ref):
|
||||
@@ -580,10 +580,10 @@ class SchemaConverter:
|
||||
return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
|
||||
|
||||
elif 'const' in schema:
|
||||
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
|
||||
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
|
||||
|
||||
elif 'enum' in schema:
|
||||
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')'
|
||||
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
|
||||
return self._add_rule(rule_name, rule)
|
||||
|
||||
elif schema_type in (None, 'object') and \
|
||||
@@ -624,7 +624,7 @@ class SchemaConverter:
|
||||
enum_intersection &= s
|
||||
|
||||
if enum_intersection:
|
||||
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')'
|
||||
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
|
||||
return self._add_rule(rule_name, rule)
|
||||
|
||||
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
|
||||
@@ -638,12 +638,12 @@ class SchemaConverter:
|
||||
' "," space '.join(
|
||||
self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
|
||||
for i, item in enumerate(items)) +
|
||||
' space "]"')
|
||||
' "]" space')
|
||||
else:
|
||||
item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
|
||||
min_items = schema.get("minItems", 0)
|
||||
max_items = schema.get("maxItems")
|
||||
return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"')
|
||||
return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
|
||||
|
||||
elif schema_type in (None, 'string') and 'pattern' in schema:
|
||||
return self._visit_pattern(schema['pattern'], rule_name)
|
||||
@@ -663,7 +663,7 @@ class SchemaConverter:
|
||||
min_len = schema.get('minLength', 0)
|
||||
max_len = schema.get('maxLength')
|
||||
|
||||
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""')
|
||||
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
|
||||
|
||||
elif schema_type in (None, 'integer') and \
|
||||
('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
|
||||
@@ -680,7 +680,7 @@ class SchemaConverter:
|
||||
|
||||
out = ["("]
|
||||
_generate_min_max_int(min_value, max_value, out)
|
||||
out.append(")")
|
||||
out.append(") space")
|
||||
return self._add_rule(rule_name, ''.join(out))
|
||||
|
||||
elif (schema_type == 'object') or (len(schema) == 0):
|
||||
@@ -765,7 +765,7 @@ class SchemaConverter:
|
||||
rule += ' )'
|
||||
rule += ' )?'
|
||||
|
||||
rule += ' space "}"'
|
||||
rule += ' "}" space'
|
||||
|
||||
return rule
|
||||
|
||||
|
||||
@@ -341,9 +341,6 @@ set(GGML_PUBLIC_HEADERS
|
||||
include/gguf.h)
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
#if (GGML_METAL)
|
||||
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
|
||||
#endif()
|
||||
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
|
||||
install(TARGETS ggml-base LIBRARY)
|
||||
|
||||
|
||||
@@ -2417,14 +2417,15 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te
|
||||
// Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size
|
||||
GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size);
|
||||
|
||||
parallel_for_ggml(params, n_batch * M, [&](int begin, int end) {
|
||||
for (int idx = begin; idx < end; ++idx) {
|
||||
int batch_idx = idx / M;
|
||||
int m = idx % M;
|
||||
parallel_for_ggml(params, n_batch, [&](int begin, int end) {
|
||||
for (int batch_idx = begin; batch_idx < end; ++batch_idx) {
|
||||
int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2);
|
||||
const float * A_data = (const float *)((const char *)src1->data + src1_offset);
|
||||
char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A;
|
||||
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
|
||||
|
||||
for (int m = 0; m < M; ++m) {
|
||||
from_float<vec_dot_type>(A_data + m * K, wdata_batch + m * row_size_A, K);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@@ -183,25 +183,24 @@ static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) {
|
||||
// transposed into VTCM.
|
||||
//
|
||||
// VTCM layouts (per thread):
|
||||
// src1_T : {d_inner_stride, d_conv} - staged once per launch (small).
|
||||
// src0_T : {d_inner_tile, ncs} - staged per d_inner-tile.
|
||||
// src1_T : {d_inner_per_thread, d_conv} — staged once per launch (small).
|
||||
// src0_T : {d_inner_tile, ncs} — staged per d_inner-tile.
|
||||
//
|
||||
// d_inner_tile is chosen so that per-thread VTCM stays under the budget.
|
||||
// Each thread iterates ceil(d_inner_per_thread d_inner_tile) tiles serially.
|
||||
#define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread
|
||||
|
||||
// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_stride, d_conv} (VTCM)
|
||||
// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM)
|
||||
static inline void transpose_src1(const float * src1_data,
|
||||
uint32_t src1_stride_inner,
|
||||
uint32_t i1_off,
|
||||
uint32_t d_inner_per_thread,
|
||||
uint32_t d_inner_stride,
|
||||
uint32_t d_conv,
|
||||
float * src1_T) {
|
||||
for (uint32_t i = 0; i < d_inner_per_thread; ++i) {
|
||||
const float * src_row = src1_data + (i1_off + i) * src1_stride_inner;
|
||||
for (uint32_t j = 0; j < d_conv; ++j) {
|
||||
src1_T[j * d_inner_stride + i] = src_row[j];
|
||||
src1_T[j * d_inner_per_thread + i] = src_row[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -281,7 +280,6 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
|
||||
}
|
||||
|
||||
const uint32_t d_inner_per_thread = ir1 - ir0;
|
||||
const uint32_t d_inner_stride = scctx->nrows_per_thread;
|
||||
const uint32_t d_inner_tile = scctx->d_inner_tile;
|
||||
|
||||
const float * src0_data = (const float *) src0->data;
|
||||
@@ -292,8 +290,8 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
|
||||
float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread);
|
||||
float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread);
|
||||
|
||||
// Stage src1 weights once into VTCM in {d_inner_stride, d_conv} layout.
|
||||
transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_inner_stride, d_conv, src1_T);
|
||||
// Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout.
|
||||
transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T);
|
||||
|
||||
const uint32_t C_TILE = VLEN_FP32;
|
||||
|
||||
@@ -316,7 +314,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void
|
||||
HVX_Vector acc = hvx_vec_splat_f32(0.0f);
|
||||
for (uint32_t j = 0; j < d_conv; ++j) {
|
||||
HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb);
|
||||
HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_stride + tile_off + cb);
|
||||
HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb);
|
||||
acc = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w));
|
||||
}
|
||||
HVX_Vector res = Q6_Vsf_equals_Vqf32(acc);
|
||||
@@ -364,7 +362,8 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) {
|
||||
use_hvx = 1;
|
||||
}
|
||||
|
||||
scctx.nrows_per_thread = hex_round_up((d_inner + n_threads - 1) / n_threads, VLEN_FP32);
|
||||
scctx.nrows_per_thread = (d_inner + n_threads - 1) / n_threads;
|
||||
scctx.nrows_per_thread += (scctx.nrows_per_thread & 1);
|
||||
|
||||
const uint32_t d_inner_per_thread = scctx.nrows_per_thread;
|
||||
const uint32_t ncs = src0->ne[0];
|
||||
|
||||
@@ -24,62 +24,119 @@ if (GGML_METAL_NDEBUG)
|
||||
endif()
|
||||
|
||||
set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
|
||||
set(METALLIB_KERNELS_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/kernels/common.h")
|
||||
set(METALLIB_KERNELS_DEQUANTIZE "${CMAKE_CURRENT_SOURCE_DIR}/kernels/dequantize.h")
|
||||
set(METALLIB_KERNELS_QUANTIZE "${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantize.h")
|
||||
|
||||
set(METALLIB_KERNEL_SOURCES
|
||||
kernels/fa.metal
|
||||
kernels/mul_mv.metal
|
||||
kernels/mul_mm.metal
|
||||
kernels/quantize.metal
|
||||
kernels/softmax.metal
|
||||
kernels/norm.metal
|
||||
kernels/unary.metal
|
||||
kernels/binbcast.metal
|
||||
kernels/reduce.metal
|
||||
kernels/tri.metal
|
||||
kernels/ssm.metal
|
||||
kernels/wkv.metal
|
||||
kernels/gated_delta_net.metal
|
||||
kernels/solve_tri.metal
|
||||
kernels/rope.metal
|
||||
kernels/conv.metal
|
||||
kernels/upscale.metal
|
||||
kernels/argsort.metal
|
||||
kernels/pool.metal
|
||||
kernels/misc.metal
|
||||
)
|
||||
|
||||
if (GGML_METAL_EMBED_LIBRARY)
|
||||
enable_language(ASM)
|
||||
|
||||
add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
|
||||
|
||||
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
|
||||
set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
|
||||
|
||||
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
|
||||
|
||||
# merge ggml-common.h and ggml-metal.metal into a single file
|
||||
set(METALLIB_EMBED_ASM "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
|
||||
set(METALLIB_SOURCE_EMBED "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
|
||||
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
|
||||
set(METALLIB_EMBED_ASM_FILES "")
|
||||
foreach(src ${METALLIB_KERNEL_SOURCES})
|
||||
get_filename_component(kind ${src} NAME_WE)
|
||||
# symbol names must be valid C identifiers ('-' is not allowed)
|
||||
string(REPLACE "-" "_" kind_sym ${kind})
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "Embedding Metal library"
|
||||
COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}"
|
||||
COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}"
|
||||
COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}"
|
||||
COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}"
|
||||
DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
|
||||
COMMENT "Generate assembly for embedded Metal library"
|
||||
VERBATIM
|
||||
)
|
||||
set(SRC "${CMAKE_CURRENT_SOURCE_DIR}/kernels/${kind}.metal")
|
||||
set(EMBED "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed-${kind}.metal")
|
||||
set(ASM "${CMAKE_CURRENT_BINARY_DIR}/autogenerated/ggml-metal-embed-${kind}.s")
|
||||
|
||||
target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
|
||||
# only prepend headers that this source actually includes
|
||||
set(HEADERS_FOR_SRC ${METALLIB_KERNELS_COMMON})
|
||||
file(STRINGS ${SRC} _has_dequantize REGEX "#include \"dequantize\\.h\"")
|
||||
file(STRINGS ${SRC} _has_quantize REGEX "#include \"quantize\\.h\"")
|
||||
if(_has_dequantize)
|
||||
list(APPEND HEADERS_FOR_SRC ${METALLIB_KERNELS_DEQUANTIZE})
|
||||
endif()
|
||||
if(_has_quantize)
|
||||
list(APPEND HEADERS_FOR_SRC ${METALLIB_KERNELS_QUANTIZE})
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${ASM}"
|
||||
# Step 1: concatenate shared headers + this kernel source
|
||||
COMMAND cat ${HEADERS_FOR_SRC} ${SRC} > "${EMBED}.tmp1"
|
||||
# Step 2: remove internal #include and #pragma once
|
||||
COMMAND sed -e "/\#include \"common.h\"/d" -e "/\#include \"dequantize.h\"/d" -e "/\#include \"quantize.h\"/d" -e "/\#pragma once/d" < "${EMBED}.tmp1" > "${EMBED}.tmp2"
|
||||
# Step 3: inline ggml-common.h (replacing __embed_ggml-common.h__ sentinel)
|
||||
COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${EMBED}.tmp2" > "${EMBED}.tmp3"
|
||||
# Step 4: inline ggml-metal-impl.h
|
||||
COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${EMBED}.tmp3" > "${EMBED}"
|
||||
# Step 5: emit an asm chunk with kind-specific start/end symbols
|
||||
# note: '-' is illegal in C symbols, so we use kind_sym; the macOS
|
||||
# section name is limited to 16 chars so we keep it shared
|
||||
# across kinds (__ggml_metallib) and only vary the global symbols.
|
||||
COMMAND echo ".section __DATA,__ggml_metallib" > "${ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_${kind_sym}_start" >> "${ASM}"
|
||||
COMMAND echo "_ggml_metallib_${kind_sym}_start:" >> "${ASM}"
|
||||
COMMAND echo .incbin "\"${EMBED}\"" >> "${ASM}"
|
||||
COMMAND echo ".globl _ggml_metallib_${kind_sym}_end" >> "${ASM}"
|
||||
COMMAND echo "_ggml_metallib_${kind_sym}_end:" >> "${ASM}"
|
||||
DEPENDS ../ggml-common.h ggml-metal-impl.h
|
||||
kernels/common.h kernels/dequantize.h kernels/quantize.h
|
||||
kernels/${kind}.metal
|
||||
COMMENT "Generate embedded Metal library for ${kind}"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
list(APPEND METALLIB_EMBED_ASM_FILES "${ASM}")
|
||||
endforeach()
|
||||
|
||||
target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM_FILES})
|
||||
else()
|
||||
# copy metal files to bin directory
|
||||
# copy header files to bin directory
|
||||
configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
|
||||
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
|
||||
configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
|
||||
|
||||
file(MAKE_DIRECTORY "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels")
|
||||
configure_file(kernels/common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels/common.h COPYONLY)
|
||||
configure_file(kernels/dequantize.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels/dequantize.h COPYONLY)
|
||||
configure_file(kernels/quantize.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels/quantize.h COPYONLY)
|
||||
|
||||
foreach(src ${METALLIB_KERNEL_SOURCES})
|
||||
configure_file(${src} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${src} COPYONLY)
|
||||
endforeach()
|
||||
|
||||
if (GGML_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
||||
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
|
||||
#
|
||||
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
|
||||
# disabling fast math is needed in order to pass tests/test-backend-ops
|
||||
# note: disabling fast math is needed in order to pass tests/test-backend-ops
|
||||
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
|
||||
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
|
||||
# ref: https://github.com/ggml-org/whisper.cpp/issues/1720
|
||||
# note: adding -g causes segmentation fault during compile
|
||||
#set(XC_FLAGS -fno-fast-math -fno-inline -g)
|
||||
set(XC_FLAGS -fno-fast-math -fno-inline)
|
||||
else()
|
||||
set(XC_FLAGS -O3)
|
||||
endif()
|
||||
|
||||
# Append macOS metal versioning flags
|
||||
if (GGML_METAL_MACOSX_VERSION_MIN)
|
||||
message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
|
||||
list (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
|
||||
@@ -90,35 +147,46 @@ else()
|
||||
list (APPEND XC_FLAGS -std=${GGML_METAL_STD})
|
||||
endif()
|
||||
|
||||
# Compile each kernel source to .air, then link into default.metallib
|
||||
set(AIR_FILES "")
|
||||
foreach(src ${METALLIB_KERNEL_SOURCES})
|
||||
get_filename_component(name ${src} NAME_WE)
|
||||
set(AIR "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${name}.air")
|
||||
list(APPEND AIR_FILES ${AIR})
|
||||
add_custom_command(
|
||||
OUTPUT ${AIR}
|
||||
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -I ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${src} -o ${AIR}
|
||||
DEPENDS ${src} kernels/common.h kernels/dequantize.h kernels/quantize.h ${METALLIB_COMMON} ggml-metal-impl.h
|
||||
COMMENT "Compiling ${src}"
|
||||
VERBATIM
|
||||
)
|
||||
endforeach()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o - |
|
||||
xcrun -sdk macosx metallib - -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
COMMAND xcrun -sdk macosx metallib ${AIR_FILES} -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
|
||||
DEPENDS ggml-metal.metal ${METALLIB_COMMON}
|
||||
COMMENT "Compiling Metal kernels"
|
||||
)
|
||||
COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h
|
||||
COMMAND rm -rf ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/kernels
|
||||
DEPENDS ${AIR_FILES}
|
||||
COMMENT "Linking Metal kernels into default.metallib"
|
||||
)
|
||||
|
||||
# FIXME: only add to the ggml-metal target?
|
||||
add_custom_target(
|
||||
ggml-metal-lib ALL
|
||||
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
)
|
||||
)
|
||||
endif() # GGML_METAL_EMBED_LIBRARY
|
||||
|
||||
if (NOT GGML_METAL_EMBED_LIBRARY)
|
||||
install(
|
||||
FILES src/ggml-metal/ggml-metal.metal
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
GROUP_READ
|
||||
WORLD_READ
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||
DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/kernels/
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}/kernels
|
||||
FILES_MATCHING PATTERN "*.metal" PATTERN "*.h"
|
||||
)
|
||||
|
||||
install(
|
||||
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
)
|
||||
install(
|
||||
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -94,8 +94,63 @@ int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_wi
|
||||
return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
|
||||
}
|
||||
|
||||
//
|
||||
// MTLLibrary collection (one library per op-source, compiled separately)
|
||||
//
|
||||
|
||||
// Single source of truth for the per-kind metal libraries. The order here
|
||||
// defines the enum values and every per-kind table below, so adding a library
|
||||
// is a one-line change here (plus adding its source to CMakeLists.txt).
|
||||
// X(suffix, name): name is both the kernels/<name>.metal basename and the
|
||||
// ggml_metallib_<name>_{start,end} embed-symbol stem.
|
||||
#define GGML_METAL_LIBS \
|
||||
X(FA, fa) \
|
||||
X(MUL_MV, mul_mv) \
|
||||
X(MUL_MM, mul_mm) \
|
||||
X(QUANTIZE, quantize) \
|
||||
X(SOFTMAX, softmax) \
|
||||
X(NORM, norm) \
|
||||
X(UNARY, unary) \
|
||||
X(BINBCAST, binbcast) \
|
||||
X(REDUCE, reduce) \
|
||||
X(TRI, tri) \
|
||||
X(SSM, ssm) \
|
||||
X(WKV, wkv) \
|
||||
X(GATED_DELTA_NET, gated_delta_net)\
|
||||
X(SOLVE_TRI, solve_tri) \
|
||||
X(ROPE, rope) \
|
||||
X(CONV, conv) \
|
||||
X(UPSCALE, upscale) \
|
||||
X(ARGSORT, argsort) \
|
||||
X(POOL, pool) \
|
||||
X(MISC, misc)
|
||||
|
||||
enum ggml_metal_lib_kind {
|
||||
#define X(e, s) GGML_METAL_LIB_##e,
|
||||
GGML_METAL_LIBS
|
||||
#undef X
|
||||
GGML_METAL_LIB_COUNT,
|
||||
};
|
||||
|
||||
static const char * const k_lib_names[GGML_METAL_LIB_COUNT] = {
|
||||
#define X(e, s) [GGML_METAL_LIB_##e] = #s,
|
||||
GGML_METAL_LIBS
|
||||
#undef X
|
||||
};
|
||||
|
||||
struct ggml_metal_library {
|
||||
id<MTLLibrary> obj;
|
||||
// Per-kind compiled libraries. When single_library is true, the whole library
|
||||
// (e.g. a pre-compiled default.metallib or a from-source build) lives at
|
||||
// objs[0] and the remaining slots are nil.
|
||||
id<MTLLibrary> objs[GGML_METAL_LIB_COUNT];
|
||||
bool single_library; // true: combined library at objs[0]; false: per-kind libs in objs[*]
|
||||
|
||||
// Routing table: kernel function name -> objs[] index, populated from each
|
||||
// compiled library's -[MTLLibrary functionNames]. The actual compiled
|
||||
// libraries are the single source of truth for which library owns a kernel,
|
||||
// so adding kernels later requires no manual routing maintenance.
|
||||
// nil in single_library mode (everything resolves to objs[0]).
|
||||
NSMutableDictionary<NSString *, NSNumber *> * fn_to_lib;
|
||||
|
||||
ggml_metal_device_t dev;
|
||||
ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
|
||||
@@ -103,160 +158,376 @@ struct ggml_metal_library {
|
||||
NSLock * lock;
|
||||
};
|
||||
|
||||
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
||||
id<MTLLibrary> library = nil;
|
||||
id<MTLDevice> device = ggml_metal_device_get_obj(dev);
|
||||
// Build the fn_to_lib routing table by querying each compiled library's public
|
||||
// function names. Call once after all per-kind libraries have been compiled.
|
||||
static void ggml_metal_library_build_index(ggml_metal_library_t lib) {
|
||||
@autoreleasepool {
|
||||
NSMutableDictionary<NSString *, NSNumber *> * index = [[NSMutableDictionary alloc] init];
|
||||
for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) {
|
||||
for (NSString * fname in [lib->objs[kind] functionNames]) {
|
||||
index[fname] = @(kind);
|
||||
}
|
||||
}
|
||||
lib->fn_to_lib = index;
|
||||
}
|
||||
}
|
||||
|
||||
// load library
|
||||
//
|
||||
// - first check if the library is embedded
|
||||
// - then check if the library is in the bundle
|
||||
// - if not found, load the source and compile it
|
||||
// - if that fails, return NULL
|
||||
//
|
||||
// TODO: move to a function
|
||||
{
|
||||
const int64_t t_start = ggml_time_us();
|
||||
// Parse a `#include "name"` line. Returns the quoted name in *include_name on
|
||||
// success. Whitespace-tolerant; ignores `#include <...>` (system headers).
|
||||
static bool ggml_metal_library_parse_quoted_include(NSString * line, NSString ** include_name) {
|
||||
NSScanner * scanner = [NSScanner scannerWithString:line];
|
||||
scanner.charactersToBeSkipped = [NSCharacterSet whitespaceCharacterSet];
|
||||
|
||||
NSError * error = nil;
|
||||
NSString * src = nil;
|
||||
if (![scanner scanString:@"#" intoString:NULL] ||
|
||||
![scanner scanString:@"include" intoString:NULL] ||
|
||||
![scanner scanString:@"\"" intoString:NULL]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
NSString * name = nil;
|
||||
if (![scanner scanUpToString:@"\"" intoString:&name]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
extern const char ggml_metallib_start[];
|
||||
extern const char ggml_metallib_end[];
|
||||
if (include_name) {
|
||||
*include_name = name;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
|
||||
#else
|
||||
// Recursively inline `#include "name"` directives. System includes (<...>),
|
||||
// `#if/#else/#endif`, and other preprocessor lines are passed through to the
|
||||
// Metal compiler unchanged. `#pragma once` is dropped since `seen` already
|
||||
// guards against double-inclusion.
|
||||
static bool ggml_metal_library_flatten_file(NSMutableString * dst, NSString * path,
|
||||
NSArray<NSString *> * search_paths,
|
||||
NSMutableSet<NSString *> * seen, NSError ** error) {
|
||||
NSString * key = [path stringByStandardizingPath];
|
||||
if ([seen containsObject:key]) {
|
||||
return true;
|
||||
}
|
||||
[seen addObject:key];
|
||||
|
||||
#ifdef SWIFT_PACKAGE
|
||||
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:error];
|
||||
if (!src) {
|
||||
return false;
|
||||
}
|
||||
|
||||
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (path_lib == nil) {
|
||||
// Try to find the resource in the directory where the current binary located.
|
||||
NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0];
|
||||
NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent];
|
||||
NSFileManager * fm = [NSFileManager defaultManager];
|
||||
for (NSString * line in [src componentsSeparatedByString:@"\n"]) {
|
||||
NSString * trimmed = [line stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceCharacterSet]];
|
||||
if ([trimmed isEqualToString:@"#pragma once"]) {
|
||||
continue;
|
||||
}
|
||||
|
||||
NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
|
||||
if ([[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
|
||||
GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]);
|
||||
|
||||
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:path_lib_default error:&error];
|
||||
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
|
||||
// Optionally, if this is a symlink, try to resolve it.
|
||||
path_lib_default = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:path_lib_default error:&error];
|
||||
if (path_lib_default && [path_lib_default length] > 0 && ![[path_lib_default substringToIndex:1] isEqualToString:@"/"]) {
|
||||
// It is a relative path, adding the binary directory as directory prefix.
|
||||
path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]];
|
||||
}
|
||||
if (!path_lib_default || ![[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
|
||||
// Link to the resource could not be resolved.
|
||||
path_lib_default = nil;
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]);
|
||||
}
|
||||
NSString * include_name = nil;
|
||||
if (ggml_metal_library_parse_quoted_include(line, &include_name)) {
|
||||
NSString * resolved = nil;
|
||||
for (NSString * dir in search_paths) {
|
||||
NSString * candidate = [dir stringByAppendingPathComponent:include_name];
|
||||
if ([fm isReadableFileAtPath:candidate]) {
|
||||
resolved = candidate;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// The resource couldn't be found in the binary's directory.
|
||||
path_lib_default = nil;
|
||||
}
|
||||
|
||||
path_lib = path_lib_default;
|
||||
if (!resolved) {
|
||||
if (error) {
|
||||
NSString * msg = [NSString stringWithFormat:@"could not resolve include \"%@\" from '%@'", include_name, path];
|
||||
*error = [NSError errorWithDomain:@"ggml-metal-source-flatten" code:1
|
||||
userInfo:@{NSLocalizedDescriptionKey: msg}];
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (!ggml_metal_library_flatten_file(dst, resolved, search_paths, seen, error)) {
|
||||
return false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (path_lib != nil) {
|
||||
// pre-compiled library found
|
||||
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
||||
[dst appendString:line];
|
||||
[dst appendString:@"\n"];
|
||||
}
|
||||
|
||||
library = [device newLibraryWithURL:libURL error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return nil;
|
||||
}
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
return true;
|
||||
}
|
||||
|
||||
NSString * path_source;
|
||||
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
||||
static NSString * ggml_metal_library_flatten_source(NSString * path_source, NSError ** error) {
|
||||
// Search paths cover both runtime layout (build/bin/kernels + build/bin)
|
||||
// and source-tree layout (ggml/src/ggml-metal/kernels + ggml/src/ggml-metal + ggml/src).
|
||||
NSString * path_kernels = [path_source stringByDeletingLastPathComponent];
|
||||
NSString * path_base = [path_kernels stringByDeletingLastPathComponent];
|
||||
NSArray<NSString *> * search_paths = @[
|
||||
path_kernels,
|
||||
path_base,
|
||||
[path_base stringByDeletingLastPathComponent],
|
||||
];
|
||||
|
||||
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
|
||||
NSMutableString * src = [[NSMutableString alloc] init];
|
||||
NSMutableSet<NSString *> * seen = [NSMutableSet set];
|
||||
|
||||
if (path_resource) {
|
||||
path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal.metal"];
|
||||
} else {
|
||||
path_source = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
if (!ggml_metal_library_flatten_file(src, path_source, search_paths, seen, error)) {
|
||||
[src release];
|
||||
return nil;
|
||||
}
|
||||
return src;
|
||||
}
|
||||
|
||||
// Compile all per-kind libraries in parallel. `source_for_kind` returns the MSL
|
||||
// source for a kind (the helper takes ownership and releases it), or nil with
|
||||
// *err set on failure. On success the objs[] slots are populated and the routing
|
||||
// index is built; on any failure every error is logged and false is returned
|
||||
// (the caller is responsible for freeing `res`).
|
||||
static bool ggml_metal_library_compile_all(
|
||||
ggml_metal_library_t res,
|
||||
id<MTLDevice> device,
|
||||
NSDictionary * prep,
|
||||
NSString * (^source_for_kind)(int kind, NSError ** err),
|
||||
const char * origin) {
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
int64_t * t_per_lib = calloc(GGML_METAL_LIB_COUNT, sizeof(int64_t));
|
||||
NSError ** err_per_lib = calloc(GGML_METAL_LIB_COUNT, sizeof(NSError *));
|
||||
__block atomic_bool any_failure = false;
|
||||
|
||||
dispatch_group_t group = dispatch_group_create();
|
||||
dispatch_queue_t queue = dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0);
|
||||
|
||||
for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) {
|
||||
dispatch_group_async(group, queue, ^{
|
||||
|
||||
const int64_t t0 = ggml_time_us();
|
||||
|
||||
NSError * error = nil;
|
||||
|
||||
NSString * src = source_for_kind(kind, &error);
|
||||
if (!src) {
|
||||
err_per_lib[kind] = [error retain];
|
||||
atomic_store(&any_failure, true);
|
||||
return;
|
||||
}
|
||||
|
||||
if (path_source == nil) {
|
||||
GGML_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
|
||||
path_source = @"ggml-metal.metal";
|
||||
}
|
||||
id<MTLLibrary> lib = nil;
|
||||
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
||||
|
||||
src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return nil;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!library) {
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
if (ggml_metal_device_get_props(dev)->has_bfloat) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
|
||||
}
|
||||
|
||||
if (ggml_metal_device_get_props(dev)->has_tensor) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
|
||||
MTLCompileOptions * options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
//[options setFastMathEnabled:false];
|
||||
lib = [device newLibraryWithSource:src options:options error:&error];
|
||||
|
||||
library = [device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return nil;
|
||||
}
|
||||
|
||||
#if !__has_feature(objc_arc)
|
||||
[options release];
|
||||
#endif
|
||||
|
||||
// retain the error before the autorelease pool drains it
|
||||
if (!lib) {
|
||||
err_per_lib[kind] = [error retain];
|
||||
}
|
||||
}
|
||||
|
||||
[src release];
|
||||
|
||||
t_per_lib[kind] = ggml_time_us() - t0;
|
||||
|
||||
if (!lib) {
|
||||
atomic_store(&any_failure, true);
|
||||
return;
|
||||
}
|
||||
|
||||
res->objs[kind] = lib;
|
||||
});
|
||||
}
|
||||
dispatch_group_wait(group, DISPATCH_TIME_FOREVER);
|
||||
dispatch_release(group);
|
||||
|
||||
const bool ok = !atomic_load(&any_failure);
|
||||
|
||||
if (ok) {
|
||||
const int64_t t_total = ggml_time_us() - t_start;
|
||||
int64_t t_max = 0;
|
||||
for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) {
|
||||
GGML_LOG_DEBUG("%s: compiled '%s' library in %.3f sec\n",
|
||||
__func__, k_lib_names[kind], t_per_lib[kind] / 1e6);
|
||||
if (t_per_lib[kind] > t_max) t_max = t_per_lib[kind];
|
||||
}
|
||||
GGML_LOG_INFO("%s: loaded %d libraries from %s in %.3f sec (max single = %.3f sec)\n",
|
||||
__func__, GGML_METAL_LIB_COUNT, origin, t_total / 1e6, t_max / 1e6);
|
||||
|
||||
ggml_metal_library_build_index(res);
|
||||
} else {
|
||||
for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) {
|
||||
if (err_per_lib[kind]) {
|
||||
GGML_LOG_ERROR("%s: failed to build '%s' library: %s\n", __func__,
|
||||
k_lib_names[kind], [[err_per_lib[kind] description] UTF8String]);
|
||||
[err_per_lib[kind] release];
|
||||
}
|
||||
}
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[src release];
|
||||
#endif // GGML_METAL_EMBED_LIBRARY
|
||||
|
||||
GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
|
||||
}
|
||||
|
||||
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
|
||||
free(err_per_lib);
|
||||
free(t_per_lib);
|
||||
|
||||
res->obj = library;
|
||||
return ok;
|
||||
}
|
||||
|
||||
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
|
||||
id<MTLDevice> device = ggml_metal_device_get_obj(dev);
|
||||
|
||||
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
|
||||
res->dev = dev;
|
||||
res->pipelines = ggml_metal_pipelines_init();
|
||||
res->lock = [NSLock new];
|
||||
|
||||
// shared MTLCompileOptions preprocessor macros (matches the build-time defines)
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
if (ggml_metal_device_get_props(dev)->has_bfloat) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"];
|
||||
}
|
||||
if (ggml_metal_device_get_props(dev)->has_tensor) {
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"];
|
||||
}
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
[prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"];
|
||||
#endif
|
||||
|
||||
#if GGML_METAL_EMBED_LIBRARY
|
||||
GGML_LOG_INFO("%s: using embedded metal library\n", __func__);
|
||||
|
||||
// start/end symbols emitted by CMake (see CMakeLists.txt), one pair per kind
|
||||
#define X(e, s) extern const char ggml_metallib_##s##_start[]; extern const char ggml_metallib_##s##_end[];
|
||||
GGML_METAL_LIBS
|
||||
#undef X
|
||||
|
||||
static const char * const lib_start[GGML_METAL_LIB_COUNT] = {
|
||||
#define X(e, s) [GGML_METAL_LIB_##e] = ggml_metallib_##s##_start,
|
||||
GGML_METAL_LIBS
|
||||
#undef X
|
||||
};
|
||||
static const char * const lib_end[GGML_METAL_LIB_COUNT] = {
|
||||
#define X(e, s) [GGML_METAL_LIB_##e] = ggml_metallib_##s##_end,
|
||||
GGML_METAL_LIBS
|
||||
#undef X
|
||||
};
|
||||
|
||||
const bool ok = ggml_metal_library_compile_all(res, device, prep,
|
||||
^NSString * (int kind, NSError ** err) {
|
||||
(void) err;
|
||||
return [[NSString alloc] initWithBytes:lib_start[kind]
|
||||
length:(lib_end[kind] - lib_start[kind])
|
||||
encoding:NSUTF8StringEncoding];
|
||||
}, "embedded data");
|
||||
|
||||
if (!ok) {
|
||||
ggml_metal_library_free(res);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return res;
|
||||
#else
|
||||
#ifdef SWIFT_PACKAGE
|
||||
NSBundle * bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
NSError * error = nil;
|
||||
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (path_lib == nil) {
|
||||
// Try to find the resource in the directory where the current binary located.
|
||||
NSString * bin_cur = [[NSProcessInfo processInfo] arguments][0];
|
||||
NSString * bin_dir = [bin_cur stringByDeletingLastPathComponent];
|
||||
|
||||
NSString * path_lib_default = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]];
|
||||
if ([[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
|
||||
GGML_LOG_INFO("%s: found '%s'\n", __func__, [path_lib_default UTF8String]);
|
||||
|
||||
NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:path_lib_default error:&error];
|
||||
if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) {
|
||||
// Optionally, if this is a symlink, try to resolve it.
|
||||
path_lib_default = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:path_lib_default error:&error];
|
||||
if (path_lib_default && [path_lib_default length] > 0 && ![[path_lib_default substringToIndex:1] isEqualToString:@"/"]) {
|
||||
// It is a relative path, adding the binary directory as directory prefix.
|
||||
path_lib_default = [NSString pathWithComponents:@[bin_dir, path_lib_default]];
|
||||
}
|
||||
if (!path_lib_default || ![[NSFileManager defaultManager] isReadableFileAtPath:path_lib_default]) {
|
||||
// Link to the resource could not be resolved.
|
||||
path_lib_default = nil;
|
||||
} else {
|
||||
GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [path_lib_default UTF8String]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The resource couldn't be found in the binary's directory.
|
||||
path_lib_default = nil;
|
||||
}
|
||||
|
||||
path_lib = path_lib_default;
|
||||
}
|
||||
|
||||
if (path_lib != nil) {
|
||||
// pre-compiled library found: a single combined default.metallib
|
||||
NSURL * libURL = [NSURL fileURLWithPath:path_lib];
|
||||
GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
|
||||
|
||||
res->objs[0] = [device newLibraryWithURL:libURL error:&error];
|
||||
res->single_library = true;
|
||||
if (!res->objs[0]) {
|
||||
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
ggml_metal_library_free(res);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loaded in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6);
|
||||
return res;
|
||||
}
|
||||
|
||||
// no pre-compiled metallib: fall back to compiling each kernel source separately
|
||||
GGML_LOG_INFO("%s: default.metallib not found, loading kernel sources\n", __func__);
|
||||
|
||||
NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
||||
if (path_resource) {
|
||||
GGML_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, [path_resource UTF8String]);
|
||||
}
|
||||
|
||||
// resolve each kind's source path up front (file lookup/logging stays on the calling thread)
|
||||
NSString ** path_per_kind = calloc(GGML_METAL_LIB_COUNT, sizeof(NSString *));
|
||||
for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) {
|
||||
NSString * rel = [NSString stringWithFormat:@"kernels/%s.metal", k_lib_names[kind]];
|
||||
|
||||
NSString * path_source = nil;
|
||||
if (path_resource) {
|
||||
path_source = [path_resource stringByAppendingPathComponent:rel];
|
||||
} else {
|
||||
NSString * stem = [NSString stringWithFormat:@"kernels/%s", k_lib_names[kind]];
|
||||
path_source = [bundle pathForResource:stem ofType:@"metal"];
|
||||
}
|
||||
|
||||
if (path_source == nil || ![[NSFileManager defaultManager] isReadableFileAtPath:path_source]) {
|
||||
GGML_LOG_WARN("%s: could not locate %s in bundle, falling back to cwd\n", __func__, [rel UTF8String]);
|
||||
path_source = rel;
|
||||
}
|
||||
|
||||
GGML_LOG_DEBUG("%s: loading '%s'\n", __func__, [path_source UTF8String]);
|
||||
|
||||
path_per_kind[kind] = [path_source retain];
|
||||
}
|
||||
|
||||
const bool ok = ggml_metal_library_compile_all(res, device, prep,
|
||||
^NSString * (int kind, NSError ** err) {
|
||||
return ggml_metal_library_flatten_source(path_per_kind[kind], err);
|
||||
}, "source");
|
||||
|
||||
for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) {
|
||||
[path_per_kind[kind] release];
|
||||
}
|
||||
free(path_per_kind);
|
||||
|
||||
if (!ok) {
|
||||
ggml_metal_library_free(res);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return res;
|
||||
#endif
|
||||
}
|
||||
|
||||
ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) {
|
||||
@@ -318,10 +589,11 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
|
||||
return NULL;
|
||||
}
|
||||
|
||||
res->obj = library;
|
||||
res->dev = dev;
|
||||
res->pipelines = ggml_metal_pipelines_init();
|
||||
res->lock = [NSLock new];
|
||||
res->objs[0] = library;
|
||||
res->single_library = true;
|
||||
res->dev = dev;
|
||||
res->pipelines = ggml_metal_pipelines_init();
|
||||
res->lock = [NSLock new];
|
||||
|
||||
return res;
|
||||
}
|
||||
@@ -331,8 +603,14 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (lib->obj) {
|
||||
[lib->obj release];
|
||||
for (int kind = 0; kind < GGML_METAL_LIB_COUNT; ++kind) {
|
||||
if (lib->objs[kind]) {
|
||||
[lib->objs[kind] release];
|
||||
}
|
||||
}
|
||||
|
||||
if (lib->fn_to_lib) {
|
||||
[lib->fn_to_lib release];
|
||||
}
|
||||
|
||||
ggml_metal_pipelines_free(lib->pipelines);
|
||||
@@ -393,11 +671,28 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_
|
||||
|
||||
GGML_LOG_DEBUG("%s: compiling pipeline: base = '%s', name = '%s'\n", __func__, base, name);
|
||||
|
||||
// route to the library that actually defines this kernel; fn_to_lib is
|
||||
// built from -[MTLLibrary functionNames] so it's always in sync
|
||||
int lib_idx = 0;
|
||||
if (!lib->single_library) {
|
||||
NSNumber * idx = lib->fn_to_lib[base_func];
|
||||
if (!idx) {
|
||||
[lib->lock unlock];
|
||||
|
||||
GGML_LOG_ERROR("%s: kernel not found in any metal library: base = '%s', name = '%s'\n", __func__, base, name);
|
||||
|
||||
return res;
|
||||
}
|
||||
lib_idx = [idx intValue];
|
||||
}
|
||||
|
||||
id<MTLLibrary> mtl_lib = lib->objs[lib_idx];
|
||||
|
||||
id<MTLFunction> mtl_function;
|
||||
if (!cv) {
|
||||
mtl_function = [lib->obj newFunctionWithName:base_func];
|
||||
mtl_function = [mtl_lib newFunctionWithName:base_func];
|
||||
} else {
|
||||
mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
|
||||
mtl_function = [mtl_lib newFunctionWithName:base_func constantValues:cv->obj error:&error];
|
||||
}
|
||||
if (!mtl_function) {
|
||||
[lib->lock unlock];
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,232 @@
|
||||
#include "common.h"
|
||||
|
||||
// bitonic sort implementation following the CUDA kernels as reference
|
||||
typedef void (argsort_t)(
|
||||
constant ggml_metal_kargs_argsort & args,
|
||||
device const char * src0,
|
||||
device int32_t * dst,
|
||||
threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template<ggml_sort_order order>
|
||||
kernel void kernel_argsort_f32_i32(
|
||||
constant ggml_metal_kargs_argsort & args,
|
||||
device const char * src0,
|
||||
device int32_t * dst,
|
||||
threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
// bitonic sort
|
||||
const int col = tpitg[0];
|
||||
const int ib = tgpig[0] / args.ne01;
|
||||
|
||||
const int i00 = ib*ntg.x;
|
||||
const int i01 = tgpig[0] % args.ne01;
|
||||
const int i02 = tgpig[1];
|
||||
const int i03 = tgpig[2];
|
||||
|
||||
device const float * src0_row = (device const float *) (src0 + args.nb01*i01 + args.nb02*i02 + args.nb03*i03);
|
||||
|
||||
// initialize indices
|
||||
shmem_i32[col] = i00 + col;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (int k = 2; k <= ntg.x; k *= 2) {
|
||||
for (int j = k / 2; j > 0; j /= 2) {
|
||||
int ixj = col ^ j;
|
||||
if (ixj > col) {
|
||||
if ((col & k) == 0) {
|
||||
if (shmem_i32[col] >= args.ne00 ||
|
||||
(shmem_i32[ixj] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
|
||||
src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]] :
|
||||
src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]]))
|
||||
) {
|
||||
SWAP(shmem_i32[col], shmem_i32[ixj]);
|
||||
}
|
||||
} else {
|
||||
if (shmem_i32[ixj] >= args.ne00 ||
|
||||
(shmem_i32[col] < args.ne00 && (order == GGML_SORT_ORDER_ASC ?
|
||||
src0_row[shmem_i32[col]] < src0_row[shmem_i32[ixj]] :
|
||||
src0_row[shmem_i32[col]] > src0_row[shmem_i32[ixj]]))
|
||||
) {
|
||||
SWAP(shmem_i32[col], shmem_i32[ixj]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t i0 = ib*args.top_k;
|
||||
|
||||
// copy the result to dst without the padding
|
||||
if (i0 + col < args.ne0 && col < args.top_k) {
|
||||
dst += i0 + args.ne0*i01 + args.ne0*args.ne1*i02 + args.ne0*args.ne1*args.ne2*i03;
|
||||
|
||||
dst[col] = shmem_i32[col];
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_ASC>;
|
||||
template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32<GGML_SORT_ORDER_DESC>;
|
||||
|
||||
typedef void (argsort_merge_t)(
|
||||
constant ggml_metal_kargs_argsort_merge & args,
|
||||
device const char * src0,
|
||||
device const int32_t * tmp,
|
||||
device int32_t * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template<ggml_sort_order order>
|
||||
kernel void kernel_argsort_merge_f32_i32(
|
||||
constant ggml_metal_kargs_argsort_merge & args,
|
||||
device const char * src0,
|
||||
device const int32_t * tmp,
|
||||
device int32_t * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int im = tgpig[0] / args.ne01;
|
||||
const int i01 = tgpig[0] % args.ne01;
|
||||
const int i02 = tgpig[1];
|
||||
const int i03 = tgpig[2];
|
||||
|
||||
const int start = im * (2 * args.len);
|
||||
|
||||
const int len0 = MIN(args.len, MAX(0, args.ne0 - (int)(start)));
|
||||
const int len1 = MIN(args.len, MAX(0, args.ne0 - (int)(start + args.len)));
|
||||
|
||||
const int total = len0 + len1;
|
||||
|
||||
device const int32_t * tmp0 = tmp + start
|
||||
+ i01*args.ne0
|
||||
+ i02*args.ne0*args.ne01
|
||||
+ i03*args.ne0*args.ne01*args.ne02;
|
||||
|
||||
device const int32_t * tmp1 = tmp0 + args.len;
|
||||
|
||||
dst += start
|
||||
+ i01*args.top_k
|
||||
+ i02*args.top_k*args.ne01
|
||||
+ i03*args.top_k*args.ne01*args.ne02;
|
||||
|
||||
device const float * src0_row = (device const float *)(src0
|
||||
+ args.nb01*i01
|
||||
+ args.nb02*i02
|
||||
+ args.nb03*i03);
|
||||
|
||||
if (total == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int chunk = (total + ntg.x - 1) / ntg.x;
|
||||
|
||||
const int k0 = tpitg.x * chunk;
|
||||
const int k1 = MIN(MIN(k0 + chunk, total), args.top_k);
|
||||
|
||||
if (k0 >= args.top_k) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (k0 >= total) {
|
||||
return;
|
||||
}
|
||||
|
||||
int low = k0 > len1 ? k0 - len1 : 0;
|
||||
int high = MIN(k0, len0);
|
||||
|
||||
// binary-search partition (i, j) such that i + j = k
|
||||
while (low < high) {
|
||||
const int mid = (low + high) >> 1;
|
||||
|
||||
const int32_t idx0 = tmp0[mid];
|
||||
const int32_t idx1 = tmp1[k0 - mid - 1];
|
||||
|
||||
const float val0 = src0_row[idx0];
|
||||
const float val1 = src0_row[idx1];
|
||||
|
||||
bool take_left;
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
take_left = (val0 <= val1);
|
||||
} else {
|
||||
take_left = (val0 >= val1);
|
||||
}
|
||||
|
||||
if (take_left) {
|
||||
low = mid + 1;
|
||||
} else {
|
||||
high = mid;
|
||||
}
|
||||
}
|
||||
|
||||
int i = low;
|
||||
int j = k0 - i;
|
||||
|
||||
// keep the merge fronts into registers
|
||||
int32_t idx0 = 0;
|
||||
float val0 = 0.0f;
|
||||
if (i < len0) {
|
||||
idx0 = tmp0[i];
|
||||
val0 = src0_row[idx0];
|
||||
}
|
||||
|
||||
int32_t idx1 = 0;
|
||||
float val1 = 0.0f;
|
||||
if (j < len1) {
|
||||
idx1 = tmp1[j];
|
||||
val1 = src0_row[idx1];
|
||||
}
|
||||
|
||||
for (int k = k0; k < k1; ++k) {
|
||||
int32_t out_idx;
|
||||
|
||||
if (i >= len0) {
|
||||
while (k < k1) {
|
||||
dst[k++] = tmp1[j++];
|
||||
}
|
||||
break;
|
||||
} else if (j >= len1) {
|
||||
while (k < k1) {
|
||||
dst[k++] = tmp0[i++];
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
bool take_left;
|
||||
|
||||
if (order == GGML_SORT_ORDER_ASC) {
|
||||
take_left = (val0 <= val1);
|
||||
} else {
|
||||
take_left = (val0 >= val1);
|
||||
}
|
||||
|
||||
if (take_left) {
|
||||
out_idx = idx0;
|
||||
++i;
|
||||
if (i < len0) {
|
||||
idx0 = tmp0[i];
|
||||
val0 = src0_row[idx0];
|
||||
}
|
||||
} else {
|
||||
out_idx = idx1;
|
||||
++j;
|
||||
if (j < len1) {
|
||||
idx1 = tmp1[j];
|
||||
val1 = src0_row[idx1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dst[k] = out_idx;
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_argsort_merge_f32_i32_asc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32<GGML_SORT_ORDER_ASC>;
|
||||
template [[host_name("kernel_argsort_merge_f32_i32_desc")]] kernel argsort_merge_t kernel_argsort_merge_f32_i32<GGML_SORT_ORDER_DESC>;
|
||||
@@ -0,0 +1,226 @@
|
||||
#include "common.h"
|
||||
|
||||
// OP: 0 - add, 1 - sub, 2 - mul, 3 - div
|
||||
constant short FC_bin_op [[function_constant(FC_BIN + 0)]];
|
||||
constant short FC_bin_f [[function_constant(FC_BIN + 1)]];
|
||||
constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]];
|
||||
constant bool FC_bin_cb [[function_constant(FC_BIN + 3)]];
|
||||
|
||||
template <typename T0, typename T1, typename T>
|
||||
kernel void kernel_bin_fuse_impl(
|
||||
constant ggml_metal_kargs_bin & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
#define FC_OP FC_bin_op
|
||||
#define FC_F FC_bin_f
|
||||
#define FC_RB FC_bin_rb
|
||||
#define FC_CB FC_bin_cb
|
||||
|
||||
if (FC_RB) {
|
||||
// row broadcast
|
||||
const uint i0 = tgpig.y*args.ne00 + tgpig.x;
|
||||
const uint i1 = FC_CB ? tgpig.x%args.ne10 : tgpig.x;
|
||||
|
||||
device const T0 * src0_row = (device const T0 *) (src0);
|
||||
device T * dst_row = (device T *) (dst);
|
||||
|
||||
if (FC_F == 1) {
|
||||
device const T1 * src1_row = (device const T1 *) (src1 + args.o1[0]);
|
||||
|
||||
if (FC_OP == 0) {
|
||||
dst_row[i0] = src0_row[i0] + src1_row[i1];
|
||||
}
|
||||
|
||||
if (FC_OP == 1) {
|
||||
dst_row[i0] = src0_row[i0] - src1_row[i1];
|
||||
}
|
||||
|
||||
if (FC_OP == 2) {
|
||||
dst_row[i0] = src0_row[i0] * src1_row[i1];
|
||||
}
|
||||
|
||||
if (FC_OP == 3) {
|
||||
dst_row[i0] = src0_row[i0] / src1_row[i1];
|
||||
}
|
||||
} else {
|
||||
T0 res = src0_row[i0];
|
||||
|
||||
if (FC_OP == 0) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res += ((device const T1 *) (src1 + args.o1[j]))[i1];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_OP == 1) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res -= ((device const T1 *) (src1 + args.o1[j]))[i1];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_OP == 2) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res *= ((device const T1 *) (src1 + args.o1[j]))[i1];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_OP == 3) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res /= ((device const T1 *) (src1 + args.o1[j]))[i1];
|
||||
}
|
||||
}
|
||||
|
||||
dst_row[i0] = res;
|
||||
}
|
||||
} else {
|
||||
const int i03 = tgpig.z;
|
||||
const int i02 = tgpig.y;
|
||||
const int i01 = tgpig.x;
|
||||
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i13 = i03%args.ne13;
|
||||
const int i12 = i02%args.ne12;
|
||||
const int i11 = i01%args.ne11;
|
||||
|
||||
device const T0 * src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs);
|
||||
device T * dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs);
|
||||
|
||||
if (FC_F == 1) {
|
||||
device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const int i10 = FC_CB ? i0%args.ne10 : i0;
|
||||
|
||||
if (FC_OP == 0) {
|
||||
dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10];
|
||||
}
|
||||
|
||||
if (FC_OP == 1) {
|
||||
dst_ptr[i0] = src0_ptr[i0] - src1_ptr[i10];
|
||||
}
|
||||
|
||||
if (FC_OP == 2) {
|
||||
dst_ptr[i0] = src0_ptr[i0] * src1_ptr[i10];
|
||||
}
|
||||
|
||||
if (FC_OP == 3) {
|
||||
dst_ptr[i0] = src0_ptr[i0] / src1_ptr[i10];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
device const T1 * src1_ptr[8];
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
src1_ptr[j] = (device const T1 *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11);
|
||||
}
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const int i10 = FC_CB ? i0%args.ne10 : i0;
|
||||
|
||||
T res = src0_ptr[i0];
|
||||
|
||||
if (FC_OP == 0) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res += src1_ptr[j][i10];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_OP == 1) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res -= src1_ptr[j][i10];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_OP == 2) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res *= src1_ptr[j][i10];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_OP == 3) {
|
||||
FOR_UNROLL (short j = 0; j < FC_F; ++j) {
|
||||
res /= src1_ptr[j][i10];
|
||||
}
|
||||
}
|
||||
|
||||
dst_ptr[i0] = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#undef FC_OP
|
||||
#undef FC_F
|
||||
#undef FC_RB
|
||||
#undef FC_CB
|
||||
}
|
||||
|
||||
typedef decltype(kernel_bin_fuse_impl<float, float, float>) kernel_bin_fuse_t;
|
||||
|
||||
template [[host_name("kernel_bin_fuse_f32_f32_f32")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl<float, float, float>;
|
||||
template [[host_name("kernel_bin_fuse_f32_f32_f32_4")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl<float4, float4, float4>;
|
||||
|
||||
kernel void kernel_add_id(
|
||||
constant ggml_metal_kargs_add_id & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i1 = tgpig.x;
|
||||
const int i2 = tgpig.y;
|
||||
|
||||
const int i11 = *((device const int32_t *) (src2 + i1*sizeof(int32_t) + i2*args.nb21));
|
||||
|
||||
const size_t nb1 = args.ne0 * sizeof(float);
|
||||
const size_t nb2 = args.ne1 * nb1;
|
||||
|
||||
device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2);
|
||||
device const float * src0_row = (device const float *)((device char *)src0 + i1*args.nb01 + i2*args.nb02);
|
||||
device const float * src1_row = (device const float *)((device char *)src1 + i11*args.nb11);
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
dst_row[i0] = src0_row[i0] + src1_row[i0];
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_repeat(
|
||||
constant ggml_metal_kargs_repeat & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i3 = tgpig.z;
|
||||
const int i2 = tgpig.y;
|
||||
const int i1 = tgpig.x;
|
||||
|
||||
const int i03 = i3%args.ne03;
|
||||
const int i02 = i2%args.ne02;
|
||||
const int i01 = i1%args.ne01;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01;
|
||||
device char * dst_ptr = dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const int i00 = i0%args.ne00;
|
||||
*((device T *)(dst_ptr + i0*args.nb0)) = *((device T *)(src0_ptr + i00*args.nb00));
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_repeat<float>) kernel_repeat_t;
|
||||
|
||||
template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
|
||||
template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_repeat_bf16")]] kernel kernel_repeat_t kernel_repeat<bfloat>;
|
||||
#endif
|
||||
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
|
||||
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
|
||||
@@ -0,0 +1,126 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-metal-impl.h"
|
||||
|
||||
#include <metal_stdlib>
|
||||
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
#include <metal_tensor>
|
||||
|
||||
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
|
||||
#endif
|
||||
|
||||
using namespace metal;
|
||||
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; }
|
||||
|
||||
#define PAD2(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
||||
|
||||
#define FOR_UNROLL(x) _Pragma("clang loop unroll(full)") for (x)
|
||||
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
|
||||
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
|
||||
//
|
||||
// cmd:
|
||||
// .../usr/bin/metal -dM -E -c ggml/src/ggml-metal/kernels/<src>.metal
|
||||
// .../usr/bin/metal -dM -E -c -target air64-apple-ios14.0 ggml/src/ggml-metal/kernels/<src>.metal
|
||||
//
|
||||
#if __METAL_VERSION__ < 310 && defined(GGML_METAL_HAS_BF16)
|
||||
#undef GGML_METAL_HAS_BF16
|
||||
#endif
|
||||
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
typedef matrix<bfloat, 4, 4> bfloat4x4;
|
||||
typedef matrix<bfloat, 2, 4> bfloat2x4;
|
||||
#endif
|
||||
|
||||
constexpr constant static float kvalues_iq4nl_f[16] = {
|
||||
-127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f
|
||||
};
|
||||
|
||||
constexpr constant static float kvalues_mxfp4_f[16] = {
|
||||
0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
|
||||
};
|
||||
|
||||
static inline int best_index_int8(int n, constant float * val, float x) {
|
||||
if (x <= val[0]) return 0;
|
||||
if (x >= val[n-1]) return n-1;
|
||||
int ml = 0, mu = n-1;
|
||||
while (mu-ml > 1) {
|
||||
int mav = (ml+mu)/2;
|
||||
if (x < val[mav]) mu = mav; else ml = mav;
|
||||
}
|
||||
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
||||
}
|
||||
|
||||
static inline float e8m0_to_fp32(uint8_t x) {
|
||||
uint32_t bits;
|
||||
|
||||
if (x == 0) {
|
||||
bits = 0x00400000;
|
||||
} else {
|
||||
bits = (uint32_t) x << 23;
|
||||
}
|
||||
|
||||
return as_type<float>(bits);
|
||||
}
|
||||
|
||||
static inline float dot(float x, float y) {
|
||||
return x*y;
|
||||
}
|
||||
|
||||
static inline float sum(float x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline float sum(float4 x) {
|
||||
return x[0] + x[1] + x[2] + x[3];
|
||||
}
|
||||
|
||||
enum ggml_sort_order {
|
||||
GGML_SORT_ORDER_ASC,
|
||||
GGML_SORT_ORDER_DESC,
|
||||
};
|
||||
|
||||
constant float GELU_COEF_A = 0.044715f;
|
||||
constant float GELU_QUICK_COEF = -1.702f;
|
||||
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
constant float SQRT_2_INV = 0.70710678118654752440084436210484f;
|
||||
|
||||
// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
|
||||
// ref: https://www.johndcook.com/blog/python_erf/
|
||||
constant float p_erf = 0.3275911f;
|
||||
constant float a1_erf = 0.254829592f;
|
||||
constant float a2_erf = -0.284496736f;
|
||||
constant float a3_erf = 1.421413741f;
|
||||
constant float a4_erf = -1.453152027f;
|
||||
constant float a5_erf = 1.061405429f;
|
||||
|
||||
template<typename T>
|
||||
inline T erf_approx(T x) {
|
||||
T sign_x = sign(x);
|
||||
x = fabs(x);
|
||||
T t = 1.0f / (1.0f + p_erf * x);
|
||||
T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
|
||||
return sign_x * y;
|
||||
}
|
||||
|
||||
template<typename T> T elu_approx(T x);
|
||||
|
||||
template<> inline float elu_approx<float>(float x) {
|
||||
return (x > 0.f) ? x : (exp(x) - 1);
|
||||
}
|
||||
|
||||
template<> inline float4 elu_approx<float4>(float4 x) {
|
||||
float4 res;
|
||||
|
||||
res[0] = (x[0] > 0.0f) ? x[0] : (exp(x[0]) - 1.0f);
|
||||
res[1] = (x[1] > 0.0f) ? x[1] : (exp(x[1]) - 1.0f);
|
||||
res[2] = (x[2] > 0.0f) ? x[2] : (exp(x[2]) - 1.0f);
|
||||
res[3] = (x[3] > 0.0f) ? x[3] : (exp(x[3]) - 1.0f);
|
||||
|
||||
return res;
|
||||
}
|
||||
@@ -0,0 +1,485 @@
|
||||
#include "common.h"
|
||||
|
||||
typedef void (im2col_t)(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_im2col(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
// const int64_t IC = tgpg[0];
|
||||
const int64_t OH = tgpg[1];
|
||||
const int64_t OW = tgpg[2];
|
||||
|
||||
const int64_t KH = ntg[1];
|
||||
const int64_t KW = ntg[2];
|
||||
|
||||
int64_t in = tpitg[0];
|
||||
const int64_t ikh = tpitg[1];
|
||||
const int64_t ikw = tpitg[2];
|
||||
|
||||
const int64_t iic = tgpig[0];
|
||||
const int64_t ioh = tgpig[1];
|
||||
const int64_t iow = tgpig[2];
|
||||
|
||||
const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0;
|
||||
const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1;
|
||||
|
||||
int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw);
|
||||
|
||||
device T * pdst = (device T *) (dst);
|
||||
|
||||
if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
while (in < args.N) {
|
||||
pdst[offset_dst] = 0.0f;
|
||||
offset_dst += ntg[0]*args.CHW*OH*OW;
|
||||
|
||||
in += ntg[0];
|
||||
}
|
||||
} else {
|
||||
int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw;
|
||||
|
||||
while (in < args.N) {
|
||||
pdst[offset_dst] = x[offset_src];
|
||||
|
||||
offset_dst += ntg[0]*args.CHW*OH*OW;
|
||||
offset_src += ntg[0]*args.ofs0;
|
||||
|
||||
in += ntg[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
|
||||
|
||||
// TODO: optimize
|
||||
typedef void (im2col_ext_t)(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_im2col_ext(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1]
|
||||
const int64_t KHW = (int64_t)args.KHW;
|
||||
|
||||
const int64_t d = tgpig[0] / args.CHW;
|
||||
const int64_t chw = tgpig[0] % args.CHW;
|
||||
const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1)
|
||||
const int64_t HW = tgpig[0] % KHW;
|
||||
|
||||
const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
|
||||
if (tpitg_0 >= args.N) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t tpitg_1 = HW / args.KW;
|
||||
const int64_t tpitg_2 = HW % args.KW;
|
||||
|
||||
const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
|
||||
const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
|
||||
|
||||
const int64_t offset_dst =
|
||||
(tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
|
||||
(tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
|
||||
|
||||
device T * pdst = (device T *) (dst);
|
||||
|
||||
if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
pdst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
|
||||
pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
|
||||
template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
|
||||
|
||||
template <typename TK>
|
||||
kernel void kernel_conv_2d(
|
||||
constant ggml_metal_kargs_conv_2d & args,
|
||||
device const char * weights,
|
||||
device const char * src,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const uint threads_per_tg = ntg.x * ntg.y * ntg.z;
|
||||
const uint tg_index = (tgpig.z * tgpg.y + tgpig.y) * tgpg.x + tgpig.x;
|
||||
const uint local_thread = tpitg.z * (ntg.x * ntg.y) + tpitg.y * ntg.x + tpitg.x;
|
||||
const uint thread_index = tg_index * threads_per_tg + local_thread;
|
||||
const uint64_t total_threads = (uint64_t) threads_per_tg * tgpg.x * tgpg.y * tgpg.z;
|
||||
const uint64_t total_outputs = (uint64_t) args.N * args.OC * args.OH * args.OW;
|
||||
|
||||
for (uint64_t index = thread_index; index < total_outputs; index += total_threads) {
|
||||
uint64_t tmp = index;
|
||||
|
||||
const int32_t ow = tmp % args.OW; tmp /= args.OW;
|
||||
const int32_t oh = tmp % args.OH; tmp /= args.OH;
|
||||
const int32_t oc = tmp % args.OC; tmp /= args.OC;
|
||||
const int32_t n = tmp;
|
||||
|
||||
float acc = 0.0f;
|
||||
|
||||
const int32_t base_x = ow*args.s0 - args.p0;
|
||||
const int32_t base_y = oh*args.s1 - args.p1;
|
||||
|
||||
int32_t ky_start = 0;
|
||||
if (base_y < 0) {
|
||||
ky_start = (-base_y + args.d1 - 1)/args.d1;
|
||||
}
|
||||
int32_t ky_end = args.KH;
|
||||
const int32_t y_max = args.IH - 1 - base_y;
|
||||
if (y_max < 0) {
|
||||
ky_end = ky_start;
|
||||
} else if (base_y + (args.KH - 1)*args.d1 >= args.IH) {
|
||||
ky_end = min(ky_end, y_max/args.d1 + 1);
|
||||
}
|
||||
|
||||
int32_t kx_start = 0;
|
||||
if (base_x < 0) {
|
||||
kx_start = (-base_x + args.d0 - 1)/args.d0;
|
||||
}
|
||||
int32_t kx_end = args.KW;
|
||||
const int32_t x_max = args.IW - 1 - base_x;
|
||||
if (x_max < 0) {
|
||||
kx_end = kx_start;
|
||||
} else if (base_x + (args.KW - 1)*args.d0 >= args.IW) {
|
||||
kx_end = min(kx_end, x_max/args.d0 + 1);
|
||||
}
|
||||
|
||||
if (ky_start < ky_end && kx_start < kx_end) {
|
||||
const uint64_t src_base_n = (uint64_t) n * args.nb13;
|
||||
const uint64_t w_base_oc = (uint64_t) oc * args.nb03;
|
||||
|
||||
for (int32_t ic = 0; ic < args.IC; ++ic) {
|
||||
const uint64_t src_base_nc = src_base_n + (uint64_t) ic * args.nb12;
|
||||
const uint64_t w_base_ocic = w_base_oc + (uint64_t) ic * args.nb02;
|
||||
|
||||
for (int32_t ky = ky_start; ky < ky_end; ++ky) {
|
||||
const int32_t iy = base_y + ky*args.d1;
|
||||
const uint64_t src_base_row = src_base_nc + (uint64_t) iy * args.nb11;
|
||||
const uint64_t w_base_row = w_base_ocic + (uint64_t) ky * args.nb01;
|
||||
|
||||
for (int32_t kx = kx_start; kx < kx_end; ++kx) {
|
||||
const int32_t ix = base_x + kx*args.d0;
|
||||
const uint64_t src_offs = src_base_row + (uint64_t) ix * args.nb10;
|
||||
const uint64_t w_offs = w_base_row + (uint64_t) kx * args.nb00;
|
||||
|
||||
const float x = *(device const float *)(src + src_offs);
|
||||
const float w = (float) (*(device const TK *)(weights + w_offs));
|
||||
|
||||
acc += x * w;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const uint64_t dst_offs =
|
||||
(uint64_t) n * args.nb3 +
|
||||
(uint64_t) oc * args.nb2 +
|
||||
(uint64_t) oh * args.nb1 +
|
||||
(uint64_t) ow * args.nb0;
|
||||
|
||||
*(device float *)(dst + dst_offs) = acc;
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_conv_2d_f32_f32")]]
|
||||
kernel void kernel_conv_2d<float>(
|
||||
constant ggml_metal_kargs_conv_2d & args,
|
||||
device const char * weights,
|
||||
device const char * src,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template [[host_name("kernel_conv_2d_f16_f32")]]
|
||||
kernel void kernel_conv_2d<half>(
|
||||
constant ggml_metal_kargs_conv_2d & args,
|
||||
device const char * weights,
|
||||
device const char * src,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
typedef void (conv_transpose_1d_t)(
|
||||
constant ggml_metal_kargs_conv_transpose_1d & args,
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_conv_transpose_1d(
|
||||
constant ggml_metal_kargs_conv_transpose_1d & args,
|
||||
device const T * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]) {
|
||||
|
||||
// For output position j on the time axis, only input positions
|
||||
// i such that i*s0 <= j < i*s0 + K
|
||||
// contribute -- i.e. i in [ceil((j - K + 1)/s0), floor(j/s0)]
|
||||
// intersected with [0, IL-1]. That's at most ceil(K/s0) values
|
||||
// (typically 2 for stride==K/2 transposed convs).
|
||||
const int32_t j = tgpig[0];
|
||||
const int32_t s0 = args.s0;
|
||||
const int32_t K = args.K;
|
||||
const int32_t IL = args.IL;
|
||||
|
||||
int32_t i_min;
|
||||
{
|
||||
int32_t a = j - K + 1;
|
||||
i_min = a <= 0 ? 0 : (a + s0 - 1) / s0; // ceil(a/s0) for a>0
|
||||
}
|
||||
int32_t i_max = j / s0;
|
||||
if (i_max > IL - 1) i_max = IL - 1;
|
||||
|
||||
float v = 0.0f;
|
||||
if (i_min <= i_max) {
|
||||
for (int64_t c = 0; c < args.IC; c++) {
|
||||
const int32_t kernel_offset = c * tgpg[1] * K + K * tgpig[1];
|
||||
const int32_t input_offset = c * IL;
|
||||
|
||||
for (int32_t i = i_min; i <= i_max; i++) {
|
||||
v += float(src0[kernel_offset + j - i * s0]) * src1[input_offset + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
device float * dst_ptr = (device float *) (dst + tgpig[0] * args.nb0 + tgpig[1] * args.nb1);
|
||||
|
||||
dst_ptr[0] = v;
|
||||
}
|
||||
|
||||
template [[host_name("kernel_conv_transpose_1d_f32_f32")]]
|
||||
kernel void kernel_conv_transpose_1d<float>(
|
||||
constant ggml_metal_kargs_conv_transpose_1d & args,
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]);
|
||||
|
||||
template [[host_name("kernel_conv_transpose_1d_f16_f32")]]
|
||||
kernel void kernel_conv_transpose_1d<half>(
|
||||
constant ggml_metal_kargs_conv_transpose_1d & args,
|
||||
device const half * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]);
|
||||
|
||||
|
||||
typedef void (conv_transpose_2d_t)(
|
||||
constant ggml_metal_kargs_conv_transpose_2d & args,
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_conv_transpose_2d(
|
||||
constant ggml_metal_kargs_conv_transpose_2d & args,
|
||||
device const T * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
threadgroup float * shared_sum [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t out_x = tgpig[0];
|
||||
const int64_t out_y = tgpig[1];
|
||||
const int64_t out_c = tgpig[2];
|
||||
|
||||
const int64_t kw = tpitg[0];
|
||||
const int64_t kh = tpitg[1];
|
||||
|
||||
float v = 0.0f;
|
||||
|
||||
for (int64_t in_c = 0; in_c < args.IC; in_c++) {
|
||||
int64_t in_y = out_y - kh;
|
||||
|
||||
if (in_y < 0 || in_y % args.s0) continue;
|
||||
|
||||
in_y /= args.s0;
|
||||
|
||||
if (in_y >= args.IH) continue;
|
||||
|
||||
int64_t in_x = out_x - kw;
|
||||
|
||||
if (in_x < 0 || in_x % args.s0) continue;
|
||||
|
||||
in_x /= args.s0;
|
||||
|
||||
if (in_x >= args.IW) continue;
|
||||
|
||||
const int64_t input_idx = (args.IW * args.IH) * in_c + (args.IW) * in_y + in_x;
|
||||
const int64_t kernel_idx = (args.KH * args.KW * args.OC) * in_c + (args.KH * args.KW) * out_c + (args.KW) * kh + kw;
|
||||
|
||||
v += (float)src0[kernel_idx] * src1[input_idx];
|
||||
}
|
||||
|
||||
const uint tid = tpitg.y * ntg.x + tpitg.x;
|
||||
shared_sum[tid] = v;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tid == 0) {
|
||||
float total = 0.0f;
|
||||
const uint num_threads = ntg.x * ntg.y;
|
||||
for (uint i = 0; i < num_threads; i++) {
|
||||
total += shared_sum[i];
|
||||
}
|
||||
|
||||
device float * dst_ptr = (device float *) (dst + out_x*args.nb0 + out_y * args.nb1 + out_c*args.nb2);
|
||||
dst_ptr[0] = total;
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_conv_transpose_2d_f32_f32")]]
|
||||
kernel void kernel_conv_transpose_2d<float>(
|
||||
constant ggml_metal_kargs_conv_transpose_2d & args,
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
threadgroup float * shared_sum [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template [[host_name("kernel_conv_transpose_2d_f16_f32")]]
|
||||
kernel void kernel_conv_transpose_2d<half>(
|
||||
constant ggml_metal_kargs_conv_transpose_2d & args,
|
||||
device const half * src0,
|
||||
device const float * src1,
|
||||
device char * dst,
|
||||
threadgroup float * shared_sum [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_conv_3d(
|
||||
constant ggml_metal_kargs_conv_3d & args,
|
||||
device const char * src0, // Weights [IC * OC, KD, KH, KW]
|
||||
device const char * src1, // Inputs [IC * N, ID, IH, IW]
|
||||
device char * dst, // Outputs [OC * N, OD, OH, OW]
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]]) {
|
||||
|
||||
// 1. Un-flatten the spatial dimension from Grid X
|
||||
int64_t spatial_idx = tgpig.x * 32 + tpitg.x;
|
||||
|
||||
if (spatial_idx >= args.OW * args.OH * args.OD) {
|
||||
return; // Thread falls outside the spatial volume
|
||||
}
|
||||
|
||||
int64_t od = spatial_idx / (args.OW * args.OH);
|
||||
int64_t oh = (spatial_idx / args.OW) % args.OH;
|
||||
int64_t ow = spatial_idx % args.OW;
|
||||
|
||||
// 2. Map Y to Channels, Z to Batch
|
||||
int64_t oc = tgpig.y;
|
||||
int64_t batch_idx = tgpig.z;
|
||||
|
||||
// 3. Calculate anchor coordinates in the Input volume
|
||||
int64_t i_w_base = ow * args.s0 - args.p0;
|
||||
int64_t i_h_base = oh * args.s1 - args.p1;
|
||||
int64_t i_d_base = od * args.s2 - args.p2;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
// 4. Gather Loop (Iterate over Input Channels -> Depth -> Height -> Width)
|
||||
for (int64_t ic = 0; ic < args.IC; ++ic) {
|
||||
|
||||
// ggml packs batch and channel together in the 4th dimension
|
||||
int64_t src_cn_idx = batch_idx * args.IC + ic;
|
||||
int64_t w_cn_idx = oc * args.IC + ic;
|
||||
|
||||
for (int64_t kz = 0; kz < args.KD; ++kz) {
|
||||
int64_t id = i_d_base + kz * args.d2;
|
||||
if (id < 0 || id >= args.ID) continue; // Boundary check (Padding)
|
||||
|
||||
for (int64_t ky = 0; ky < args.KH; ++ky) {
|
||||
int64_t ih = i_h_base + ky * args.d1;
|
||||
if (ih < 0 || ih >= args.IH) continue;
|
||||
|
||||
for (int64_t kx = 0; kx < args.KW; ++kx) {
|
||||
int64_t iw = i_w_base + kx * args.d0;
|
||||
if (iw < 0 || iw >= args.IW) continue;
|
||||
|
||||
// Convert multi-dimensional coordinates to flat byte offsets
|
||||
int64_t w_idx = kx*args.nb00 + ky*args.nb01 + kz*args.nb02 + w_cn_idx*args.nb03;
|
||||
int64_t i_idx = iw*args.nb10 + ih*args.nb11 + id*args.nb12 + src_cn_idx*args.nb13;
|
||||
|
||||
// Dereference memory and cast weights to f32 if they were f16
|
||||
float w_val = (float)*(device const T*)((device const char*)src0 + w_idx);
|
||||
float i_val = *(device const float*)((device const char*)src1 + i_idx);
|
||||
|
||||
sum += w_val * i_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Write the accumulated value out to RAM
|
||||
int64_t dst_cn_idx = batch_idx * args.OC + oc;
|
||||
int64_t d_idx = ow*args.nb0 + oh*args.nb1 + od*args.nb2 + dst_cn_idx*args.nb3;
|
||||
|
||||
*(device float*)(dst + d_idx) = sum;
|
||||
}
|
||||
|
||||
// Explicit instantiations so the JIT compiler can find them by name
|
||||
template [[host_name("kernel_conv_3d_f32_f32")]]
|
||||
kernel void kernel_conv_3d<float>(
|
||||
constant ggml_metal_kargs_conv_3d & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]]);
|
||||
|
||||
// Explicit instantiation for f16 weights
|
||||
template [[host_name("kernel_conv_3d_f16_f32")]]
|
||||
kernel void kernel_conv_3d<half>(
|
||||
constant ggml_metal_kargs_conv_3d & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]]);
|
||||
@@ -0,0 +1,686 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define GGML_COMMON_DECL_METAL
|
||||
#define GGML_COMMON_IMPL_METAL
|
||||
#if defined(GGML_METAL_EMBED_LIBRARY)
|
||||
__embed_ggml-common.h__
|
||||
#else
|
||||
#include "ggml-common.h"
|
||||
#endif
|
||||
|
||||
#define QK_NL 16 // shared by mul_mm and get_rows_q instantiations
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
template <typename type4x4>
|
||||
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
|
||||
reg = (type4x4)(*src);
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_f32_t4(device const float4 * src, short il, thread type4 & reg) {
|
||||
reg = (type4)(*src);
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
|
||||
reg = (type4x4)(*src);
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_f16_t4(device const half4 * src, short il, thread type4 & reg) {
|
||||
reg = (type4)(*(src));
|
||||
}
|
||||
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template <typename type4x4>
|
||||
void dequantize_bf16(device const bfloat4x4 * src, short il, thread type4x4 & reg) {
|
||||
reg = (type4x4)(*src);
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_bf16_t4(device const bfloat4 * src, short il, thread type4 & reg) {
|
||||
reg = (type4)(*(src));
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q1_0(device const block_q1_0 * xb, short il, thread type4x4 & reg) {
|
||||
device const uint8_t * qs = xb->qs;
|
||||
const float d = xb->d;
|
||||
const float neg_d = -d;
|
||||
|
||||
const int byte_offset = il * 2; // il*16 bits = il*2 bytes
|
||||
const uint8_t b0 = qs[byte_offset];
|
||||
const uint8_t b1 = qs[byte_offset + 1];
|
||||
|
||||
float4x4 reg_f;
|
||||
|
||||
reg_f[0][0] = select(neg_d, d, bool(b0 & 0x01));
|
||||
reg_f[0][1] = select(neg_d, d, bool(b0 & 0x02));
|
||||
reg_f[0][2] = select(neg_d, d, bool(b0 & 0x04));
|
||||
reg_f[0][3] = select(neg_d, d, bool(b0 & 0x08));
|
||||
reg_f[1][0] = select(neg_d, d, bool(b0 & 0x10));
|
||||
reg_f[1][1] = select(neg_d, d, bool(b0 & 0x20));
|
||||
reg_f[1][2] = select(neg_d, d, bool(b0 & 0x40));
|
||||
reg_f[1][3] = select(neg_d, d, bool(b0 & 0x80));
|
||||
|
||||
reg_f[2][0] = select(neg_d, d, bool(b1 & 0x01));
|
||||
reg_f[2][1] = select(neg_d, d, bool(b1 & 0x02));
|
||||
reg_f[2][2] = select(neg_d, d, bool(b1 & 0x04));
|
||||
reg_f[2][3] = select(neg_d, d, bool(b1 & 0x08));
|
||||
reg_f[3][0] = select(neg_d, d, bool(b1 & 0x10));
|
||||
reg_f[3][1] = select(neg_d, d, bool(b1 & 0x20));
|
||||
reg_f[3][2] = select(neg_d, d, bool(b1 & 0x40));
|
||||
reg_f[3][3] = select(neg_d, d, bool(b1 & 0x80));
|
||||
|
||||
reg = (type4x4) reg_f;
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_q1_0_t4(device const block_q1_0 * xb, short il, thread type4 & reg) {
|
||||
const float d = xb->d;
|
||||
const float neg_d = -d;
|
||||
const int base = il * 4;
|
||||
const uint8_t byte = xb->qs[base / 8];
|
||||
const int s = base % 8;
|
||||
|
||||
float4 reg_f;
|
||||
reg_f[0] = select(neg_d, d, bool((byte >> (s )) & 1));
|
||||
reg_f[1] = select(neg_d, d, bool((byte >> (s + 1)) & 1));
|
||||
reg_f[2] = select(neg_d, d, bool((byte >> (s + 2)) & 1));
|
||||
reg_f[3] = select(neg_d, d, bool((byte >> (s + 3)) & 1));
|
||||
|
||||
reg = (type4) reg_f;
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_0(device const block_q4_0 * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.h * xb->d;
|
||||
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
float4x4 reg_f;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
reg_f[i/2][2*(i%2) + 0] = d1 * (qs[i] & mask0) + md;
|
||||
reg_f[i/2][2*(i%2) + 1] = d2 * (qs[i] & mask1) + md;
|
||||
}
|
||||
|
||||
reg = (type4x4) reg_f;
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
|
||||
const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.h * xb->d;
|
||||
const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + md;
|
||||
reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + md;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
||||
const float d1 = il ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb->m;
|
||||
const ushort mask0 = il ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
float4x4 reg_f;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
reg_f[i/2][2*(i%2) + 0] = ((qs[i] & mask0) * d1) + m;
|
||||
reg_f[i/2][2*(i%2) + 1] = ((qs[i] & mask1) * d2) + m;
|
||||
}
|
||||
|
||||
reg = (type4x4) reg_f;
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_q4_1_t4(device const block_q4_1 * xb, short il, thread type4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
|
||||
const float d1 = (il/4) ? (xb->d / 16.h) : xb->d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb->m;
|
||||
const ushort mask0 = (il/4) ? 0x00F0 : 0x000F;
|
||||
const ushort mask1 = mask0 << 8;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
reg[2*i + 0] = d1 * (qs[2*(il%4) + i] & mask0) + m;
|
||||
reg[2*i + 1] = d2 * (qs[2*(il%4) + i] & mask1) + m;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_0(device const block_q5_0 * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
|
||||
const float d = xb->d;
|
||||
const float md = -16.h * xb->d;
|
||||
const ushort mask = il ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = il ? 4 : 0;
|
||||
|
||||
const int gh_mv = il ? 12 : 0;
|
||||
const int gh_bk = il ? 0 : 4;
|
||||
|
||||
float4x4 reg_f;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg_f[i/2][2*(i%2) + 0] = d * x0 + md;
|
||||
reg_f[i/2][2*(i%2) + 1] = d * x1 + md;
|
||||
}
|
||||
|
||||
reg = (type4x4) reg_f;
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_q5_0_t4(device const block_q5_0 * xb, short il, thread type4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 3);
|
||||
const float d = xb->d;
|
||||
const float md = -16.h * xb->d;
|
||||
const ushort mask = (il/4) ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = (il/4) ? 4 : 0;
|
||||
|
||||
const int gh_mv = (il/4) ? 12 : 0;
|
||||
const int gh_bk = (il/4) ? 0 : 4;
|
||||
|
||||
for (int ii = 0; ii < 2; ii++) {
|
||||
int i = 2*(il%4) + ii;
|
||||
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg[2*ii + 0] = d * x0 + md;
|
||||
reg[2*ii + 1] = d * x1 + md;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_1(device const block_q5_1 * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 4);
|
||||
const float d = xb->d;
|
||||
const float m = xb->m;
|
||||
const ushort mask = il ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = il ? 4 : 0;
|
||||
|
||||
const int gh_mv = il ? 12 : 0;
|
||||
const int gh_bk = il ? 0 : 4;
|
||||
|
||||
float4x4 reg_f;
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg_f[i/2][2*(i%2) + 0] = d * x0 + m;
|
||||
reg_f[i/2][2*(i%2) + 1] = d * x1 + m;
|
||||
}
|
||||
|
||||
reg = (type4x4) reg_f;
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_q5_1_t4(device const block_q5_1 * xb, short il, thread type4 & reg) {
|
||||
device const uint16_t * qs = ((device const uint16_t *)xb + 4);
|
||||
const float d = xb->d;
|
||||
const float m = xb->m;
|
||||
const ushort mask = (il/4) ? 0x00F0 : 0x000F;
|
||||
|
||||
const uint32_t qh = *((device const uint32_t *)xb->qh);
|
||||
|
||||
const int x_mv = (il/4) ? 4 : 0;
|
||||
|
||||
const int gh_mv = (il/4) ? 12 : 0;
|
||||
const int gh_bk = (il/4) ? 0 : 4;
|
||||
|
||||
for (int ii = 0; ii < 2; ii++) {
|
||||
int i = 2*(il%4) + ii;
|
||||
|
||||
// extract the 5-th bits for x0 and x1
|
||||
const uint8_t xh_0 = ((qh >> (gh_mv + 2*i )) << gh_bk) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (gh_mv + 2*i+1)) << gh_bk) & 0x10;
|
||||
|
||||
// combine the 4-bits from qs with the 5th bit
|
||||
const int32_t x0 = ((((qs[i] ) & mask) >> x_mv) | xh_0);
|
||||
const int32_t x1 = ((((qs[i] >> 8) & mask) >> x_mv) | xh_1);
|
||||
|
||||
reg[2*ii + 0] = d * x0 + m;
|
||||
reg[2*ii + 1] = d * x1 + m;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q8_0(device const block_q8_0 *xb, short il, thread type4x4 & reg) {
|
||||
device const int8_t * qs = ((device const int8_t *)xb->qs);
|
||||
const float d = xb->d;
|
||||
|
||||
float4x4 reg_f;
|
||||
|
||||
for (int i = 0; i < 16; i++) {
|
||||
reg_f[i/4][i%4] = (qs[i + 16*il] * d);
|
||||
}
|
||||
|
||||
reg = (type4x4) reg_f;
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & reg) {
|
||||
device const int8_t * qs = ((device const int8_t *)xb->qs);
|
||||
const float d = xb->d;
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
reg[i] = (qs[4*(il%4) + i + 16*(il/4)] * d);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_mxfp4(device const block_mxfp4 * xb, short il, thread type4x4 & reg) {
|
||||
device const uint8_t * q2 = (device const uint8_t *)xb->qs;
|
||||
|
||||
const float d = e8m0_to_fp32(xb->e);
|
||||
const uint8_t shr = il >= 1 ? 4 : 0;
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[i][0] = d * kvalues_mxfp4_f[(q2[4*i + 0] >> shr) & 0x0F];
|
||||
reg[i][1] = d * kvalues_mxfp4_f[(q2[4*i + 1] >> shr) & 0x0F];
|
||||
reg[i][2] = d * kvalues_mxfp4_f[(q2[4*i + 2] >> shr) & 0x0F];
|
||||
reg[i][3] = d * kvalues_mxfp4_f[(q2[4*i + 3] >> shr) & 0x0F];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_mxfp4_t4(device const block_mxfp4 * xb, short il, thread type4 & reg) {
|
||||
device const uint8_t * q2 = (device const uint8_t *)xb->qs;
|
||||
|
||||
const float d = e8m0_to_fp32(xb->e);
|
||||
const short il4 = il%4;
|
||||
|
||||
const uint8_t shr = il >= 4 ? 4 : 0;
|
||||
|
||||
reg[0] = d * kvalues_mxfp4_f[(q2[4*il4 + 0] >> shr) & 0x0F];
|
||||
reg[1] = d * kvalues_mxfp4_f[(q2[4*il4 + 1] >> shr) & 0x0F];
|
||||
reg[2] = d * kvalues_mxfp4_f[(q2[4*il4 + 2] >> shr) & 0x0F];
|
||||
reg[3] = d * kvalues_mxfp4_f[(q2[4*il4 + 3] >> shr) & 0x0F];
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) {
|
||||
const float d = xb->d;
|
||||
const float min = xb->dmin;
|
||||
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
||||
float dl, ml;
|
||||
uint8_t sc = xb->scales[il];
|
||||
|
||||
q = q + 32*(il/8) + 16*(il&1);
|
||||
il = (il/2)%4;
|
||||
|
||||
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) {
|
||||
const half d_all = xb->d;
|
||||
device const uint8_t * q = (device const uint8_t *)xb->qs;
|
||||
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
q = q + 32 * (il/8) + 16 * (il&1);
|
||||
h = h + 16 * (il&1);
|
||||
uint8_t m = 1 << (il/2);
|
||||
uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \
|
||||
((il/4)>0 ? 12 : 3);
|
||||
uint16_t kmask2 = il/8 ? 0xF0 : 0x0F;
|
||||
uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4];
|
||||
int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2)
|
||||
: (scale_2&kmask2) | ((scale_1&kmask1) << 4);
|
||||
float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f);
|
||||
const float ml = 4.f * dl;
|
||||
|
||||
il = (il/2) & 3;
|
||||
const half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
const uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl *= coef;
|
||||
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
||||
}
|
||||
}
|
||||
|
||||
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
||||
return j < 4 ? uchar2{uchar(q[j+0+k] & 63), uchar(q[j+4+k] & 63)}
|
||||
: uchar2{uchar((q[j+4+k] & 0xF) | ((q[j-4+k] & 0xc0) >> 2)), uchar((q[j+4+k] >> 4) | ((q[j-0+k] & 0xc0) >> 2))};
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q4_K(device const block_q4_K * xb, short il, thread type4x4 & reg) {
|
||||
device const uchar * q = xb->qs;
|
||||
|
||||
short is = (il/4) * 2;
|
||||
q = q + (il/4) * 32 + 16 * (il&1);
|
||||
il = il & 3;
|
||||
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
||||
const float d = il < 2 ? xb->d : xb->d / 16.h;
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
|
||||
const ushort mask = il < 2 ? 0x0F : 0xF0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) {
|
||||
device const uint8_t * q = xb->qs;
|
||||
device const uint8_t * qh = xb->qh;
|
||||
|
||||
short is = (il/4) * 2;
|
||||
q = q + 32 * (il/4) + 16 * (il&1);
|
||||
qh = qh + 16 * (il&1);
|
||||
uint8_t ul = 1 << (il/2);
|
||||
il = il & 3;
|
||||
const uchar2 sc = get_scale_min_k4_just2(is, il/2, xb->scales);
|
||||
const float d = il < 2 ? xb->d : xb->d / 16.f;
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
const float qh_val = il<2 ? 16.f : 256.f;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) {
|
||||
const half d_all = xb->d;
|
||||
device const uint16_t * ql = (device const uint16_t *)xb->ql;
|
||||
device const uint16_t * qh = (device const uint16_t *)xb->qh;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
ql = ql + 32*(il/8) + 16*((il/2)&1) + 8*(il&1);
|
||||
qh = qh + 16*(il/8) + 8*(il&1);
|
||||
float sc = scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint32_t kmask1 = il>1 ? (il>2 ? 0xC0C0C0C0 : 0x30303030) : (il>0 ? 0x0C0C0C0C : 0x03030303);
|
||||
const uint32_t kmask2 = il>1 ? 0xF0F0F0F0 : 0x0F0F0F0F;
|
||||
const float ml = d_all * sc * 32.f;
|
||||
const float dl0 = d_all * sc;
|
||||
const float dl1 = dl0 / 256.f;
|
||||
const float dl2 = dl0 / (256.f * 256.f);
|
||||
const float dl3 = dl0 / (256.f * 256.f * 256.f);
|
||||
const uint8_t shr_h = il>2 ? 2 : 0;
|
||||
const uint8_t shl_h = il>1 ? 0 : (il>0 ? 2 : 4);
|
||||
const uint8_t shr_l = il>1 ? 4 : 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
const uint32_t low = (ql[2*i] | (uint32_t)(ql[2*i+1] << 16)) & kmask2;
|
||||
const uint32_t high = (qh[2*i] | (uint32_t)(qh[2*i+1] << 16)) & kmask1;
|
||||
const uint32_t q = ((high << shl_h) >> shr_h) | (low >> shr_l);
|
||||
reg[i][0] = dl0 * ((half)(q & 0xFF)) - ml;
|
||||
reg[i][1] = dl1 * ((float)(q & 0xFF00)) - ml;
|
||||
reg[i][2] = dl2 * ((float)(q & 0xFF0000)) - ml;
|
||||
reg[i][3] = dl3 * ((float)(q & 0xFF000000)) - ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
// each block of 32 needs 2 uint32_t's for the quants & scale, so 4 uint16_t's.
|
||||
device const uint16_t * q2 = xb->qs + 4*ib32;
|
||||
const uint32_t aux32_g = q2[0] | (q2[1] << 16);
|
||||
const uint32_t aux32_s = q2[2] | (q2[3] << 16);
|
||||
thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g;
|
||||
const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f;
|
||||
constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||
signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint16_t * q2 = xb->qs + 4*ib32;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511));
|
||||
uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511));
|
||||
signs = ksigns_iq2xs[q2[2*il+1] >> 9];
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * q3 = xb->qs + 8*ib32;
|
||||
device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
|
||||
grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
|
||||
signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 8*ib32;
|
||||
device const uint8_t * signs = xb->signs + 4*ib32 + 2*il;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf));
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]);
|
||||
reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256)));
|
||||
grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]);
|
||||
reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * signs = qs + QK_K/8;
|
||||
const uint8_t qh = xb->qh[ib32] >> 4*il;
|
||||
const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq2s_grid + (qs[0] | ((qh << 8) & 0x300)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq2s_grid + (qs[1] | ((qh << 6) & 0x300)));
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
reg[i/4+0][i%4] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i]);
|
||||
reg[i/4+2][i%4] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
const float d = xb->d;
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint16_t * qh = xb->qh;
|
||||
const float dl = d * (2*((qh[ib32] >> 12) & 7) + 1);
|
||||
const float ml = dl * (qh[ib32] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA);
|
||||
const uint16_t h = qh[ib32] >> 6*il;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((h << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((h << 5) & 0x700)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * (grid1[i] & 0xf) + ml;
|
||||
reg[1][i] = dl * (grid1[i] >> 4) + ml;
|
||||
reg[2][i] = dl * (grid2[i] & 0xf) + ml;
|
||||
reg[3][i] = dl * (grid2[i] >> 4) + ml;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
device const uint16_t * sc = (device const uint16_t *)xb->scales;
|
||||
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const float d = scale.f16;
|
||||
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * qh = xb->qh + 2*ib32 + il;
|
||||
|
||||
const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
|
||||
const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * (grid1[i] & 0xf) + ml1;
|
||||
reg[1][i] = dl * (grid1[i] >> 4) + ml1;
|
||||
reg[2][i] = dl * (grid2[i] & 0xf) + ml2;
|
||||
reg[3][i] = dl * (grid2[i] >> 4) + ml2;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) {
|
||||
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
||||
const float d = xb->d;
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = ((q4[2*i] | (q4[2*i+1] << 16)) >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename type4>
|
||||
void dequantize_iq4_nl_t4(device const block_iq4_nl * xb, short il, thread type4 & reg) {
|
||||
device const uint16_t * q4 = (device const uint16_t *)xb->qs;
|
||||
const float d = xb->d;
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
aux32 = ((q4[2*(il%4)] | (q4[2*(il%4)+1] << 16)) >> 4*(il/4)) & 0x0f0f0f0f;
|
||||
reg[0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint32_t * q4 = (device const uint32_t *)xb->qs + 4*ib32;
|
||||
const int ls = ((xb->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((xb->scales_h >> 2*ib32) & 3) << 4);
|
||||
const float d = (float)xb->d * (ls - 32);
|
||||
uint32_t aux32;
|
||||
thread const uint8_t * q8 = (thread const uint8_t *)&aux32;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
aux32 = (q4[i] >> 4*il) & 0x0f0f0f0f;
|
||||
reg[i][0] = d * kvalues_iq4nl_f[q8[0]];
|
||||
reg[i][1] = d * kvalues_iq4nl_f[q8[1]];
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,250 @@
|
||||
#include "common.h"
|
||||
|
||||
constant short FC_gated_delta_net_ne20 [[function_constant(FC_GATED_DELTA_NET + 0)]];
|
||||
constant short FC_gated_delta_net_ne30 [[function_constant(FC_GATED_DELTA_NET + 1)]];
|
||||
constant short FC_gated_delta_net_K [[function_constant(FC_GATED_DELTA_NET + 2)]];
|
||||
|
||||
#if 1
|
||||
template<short NSG>
|
||||
kernel void kernel_gated_delta_net_impl(
|
||||
constant ggml_metal_kargs_gated_delta_net & args,
|
||||
device const char * q,
|
||||
device const char * k,
|
||||
device const char * v,
|
||||
device const char * g,
|
||||
device const char * b,
|
||||
device const char * s,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
#define S_v FC_gated_delta_net_ne20
|
||||
#define G FC_gated_delta_net_ne30
|
||||
#define K FC_gated_delta_net_K
|
||||
|
||||
const uint tx = tpitg.x;
|
||||
const uint ty = tpitg.y;
|
||||
|
||||
const uint i23 = tgpig.z; // B (n_seqs)
|
||||
const uint i21 = tgpig.y; // H (head)
|
||||
const uint i20 = tgpig.x*NSG + ty; // row within S_v
|
||||
|
||||
const uint i01 = i21 % args.ne01;
|
||||
const uint i11 = i21 % args.ne11;
|
||||
|
||||
const float scale = 1.0f / sqrt((float)S_v);
|
||||
|
||||
// input state layout [S_v, S_v, H, n_seqs] (s0 only): per-seq stride is H*D.
|
||||
// state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
|
||||
const uint state_in_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||
device const float * s_ptr = (device const float *) (s) + state_in_base;
|
||||
|
||||
float ls[NSG];
|
||||
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
ls[j] = s_ptr[is];
|
||||
}
|
||||
|
||||
device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;
|
||||
|
||||
device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01);
|
||||
device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11);
|
||||
device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21);
|
||||
|
||||
device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
|
||||
device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;
|
||||
|
||||
// snapshot slot mapping: slot 0 = most recent state, slot s = s tokens back.
|
||||
// When n_tokens < K, only slots 0..n_tokens-1 are written; older slots are caller-owned.
|
||||
|
||||
// output state base offset: after attention scores
|
||||
const uint attn_size = args.ne22 * args.ne21 * S_v * args.ne23;
|
||||
// output state per-slot size: S_v * S_v * H * n_seqs
|
||||
const uint state_size_per_snap = S_v * S_v * args.ne21 * args.ne23;
|
||||
// per-(seq,head) offset within a slot
|
||||
const uint state_out_base = (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;
|
||||
|
||||
for (short t = 0; t < args.ne22; t++) {
|
||||
float s_k = 0.0f;
|
||||
|
||||
if (G == 1) {
|
||||
const float g_exp = exp(g_ptr[0]);
|
||||
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
ls[j] *= g_exp;
|
||||
|
||||
s_k += ls[j]*k_ptr[is];
|
||||
}
|
||||
} else {
|
||||
// KDA
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
ls[j] *= exp(g_ptr[is]);
|
||||
|
||||
s_k += ls[j]*k_ptr[is];
|
||||
}
|
||||
}
|
||||
|
||||
s_k = simd_sum(s_k);
|
||||
|
||||
const float d = (v_ptr[i20] - s_k)*b_ptr[0];
|
||||
|
||||
float y = 0.0f;
|
||||
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
ls[j] += k_ptr[is]*d;
|
||||
|
||||
y += ls[j]*q_ptr[is];
|
||||
}
|
||||
|
||||
y = simd_sum(y);
|
||||
|
||||
if (tx == 0) {
|
||||
dst_attn[t*args.ne21*S_v] = y*scale;
|
||||
}
|
||||
|
||||
q_ptr += args.ns02;
|
||||
k_ptr += args.ns12;
|
||||
v_ptr += args.ns22;
|
||||
|
||||
b_ptr += args.ne21;
|
||||
g_ptr += args.ne21*G;
|
||||
|
||||
if (K > 1) {
|
||||
const int target_slot = (int)args.ne22 - 1 - (int)t;
|
||||
if (target_slot >= 0 && target_slot < (int)K) {
|
||||
device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base;
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
dst_state[is] = ls[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (K == 1) {
|
||||
device float * dst_state = (device float *) (dst) + attn_size + state_out_base;
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
dst_state[is] = ls[j];
|
||||
}
|
||||
}
|
||||
|
||||
#undef S_v
|
||||
#undef G
|
||||
#undef K
|
||||
}
|
||||
|
||||
typedef decltype(kernel_gated_delta_net_impl<4>) kernel_gated_delta_net_t;
|
||||
|
||||
template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<1>;
|
||||
template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<2>;
|
||||
template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<4>;
|
||||
|
||||
#else
|
||||
// a simplified version of the above
|
||||
// no performance improvement, so keep the above version for now
|
||||
|
||||
template<typename T, short NSG>
|
||||
kernel void kernel_gated_delta_net_impl(
|
||||
constant ggml_metal_kargs_gated_delta_net & args,
|
||||
device const char * q,
|
||||
device const char * k,
|
||||
device const char * v,
|
||||
device const char * g,
|
||||
device const char * b,
|
||||
device const char * s,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
#define S_v FC_gated_delta_net_ne20
|
||||
#define G FC_gated_delta_net_ne30
|
||||
|
||||
const uint tx = tpitg.x;
|
||||
const uint ty = tpitg.y;
|
||||
|
||||
const uint i23 = tgpig.z; // B
|
||||
const uint i21 = tgpig.y; // H
|
||||
const uint i20 = tgpig.x*NSG + ty;
|
||||
|
||||
const uint i01 = i21 % args.ne01;
|
||||
const uint i11 = i21 % args.ne11;
|
||||
|
||||
const float scale = 1.0f / sqrt((float)S_v);
|
||||
|
||||
device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20;
|
||||
|
||||
float lsf[NSG];
|
||||
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
lsf[j] = s_ptr[is*S_v];
|
||||
}
|
||||
|
||||
thread T * ls = (thread T *) (lsf);
|
||||
|
||||
device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;
|
||||
|
||||
device const float * q_ptr = (device const float *) (q + i23*args.nb03 + i01*args.nb01);
|
||||
device const float * k_ptr = (device const float *) (k + i23*args.nb13 + i11*args.nb11);
|
||||
device const float * v_ptr = (device const float *) (v + i23*args.nb23 + i21*args.nb21);
|
||||
|
||||
device const float * b_ptr = (device const float *) (b) + (i23*args.ne22*args.ne21 + i21);
|
||||
device const float * g_ptr = (device const float *) (g) + (i23*args.ne22*args.ne21 + i21)*G;
|
||||
|
||||
for (short t = 0; t < args.ne22; t++) {
|
||||
device const T * qt_ptr = (device const T *) (q_ptr);
|
||||
device const T * kt_ptr = (device const T *) (k_ptr);
|
||||
device const T * gt_ptr = (device const T *) (g_ptr);
|
||||
|
||||
if (G == 1) {
|
||||
*ls *= exp(g_ptr[0]);
|
||||
} else {
|
||||
// KDA
|
||||
*ls *= exp(gt_ptr[tx]);
|
||||
}
|
||||
|
||||
const float s_k = simd_sum(dot(*ls, kt_ptr[tx]));
|
||||
|
||||
const float d = (v_ptr[i20] - s_k)*b_ptr[0];
|
||||
|
||||
*ls += kt_ptr[tx]*d;
|
||||
|
||||
const float y = simd_sum(dot(*ls, qt_ptr[tx]));
|
||||
|
||||
if (tx == 0) {
|
||||
*dst_attn = y*scale;
|
||||
}
|
||||
|
||||
q_ptr += args.ns02;
|
||||
k_ptr += args.ns12;
|
||||
v_ptr += args.ns22;
|
||||
|
||||
b_ptr += args.ne21;
|
||||
g_ptr += args.ne21*G;
|
||||
|
||||
dst_attn += args.ne21*S_v;
|
||||
}
|
||||
|
||||
device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20;
|
||||
device T * dstt_state = (device T *) (dst_state);
|
||||
|
||||
FOR_UNROLL (short j = 0; j < NSG; j++) {
|
||||
const short is = tx*NSG + j;
|
||||
dst_state[is*S_v] = lsf[j];
|
||||
}
|
||||
|
||||
#undef S_v
|
||||
#undef G
|
||||
}
|
||||
|
||||
typedef decltype(kernel_gated_delta_net_impl<float4, 4>) kernel_gated_delta_net_t;
|
||||
|
||||
template [[host_name("kernel_gated_delta_net_f32_1")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float, 1>;
|
||||
template [[host_name("kernel_gated_delta_net_f32_2")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float2, 2>;
|
||||
template [[host_name("kernel_gated_delta_net_f32_4")]] kernel kernel_gated_delta_net_t kernel_gated_delta_net_impl<float4, 4>;
|
||||
#endif
|
||||
@@ -0,0 +1,347 @@
|
||||
#include "common.h"
|
||||
|
||||
kernel void kernel_argmax_f32(
|
||||
constant ggml_metal_kargs_argmax & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float * x_row = (device const float *) ((device const char *) src0 + tgpig * args.nb01);
|
||||
|
||||
float lmax = -INFINITY;
|
||||
int32_t larg = -1;
|
||||
|
||||
for (int i00 = tpitg; i00 < args.ne00; i00 += ntg) {
|
||||
if (x_row[i00] > lmax) {
|
||||
lmax = x_row[i00];
|
||||
larg = i00;
|
||||
}
|
||||
}
|
||||
|
||||
// find the argmax value in the block
|
||||
float max_val = simd_max(lmax);
|
||||
int32_t arg_val = simd_max(select(-1, larg, lmax == max_val));
|
||||
|
||||
device int32_t * dst_i32 = (device int32_t *) dst;
|
||||
|
||||
threadgroup float * shared_maxval = (threadgroup float *) shmem;
|
||||
threadgroup int32_t * shared_argmax = (threadgroup int32_t *) shmem + N_SIMDWIDTH;
|
||||
|
||||
if (ntg > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
shared_maxval[tiisg] = -INFINITY;
|
||||
shared_argmax[tiisg] = -1;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shared_maxval[sgitg] = max_val;
|
||||
shared_argmax[sgitg] = arg_val;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
max_val = shared_maxval[tiisg];
|
||||
arg_val = shared_argmax[tiisg];
|
||||
|
||||
float max_val_reduced = simd_max(max_val);
|
||||
int32_t arg_val_reduced = simd_max(select(-1, arg_val, max_val == max_val_reduced));
|
||||
|
||||
dst_i32[tgpig] = arg_val_reduced;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
dst_i32[tgpig] = arg_val;
|
||||
}
|
||||
|
||||
kernel void kernel_diag_f32(
|
||||
constant ggml_metal_kargs_diag & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]]) {
|
||||
constexpr short NW = N_SIMDWIDTH;
|
||||
|
||||
const int32_t i3 = tgpig.z;
|
||||
const int32_t i2 = tgpig.y;
|
||||
const int32_t i1 = tgpig.x;
|
||||
|
||||
device const float * src0_ptr = (device const float *)(src0 + i2*args.nb02 + i3*args.nb03);
|
||||
device float * dst_ptr = (device float *)(dst + i1*args.nb01 + i2*args.nb2 + i3*args.nb3);
|
||||
|
||||
for (int i0 = tiitg; i0 < args.ne0; i0 += NW) {
|
||||
dst_ptr[i0] = i0 == i1 ? src0_ptr[i0] : 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_roll_f32(
|
||||
constant ggml_metal_kargs_roll & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
device const float * src0_ptr = (device const float *) src0;
|
||||
device float * dst_ptr = (device float *) dst;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
// apply shifts and wrap around
|
||||
int64_t i00 = i0 - args.s0;
|
||||
int64_t i01 = i1 - args.s1;
|
||||
int64_t i02 = i2 - args.s2;
|
||||
int64_t i03 = i3 - args.s3;
|
||||
|
||||
if (i00 < 0) { i00 += args.ne00; } else if (i00 >= args.ne00) { i00 -= args.ne00; }
|
||||
if (i01 < 0) { i01 += args.ne01; } else if (i01 >= args.ne01) { i01 -= args.ne01; }
|
||||
if (i02 < 0) { i02 += args.ne02; } else if (i02 >= args.ne02) { i02 -= args.ne02; }
|
||||
if (i03 < 0) { i03 += args.ne03; } else if (i03 >= args.ne03) { i03 -= args.ne03; }
|
||||
|
||||
int64_t src_idx = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00 + i00;
|
||||
int64_t dst_idx = i3 *args.ne2 *args.ne1 *args.ne0 + i2 *args.ne1 *args.ne0 + i1 *args.ne0 + i0;
|
||||
|
||||
dst_ptr[dst_idx] = src0_ptr[src_idx];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_pad_impl(
|
||||
constant ggml_metal_kargs_pad & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int32_t i3 = tgpig.z;
|
||||
const int32_t i2 = tgpig.y;
|
||||
const int32_t k0 = tgpig.x/args.ne1;
|
||||
const int32_t i1 = tgpig.x - k0*args.ne1;
|
||||
|
||||
const int32_t i03 = i3;
|
||||
const int32_t i02 = i2;
|
||||
const int32_t i01 = i1;
|
||||
|
||||
device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);
|
||||
|
||||
for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) {
|
||||
const int32_t i0 = k0*1024 + tpitg.x + l0;
|
||||
if (i0 >= args.ne0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
|
||||
dst_ptr[i0] = src0_ptr[i0];
|
||||
} else {
|
||||
dst_ptr[i0] = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_pad_impl<float>) kernel_pad_t;
|
||||
|
||||
template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl<float>;
|
||||
template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl<float4>;
|
||||
|
||||
// TODO: this is slow - optimize
|
||||
kernel void kernel_pad_reflect_1d_f32(
|
||||
constant ggml_metal_kargs_pad_reflect_1d & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i01 = i1;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);
|
||||
|
||||
if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
if (i0 < args.p0) {
|
||||
dst_ptr[i0] = src0_ptr[args.p0 - i0];
|
||||
} else if (i0 < args.ne0 - args.p1) {
|
||||
dst_ptr[i0] = src0_ptr[i0 - args.p0];
|
||||
} else {
|
||||
dst_ptr[i0] = src0_ptr[(args.ne0 - args.p1 - args.p0) - (args.p1 + 1 - (args.ne0 - i0)) - 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_arange_f32(
|
||||
constant ggml_metal_kargs_arange & args,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
device float * dst_ptr = (device float *) dst;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
dst_ptr[i0] = args.start + args.step * i0;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_timestep_embedding_f32(
|
||||
constant ggml_metal_kargs_timestep_embedding & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
int i = tgpig.x;
|
||||
device float * embed_data = (device float *)(dst + i*args.nb1);
|
||||
|
||||
int half_ = args.dim / 2;
|
||||
for (int j = tpitg.x; j < half_; j += ntg.x) {
|
||||
float timestep = ((device float *)src0)[i];
|
||||
float freq = (float)exp(-log((float)args.max_period) * j / half_);
|
||||
float arg = timestep * freq;
|
||||
embed_data[j ] = cos(arg);
|
||||
embed_data[j + half_] = sin(arg);
|
||||
}
|
||||
|
||||
if (args.dim % 2 != 0 && tpitg.x == 0) {
|
||||
embed_data[2 * half_] = 0.f;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_opt_step_adamw_f32(
|
||||
constant ggml_metal_kargs_opt_step_adamw & args,
|
||||
device float * x,
|
||||
device const float * g,
|
||||
device float * g_m,
|
||||
device float * g_v,
|
||||
device const float * pars,
|
||||
uint gid[[thread_position_in_grid]]) {
|
||||
|
||||
if (gid >= args.np) {
|
||||
return;
|
||||
}
|
||||
|
||||
const float alpha = pars[0];
|
||||
const float beta1 = pars[1];
|
||||
const float beta2 = pars[2];
|
||||
const float eps = pars[3];
|
||||
const float wd = pars[4];
|
||||
const float beta1h = pars[5];
|
||||
const float beta2h = pars[6];
|
||||
|
||||
const float gi = g[gid];
|
||||
const float gmi = g_m[gid] * beta1 + gi * (1.0f - beta1);
|
||||
const float gvi = g_v[gid] * beta2 + gi * gi * (1.0f - beta2);
|
||||
|
||||
g_m[gid] = gmi;
|
||||
g_v[gid] = gvi;
|
||||
|
||||
const float mh = gmi * beta1h;
|
||||
const float vh = sqrt(gvi * beta2h) + eps;
|
||||
|
||||
x[gid] = x[gid] * (1.0f - alpha * wd) - alpha * mh / vh;
|
||||
}
|
||||
|
||||
kernel void kernel_opt_step_sgd_f32(
|
||||
constant ggml_metal_kargs_opt_step_sgd & args,
|
||||
device float * x,
|
||||
device const float * g,
|
||||
device const float * pars,
|
||||
uint gid[[thread_position_in_grid]]) {
|
||||
|
||||
if (gid >= args.np) {
|
||||
return;
|
||||
}
|
||||
|
||||
x[gid] = x[gid] * (1.0f - pars[0] * pars[1]) - pars[0] * g[gid];
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_memset(
|
||||
constant ggml_metal_kargs_memset & args,
|
||||
device T * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = args.val;
|
||||
}
|
||||
|
||||
typedef decltype(kernel_memset<int64_t>) kernel_memset_t;
|
||||
|
||||
template [[host_name("kernel_memset_i64")]] kernel kernel_memset_t kernel_memset<int64_t>;
|
||||
|
||||
constant short FC_count_equal_nsg [[function_constant(FC_COUNT_EQUAL + 0)]];
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_count_equal(
|
||||
constant ggml_metal_kargs_count_equal & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device atomic_int * dst,
|
||||
threadgroup int32_t * shmem_i32 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const short NSG = FC_count_equal_nsg;
|
||||
|
||||
const int i3 = tgpig.z;
|
||||
const int i2 = tgpig.y;
|
||||
const int i1 = tgpig.x;
|
||||
|
||||
if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int sum = 0;
|
||||
|
||||
device const char * base0 = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03;
|
||||
device const char * base1 = src1 + i1*args.nb11 + i2*args.nb12 + i3*args.nb13;
|
||||
|
||||
for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
|
||||
const T v0 = *(device const T *)(base0 + i0*args.nb00);
|
||||
const T v1 = *(device const T *)(base1 + i0*args.nb10);
|
||||
sum += (v0 == v1);
|
||||
}
|
||||
|
||||
sum = simd_sum(sum);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shmem_i32[sgitg] = sum;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (sgitg == 0) {
|
||||
float v = 0.0f;
|
||||
if (tpitg.x < NSG) {
|
||||
v = shmem_i32[tpitg.x];
|
||||
}
|
||||
|
||||
float total = simd_sum(v);
|
||||
if (tpitg.x == 0) {
|
||||
atomic_fetch_add_explicit(dst, (int32_t) total, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_count_equal<int32_t>) kernel_count_equal_t;
|
||||
|
||||
template [[host_name("kernel_count_equal_i32")]] kernel kernel_count_equal_t kernel_count_equal<int32_t>;
|
||||
@@ -0,0 +1,838 @@
|
||||
#include "common.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]];
|
||||
constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]];
|
||||
constant short FC_mul_mm_ne12 [[function_constant(FC_MUL_MM + 2)]];
|
||||
constant short FC_mul_mm_ne13 [[function_constant(FC_MUL_MM + 3)]];
|
||||
constant short FC_mul_mm_r2 [[function_constant(FC_MUL_MM + 4)]];
|
||||
constant short FC_mul_mm_r3 [[function_constant(FC_MUL_MM + 5)]];
|
||||
|
||||
// each block_q contains 16*nl weights
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
template<
|
||||
typename SA, typename SA_4x4, typename SA_8x8,
|
||||
typename SB, typename SB_2x4, typename SB_8x8,
|
||||
typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread SA_4x4 &),
|
||||
typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||
kernel void kernel_mul_mm(
|
||||
constant ggml_metal_kargs_mul_mm & args,
|
||||
device const char * srcA,
|
||||
device const char * srcB,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig [[threadgroup_position_in_grid]],
|
||||
ushort tiitg [[thread_index_in_threadgroup]],
|
||||
ushort sgitg [[simdgroup_index_in_threadgroup]]) {
|
||||
(void) sgitg;
|
||||
|
||||
// Matrix dimensions: A(M,K) x B(K,N) -> C(M,N)
|
||||
const int K = args.ne00;
|
||||
const int M = args.ne0;
|
||||
const int N = args.ne1;
|
||||
|
||||
// Batch dimension handling
|
||||
const int im = tgpig.z;
|
||||
const int i12 = im % FC_mul_mm_ne12;
|
||||
const int i13 = im / FC_mul_mm_ne12;
|
||||
|
||||
// Batch offsets for srcA and srcB
|
||||
const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03;
|
||||
|
||||
// Tile dimensions
|
||||
constexpr int NRB = SZ_SIMDGROUP * N_MM_BLOCK_X * N_MM_SIMD_GROUP_X;
|
||||
constexpr int NRA = SZ_SIMDGROUP * N_MM_BLOCK_Y * N_MM_SIMD_GROUP_Y;
|
||||
|
||||
// Tile offsets in output matrix
|
||||
const int ra = tgpig.y * NRA;
|
||||
const int rb = tgpig.x * NRB;
|
||||
|
||||
// Threadgroup memory for dequantized A tile only
|
||||
threadgroup SA * sa = (threadgroup SA *)(shmem);
|
||||
|
||||
// Work-item count for A loading
|
||||
constexpr int A_WORK_ITEMS = NRA * N_MM_NK;
|
||||
constexpr int NUM_THREADS = N_SIMDWIDTH * N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y;
|
||||
|
||||
// tA wraps threadgroup memory
|
||||
auto tA = tensor(sa, dextents<int32_t, 2>(N_MM_NK_TOTAL, NRA));
|
||||
|
||||
// tB wraps device memory directly
|
||||
device T1 * ptrB = (device T1 *)(srcB + args.nb12*i12 + args.nb13*i13);
|
||||
const int strideB = args.nb11 / sizeof(T1);
|
||||
auto tB = tensor(ptrB, dextents<int32_t, 2>(K, N), array<int, 2>({1, strideB}));
|
||||
|
||||
// Configure matmul operation
|
||||
mpp::tensor_ops::matmul2d<
|
||||
mpp::tensor_ops::matmul2d_descriptor(
|
||||
NRB, NRA, N_MM_NK_TOTAL, false, true, true,
|
||||
mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
|
||||
execution_simdgroups<N_MM_SIMD_GROUP_X * N_MM_SIMD_GROUP_Y>> mm;
|
||||
|
||||
auto cT = mm.get_destination_cooperative_tensor<decltype(tB), decltype(tA), float>();
|
||||
|
||||
// Accumulate partial results over K dimension
|
||||
for (int loop_k = 0; loop_k < K; loop_k += N_MM_NK_TOTAL) {
|
||||
// === PHASE 1: Dequantization of A into threadgroup memory ===
|
||||
for (int work = tiitg; work < A_WORK_ITEMS; work += NUM_THREADS) {
|
||||
const int row = work / N_MM_NK;
|
||||
const int k_chunk = work % N_MM_NK;
|
||||
const int k_pos = loop_k + k_chunk * 16;
|
||||
const short k_base = k_chunk * 16;
|
||||
|
||||
// Bounds check: skip device read if row is out of matrix bounds
|
||||
if (ra + row < M) {
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
// Element-wise reads when K is not aligned (nb01 not aligned for half4x4/float4x4).
|
||||
// MSL spec Table 2.5: half4x4 requires 8-byte alignment. When K is odd,
|
||||
// nb01 = K*2 is not 8-byte aligned, so odd-row pointers are misaligned.
|
||||
// Mirrors the legacy kernel's existing guard.
|
||||
device const T0 * row_ptr = (device const T0 *)(srcA + args.nb01 * (ra + row) + offset0);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? (SA) row_ptr[k_pos + i] : (SA)0;
|
||||
}
|
||||
} else {
|
||||
const int block_idx = k_pos / (16 * nl);
|
||||
const short il = (k_pos / 16) % nl;
|
||||
|
||||
device const block_q * row_ptr = (device const block_q *)(srcA + args.nb01 * (ra + row) + offset0);
|
||||
|
||||
SA_4x4 temp_a;
|
||||
dequantize_func(row_ptr + block_idx, il, temp_a);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
// Zero-pad A for K positions beyond valid range (handles partial K iterations)
|
||||
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (k_pos + i < K) ? temp_a[i/4][i%4] : (SA)0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Zero-pad rows beyond matrix bounds
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
sa[row * N_MM_NK_TOTAL + (k_base + i)] = (SA)0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// === PHASE 2: Tensor matmul ===
|
||||
auto mA = tA.slice(0, 0);
|
||||
auto mB = tB.slice(loop_k, rb);
|
||||
|
||||
mm.run(mB, mA, cT);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
// Store result tile to output matrix (with batch offset)
|
||||
// cT.store handles bounds checking via tD's extents (M, N)
|
||||
device float * dstBatch = (device float *)dst + im * N * M;
|
||||
|
||||
auto tD = tensor(dstBatch, dextents<int32_t, 2>(M, N), array<int, 2>({1, M}));
|
||||
cT.store(tD.slice(ra, rb));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
template<
|
||||
typename S0, typename S0_4x4, typename S0_8x8,
|
||||
typename S1, typename S1_2x4, typename S1_8x8,
|
||||
typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &),
|
||||
typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||
kernel void kernel_mul_mm(
|
||||
constant ggml_metal_kargs_mul_mm & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
threadgroup S0 * sa = (threadgroup S0 *)(shmem);
|
||||
threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
|
||||
|
||||
constexpr int NR0 = 64;
|
||||
constexpr int NR1 = 32;
|
||||
|
||||
constexpr int NK = 32;
|
||||
constexpr int NL0 = NK/16;
|
||||
constexpr int NL1 = NK/8;
|
||||
|
||||
const int im = tgpig.z;
|
||||
const int r0 = tgpig.y*NR0;
|
||||
const int r1 = tgpig.x*NR1;
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
|
||||
const short nr1 = (args.ne1 - r1 < NR1) ? (args.ne1 - r1) : NR1;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
|
||||
const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31
|
||||
|
||||
const short il0 = (tiitg % NL0);
|
||||
|
||||
short il = il0;
|
||||
|
||||
const int i12 = im % FC_mul_mm_ne12;
|
||||
const int i13 = im / FC_mul_mm_ne12;
|
||||
|
||||
const uint64_t offset0 = (i12/FC_mul_mm_r2)*args.nb02 + (i13/FC_mul_mm_r3)*args.nb03;
|
||||
const short offset1 = il0/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
|
||||
|
||||
const short iy = 8*(tiitg % NL1);
|
||||
|
||||
device const T1 * y = (device const T1 *)(src1
|
||||
+ args.nb13*i13
|
||||
+ args.nb12*i12
|
||||
+ args.nb11*(r1 + lr1)
|
||||
+ args.nb10*iy);
|
||||
|
||||
S0_8x8 ma[4];
|
||||
S1_8x8 mb[2];
|
||||
|
||||
simdgroup_float8x8 mc[8];
|
||||
|
||||
for (short i = 0; i < 8; i++){
|
||||
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
|
||||
}
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
// NOTE: this is massively slower.. WTF?
|
||||
//sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
//const short dx = sx;
|
||||
//const short dy = sy;
|
||||
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
|
||||
y += NK;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
|
||||
threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
|
||||
|
||||
FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
|
||||
}
|
||||
|
||||
lsma += 8*64;
|
||||
lsmb += 4*64;
|
||||
}
|
||||
}
|
||||
|
||||
if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) {
|
||||
// if no bounds checks on the output are needed, we can directly write to device memory
|
||||
device float * C = (device float *) dst +
|
||||
(r0 + 32*(sgitg & 1)) + \
|
||||
(r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0;
|
||||
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false);
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
|
||||
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (sgitg == 0) {
|
||||
for (int j = tiitg; j < nr1; j += NR1) {
|
||||
device float * D = (device float *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0;
|
||||
device float4 * D4 = (device float4 *) D;
|
||||
|
||||
threadgroup float * C = temp_str + (j*NR0);
|
||||
threadgroup float4 * C4 = (threadgroup float4 *) C;
|
||||
|
||||
int i = 0;
|
||||
for (; i < nr0/4; i++) {
|
||||
*(D4 + i) = *(C4 + i);
|
||||
}
|
||||
|
||||
i *= 4;
|
||||
for (; i < nr0; i++) {
|
||||
*(D + i) = *(C + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif // GGML_METAL_HAS_TENSOR
|
||||
|
||||
template<short ne20> // n_expert_used
|
||||
kernel void kernel_mul_mm_id_map0(
|
||||
constant ggml_metal_kargs_mul_mm_id_map0 & args,
|
||||
device const char * src2,
|
||||
device char * htpe,
|
||||
device char * hids,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
ushort tpitg[[thread_position_in_threadgroup]],
|
||||
ushort ntg[[threads_per_threadgroup]]) {
|
||||
const short ide = tpitg; // expert id
|
||||
|
||||
uint32_t n_all = 0;
|
||||
|
||||
device int32_t * ids_i32 = (device int32_t *) hids + ide*args.ne21;
|
||||
|
||||
for (int i21 = 0; i21 < args.ne21; i21 += ntg) { // n_tokens
|
||||
if (i21 + tpitg < args.ne21) {
|
||||
device const int32_t * src2_i32 = (device const int32_t *) (src2 + (i21 + tpitg)*args.nb21);
|
||||
|
||||
threadgroup uint16_t * sids = (threadgroup uint16_t *) shmem + tpitg*ne20;
|
||||
|
||||
#pragma unroll(ne20)
|
||||
for (short i20 = 0; i20 < ne20; i20++) {
|
||||
sids[i20] = src2_i32[i20];
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (short t = 0; t < ntg; t++) {
|
||||
if (i21 + t >= args.ne21) {
|
||||
break;
|
||||
}
|
||||
|
||||
threadgroup const uint16_t * sids = (threadgroup const uint16_t *) shmem + t*ne20;
|
||||
|
||||
short sel = 0;
|
||||
#pragma unroll(ne20)
|
||||
for (short i20 = 0; i20 < ne20; i20++) {
|
||||
sel += (sids[i20] == ide)*(i20 + 1);
|
||||
}
|
||||
|
||||
ids_i32[n_all] = (i21 + t)*ne20 + sel - 1;
|
||||
|
||||
n_all += sel > 0;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
device uint32_t * tpe_u32 = (device uint32_t *) (htpe);
|
||||
tpe_u32[ide] = n_all;
|
||||
}
|
||||
|
||||
typedef decltype(kernel_mul_mm_id_map0<1>) kernel_mul_mm_id_map0_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_1" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<1>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_2" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<2>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_4" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<4>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_5" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<5>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_6" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<6>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<8>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_10")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16>;
|
||||
template [[host_name("kernel_mul_mm_id_map0_ne20_22")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<22>;
|
||||
|
||||
template<typename S0, typename S0_4x4, typename S0_8x8, typename S1, typename S1_2x4, typename S1_8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread S0_4x4 &), typename T0, typename T0_4x4, typename T1, typename T1_2x4>
|
||||
kernel void kernel_mul_mm_id(
|
||||
constant ggml_metal_kargs_mul_mm_id & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * htpe,
|
||||
device const char * hids,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
threadgroup S0 * sa = (threadgroup S0 *)(shmem);
|
||||
threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096);
|
||||
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
threadgroup float * sc = (threadgroup float *)(shmem);
|
||||
#endif
|
||||
|
||||
constexpr int NR0 = 64;
|
||||
constexpr int NR1 = 32;
|
||||
|
||||
constexpr int NK = 32;
|
||||
constexpr int NL0 = NK/16;
|
||||
constexpr int NL1 = NK/8;
|
||||
|
||||
const int im = tgpig.z; // expert
|
||||
const int r0 = tgpig.y*NR0;
|
||||
const int r1 = tgpig.x*NR1;
|
||||
|
||||
device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe);
|
||||
device const int32_t * ids_i32 = (device const int32_t *) (hids);
|
||||
|
||||
const int32_t neh1 = tpe_u32[im];
|
||||
|
||||
if (r1 >= neh1) {
|
||||
return;
|
||||
}
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0;
|
||||
const short nr1 = ( neh1 - r1 < NR1) ? ( neh1 - r1) : NR1;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63
|
||||
const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31
|
||||
|
||||
const short il0 = (tiitg % NL0);
|
||||
|
||||
short il = il0;
|
||||
|
||||
const int id = ids_i32[im*args.ne21 + r1 + lr1];
|
||||
|
||||
const short i11 = (id % args.ne20) % args.ne11;
|
||||
const short i12 = (id / args.ne20);
|
||||
const short i13 = 0;
|
||||
|
||||
const uint64_t offset0 = im*args.nb02 + i13*args.nb03;
|
||||
const short offset1 = il0/nl;
|
||||
|
||||
device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1;
|
||||
|
||||
const short iy = 8*(tiitg % NL1);
|
||||
|
||||
device const T1 * y = (device const T1 *)(src1
|
||||
+ args.nb13*i13
|
||||
+ args.nb12*i12
|
||||
+ args.nb11*i11
|
||||
+ args.nb10*iy);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
S0_8x8 ma[4];
|
||||
S1_8x8 mb[2];
|
||||
|
||||
simdgroup_float8x8 mc[8];
|
||||
|
||||
for (short i = 0; i < 8; i++){
|
||||
mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
|
||||
}
|
||||
#else
|
||||
auto tA = tensor<threadgroup S0, dextents<int32_t, 2>, tensor_inline>(sa, dextents<int32_t, 2>(NK, NR0));
|
||||
auto tB = tensor<threadgroup S1, dextents<int32_t, 2>, tensor_inline>(sb, dextents<int32_t, 2>(NR1, NK ));
|
||||
|
||||
mpp::tensor_ops::matmul2d<
|
||||
mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate),
|
||||
execution_simdgroups<4>> mm;
|
||||
|
||||
auto cT = mm.get_destination_cooperative_tensor<decltype(tA), decltype(tB), float>();
|
||||
#endif
|
||||
|
||||
for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) {
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? (S0) *((device T0 *) x + i) : (S0) 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
//const short lx = i%8;
|
||||
//const short ly = (tiitg/NL0)%8;
|
||||
const short lx = (tiitg/NL0)%8;
|
||||
const short ly = i%8;
|
||||
|
||||
const short ib = 8*sx + sy;
|
||||
|
||||
// NOTE: this is massively slower.. WTF?
|
||||
//sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4];
|
||||
|
||||
*(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
//const short dx = sx;
|
||||
//const short dy = sy;
|
||||
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
|
||||
const short ib = 4*sx + sy;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#else
|
||||
// load data and store to threadgroup memory
|
||||
if (is_same<T0_4x4, block_q>::value && FC_mul_mm_bc_inp) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// no need for dequantization
|
||||
for (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0;
|
||||
}
|
||||
} else {
|
||||
S0_4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 16; i++) {
|
||||
const short sx = 2*il0 + i/8;
|
||||
const short sy = (tiitg/NL0)/8;
|
||||
|
||||
const short lx = i%8;
|
||||
const short ly = (tiitg/NL0)%8;
|
||||
//const short lx = (tiitg/NL0)%8;
|
||||
//const short ly = i%8;
|
||||
|
||||
*(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4];
|
||||
}
|
||||
}
|
||||
|
||||
if (FC_mul_mm_bc_inp) {
|
||||
for (short i = 0; i < 8; ++i) {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0;
|
||||
}
|
||||
} else {
|
||||
const short sx = (tiitg%NL1);
|
||||
const short sy = (tiitg/NL1)/8;
|
||||
|
||||
//const short lx = i;
|
||||
const short ly = (tiitg/NL1)%8;
|
||||
//const short lx = (tiitg/NL1)%8;
|
||||
//const short ly = i;
|
||||
|
||||
*(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y));
|
||||
}
|
||||
#endif
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2 + nl - 1)/nl : x;
|
||||
|
||||
y += NK;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#ifndef GGML_METAL_HAS_TENSOR
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
|
||||
threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
|
||||
|
||||
FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 4; i++) {
|
||||
simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 2; i++) {
|
||||
simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
|
||||
}
|
||||
|
||||
simdgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
FOR_UNROLL (short i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
|
||||
}
|
||||
|
||||
lsma += 8*64;
|
||||
lsmb += 4*64;
|
||||
}
|
||||
#else
|
||||
auto sA = tA.slice(0, 0);
|
||||
auto sB = tB.slice(0, 0);
|
||||
|
||||
mm.run(sB, sA, cT);
|
||||
#endif
|
||||
}
|
||||
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#ifdef GGML_METAL_HAS_TENSOR
|
||||
auto tC = tensor<threadgroup float, dextents<int32_t, 2>, tensor_inline>(sc, dextents<int32_t, 2>(NR0, NR1));
|
||||
cT.store(tC);
|
||||
#else
|
||||
threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0;
|
||||
|
||||
for (short i = 0; i < 8; i++) {
|
||||
simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false);
|
||||
}
|
||||
#endif
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (short j = sgitg; j < nr1; j += 4) {
|
||||
const int id = ids_i32[im*args.ne21 + r1 + j];
|
||||
|
||||
const short ide = id % args.ne20;
|
||||
const short idt = id / args.ne20;
|
||||
|
||||
device float * D = (device float *) dst + r0 + ide*args.ne0 + idt*args.ne1*args.ne0;
|
||||
device float4 * D4 = (device float4 *) D;
|
||||
|
||||
threadgroup float * C = (threadgroup float *) shmem + j*NR0;
|
||||
threadgroup float4 * C4 = (threadgroup float4 *) C;
|
||||
|
||||
int i = tiisg;
|
||||
for (; i < nr0/4; i += 32) {
|
||||
*(D4 + i) = *(C4 + i);
|
||||
}
|
||||
|
||||
i = (4*(nr0/4)) + tiisg;
|
||||
for (; i < nr0; i += 32) {
|
||||
*(D + i) = *(C + i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
//
|
||||
|
||||
typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, float, float2x4>) mul_mm_t;
|
||||
|
||||
template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, float, float2x4>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_mul_mm_bf16_f32")]] kernel mul_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat, bfloat2x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16, bfloat, bfloat4x4, float, float2x4>;
|
||||
#endif
|
||||
template [[host_name("kernel_mul_mm_q1_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_mxfp4_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs, float, float4x4, float, float2x4>;
|
||||
|
||||
template [[host_name("kernel_mul_mm_f32_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_f16_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q1_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q4_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q4_1_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q5_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q5_1_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q8_0_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_mxfp4_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q2_K_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q3_K_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q4_K_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q5_K_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_q6_K_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq3_s_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq2_s_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f16")]] kernel mul_mm_t kernel_mul_mm<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs, float, float4x4, half, half2x4>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
//
|
||||
|
||||
typedef decltype(kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, float, float2x4>) mul_mm_id;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, float, float2x4>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_mul_mm_id_bf16_f32")]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat, bfloat2x4, simdgroup_bfloat8x8, bfloat4x4, 1, dequantize_bf16, bfloat, bfloat4x4, float, float2x4>;
|
||||
#endif
|
||||
template [[host_name("kernel_mul_mm_id_q1_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_mxfp4_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl, float, float4x4, float, float2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs, float, float4x4, float, float2x4>;
|
||||
|
||||
template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, float4x4, 1, dequantize_f32, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, half4x4, 1, dequantize_f16, half, half4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q1_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q1_0, 8, dequantize_q1_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_0, 2, dequantize_q4_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_1, 2, dequantize_q4_1, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_0, 2, dequantize_q5_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_1_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_1, 2, dequantize_q5_1, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q8_0_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q8_0, 2, dequantize_q8_0, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_mxfp4_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_mxfp4, 2, dequantize_mxfp4, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q2_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q3_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_nl, 2, dequantize_iq4_nl, float, float4x4, half, half2x4>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half, half2x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs, float, float4x4, half, half2x4>;
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,308 @@
|
||||
#include "common.h"
|
||||
|
||||
// F == 1 : norm (no fuse)
|
||||
// F == 2 : norm + mul
|
||||
// F == 3 : norm + mul + add
|
||||
template <typename T, short F>
|
||||
kernel void kernel_norm_fuse_impl(
|
||||
constant ggml_metal_kargs_norm & args,
|
||||
device const char * src0,
|
||||
device const char * src1_0,
|
||||
device const char * src1_1,
|
||||
device char * dst,
|
||||
threadgroup float * shmem_f32 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
if (sgitg == 0) {
|
||||
shmem_f32[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
const int i01 = tgpig.x;
|
||||
const int i02 = tgpig.y;
|
||||
const int i03 = tgpig.z;
|
||||
|
||||
device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
|
||||
device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
|
||||
T sumft(0.0f);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
sumft += x[i00];
|
||||
}
|
||||
sumf = dot(sumft, T(1.0f));
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shmem_f32[sgitg] = sumf;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
sumf = shmem_f32[tiisg];
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
const float mean = sumf/args.ne00;
|
||||
|
||||
device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
|
||||
sumf = 0.0f;
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
y[i00] = x[i00] - mean;
|
||||
sumf += dot(y[i00], y[i00]);
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shmem_f32[sgitg] = sumf;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
sumf = shmem_f32[tiisg];
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
const float variance = sumf/args.ne00;
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + args.eps);
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
if (F == 1) {
|
||||
y[i00] = (y[i00]*scale);
|
||||
}
|
||||
if (F == 2) {
|
||||
y[i00] = (y[i00]*scale)*f0[i00];
|
||||
}
|
||||
if (F == 3) {
|
||||
y[i00] = (y[i00]*scale)*f0[i00] + f1[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_norm_fuse_impl<float4, 1>) kernel_norm_fuse_t;
|
||||
|
||||
template [[host_name("kernel_norm_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 1>;
|
||||
template [[host_name("kernel_norm_mul_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 2>;
|
||||
template [[host_name("kernel_norm_mul_add_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 3>;
|
||||
|
||||
template [[host_name("kernel_norm_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 1>;
|
||||
template [[host_name("kernel_norm_mul_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 2>;
|
||||
template [[host_name("kernel_norm_mul_add_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 3>;
|
||||
|
||||
// F == 1 : rms_norm (no fuse)
|
||||
// F == 2 : rms_norm + mul
|
||||
// F == 3 : rms_norm + mul + add
|
||||
template <typename T, short F>
|
||||
kernel void kernel_rms_norm_fuse_impl(
|
||||
constant ggml_metal_kargs_norm & args,
|
||||
device const char * src0,
|
||||
device const char * src1_0,
|
||||
device const char * src1_1,
|
||||
device char * dst,
|
||||
threadgroup float * shmem_f32 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
if (sgitg == 0) {
|
||||
shmem_f32[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
const int i01 = tgpig.x;
|
||||
const int i02 = tgpig.y;
|
||||
const int i03 = tgpig.z;
|
||||
|
||||
device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
|
||||
device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
sumf += dot(x[i00], x[i00]);
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shmem_f32[sgitg] = sumf;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
sumf = shmem_f32[tiisg];
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
const float mean = sumf/args.ne00;
|
||||
const float scale = 1.0f/sqrt(mean + args.eps);
|
||||
|
||||
device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
if (F == 1) {
|
||||
y[i00] = (x[i00]*scale);
|
||||
}
|
||||
if (F == 2) {
|
||||
y[i00] = (x[i00]*scale)*f0[i00];
|
||||
}
|
||||
if (F == 3) {
|
||||
y[i00] = (x[i00]*scale)*f0[i00] + f1[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_rms_norm_fuse_impl<float4, 1>) kernel_rms_norm_fuse_t;
|
||||
|
||||
template [[host_name("kernel_rms_norm_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 3>;
|
||||
|
||||
template [[host_name("kernel_rms_norm_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 3>;
|
||||
|
||||
template <typename T0, typename T>
|
||||
kernel void kernel_l2_norm_impl(
|
||||
constant ggml_metal_kargs_l2_norm & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
threadgroup float * shmem_f32 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i03 = tgpig.z;
|
||||
const int i02 = tgpig.y;
|
||||
const int i01 = tgpig.x;
|
||||
|
||||
if (sgitg == 0) {
|
||||
shmem_f32[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
device const T0 * x = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
|
||||
sumf += dot(x[i00], x[i00]);
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shmem_f32[sgitg] = sumf;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
sumf = shmem_f32[tiisg];
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
const float scale = 1.0f/max(sqrt(sumf), args.eps);
|
||||
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
|
||||
y[i00] = x[i00] * scale;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_l2_norm_impl<float, float>) kernel_l2_norm_t;
|
||||
|
||||
template [[host_name("kernel_l2_norm_f32_f32")]] kernel kernel_l2_norm_t kernel_l2_norm_impl<float, float>;
|
||||
template [[host_name("kernel_l2_norm_f32_f32_4")]] kernel kernel_l2_norm_t kernel_l2_norm_impl<float4, float4>;
|
||||
|
||||
kernel void kernel_group_norm_f32(
|
||||
constant ggml_metal_kargs_group_norm & args,
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
threadgroup float * buf [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t ne = args.ne00*args.ne01*args.ne02;
|
||||
const int64_t gs = args.ne00*args.ne01*((args.ne02 + args.ngrp - 1) / args.ngrp);
|
||||
|
||||
int start = tgpig * gs;
|
||||
int end = start + gs;
|
||||
|
||||
start += tpitg;
|
||||
|
||||
if (end >= ne) {
|
||||
end = ne;
|
||||
}
|
||||
|
||||
float tmp = 0.0f; // partial sum for thread in warp
|
||||
|
||||
for (int j = start; j < end; j += ntg) {
|
||||
tmp += src0[j];
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
tmp = simd_sum(tmp);
|
||||
if (ntg > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
buf[sgitg] = tmp;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
tmp = buf[tiisg];
|
||||
tmp = simd_sum(tmp);
|
||||
}
|
||||
|
||||
const float mean = tmp / gs;
|
||||
tmp = 0.0f;
|
||||
|
||||
for (int j = start; j < end; j += ntg) {
|
||||
float xi = src0[j] - mean;
|
||||
dst[j] = xi;
|
||||
tmp += xi * xi;
|
||||
}
|
||||
|
||||
tmp = simd_sum(tmp);
|
||||
if (ntg > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
buf[sgitg] = tmp;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
tmp = buf[tiisg];
|
||||
tmp = simd_sum(tmp);
|
||||
}
|
||||
|
||||
const float variance = tmp / gs;
|
||||
const float scale = 1.0f/sqrt(variance + args.eps);
|
||||
for (int j = start; j < end; j += ntg) {
|
||||
dst[j] *= scale;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,148 @@
|
||||
#include "common.h"
|
||||
|
||||
kernel void kernel_pool_2d_max_f32(
|
||||
constant ggml_metal_kargs_pool_2d & args,
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint gid[[thread_position_in_grid]]) {
|
||||
|
||||
if (gid >= args.np) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int idx = gid;
|
||||
const int I_HW = args.IH * args.IW;
|
||||
const int O_HW = args.OH * args.OW;
|
||||
const int nc = idx / O_HW;
|
||||
const int cur_oh = idx % O_HW / args.OW;
|
||||
const int cur_ow = idx % O_HW % args.OW;
|
||||
|
||||
device const float * i_ptr = src0 + nc * I_HW;
|
||||
device float * o_ptr = dst + nc * O_HW;
|
||||
|
||||
const int start_h = cur_oh * args.s1 - args.p1;
|
||||
const int bh = MAX(0, start_h);
|
||||
const int eh = MIN(args.IH, start_h + args.k1);
|
||||
const int start_w = cur_ow * args.s0 - args.p0;
|
||||
const int bw = MAX(0, start_w);
|
||||
const int ew = MIN(args.IW, start_w + args.k0);
|
||||
|
||||
float res = -INFINITY;
|
||||
|
||||
for (int i = bh; i < eh; i += 1) {
|
||||
for (int j = bw; j < ew; j += 1) {
|
||||
res = MAX(res, i_ptr[i * args.IW + j]);
|
||||
}
|
||||
}
|
||||
|
||||
o_ptr[cur_oh * args.OW + cur_ow] = res;
|
||||
}
|
||||
|
||||
kernel void kernel_pool_2d_avg_f32(
|
||||
constant ggml_metal_kargs_pool_2d & args,
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint gid[[thread_position_in_grid]]) {
|
||||
|
||||
if (gid >= args.np) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int idx = gid;
|
||||
const int I_HW = args.IH * args.IW;
|
||||
const int O_HW = args.OH * args.OW;
|
||||
const int nc = idx / O_HW;
|
||||
const int cur_oh = idx % O_HW / args.OW;
|
||||
const int cur_ow = idx % O_HW % args.OW;
|
||||
|
||||
device const float * i_ptr = src0 + nc * I_HW;
|
||||
device float * o_ptr = dst + nc * O_HW;
|
||||
|
||||
const int start_h = cur_oh * args.s1 - args.p1;
|
||||
const int bh = MAX(0, start_h);
|
||||
const int eh = MIN(args.IH, start_h + args.k1);
|
||||
const int start_w = cur_ow * args.s0 - args.p0;
|
||||
const int bw = MAX(0, start_w);
|
||||
const int ew = MIN(args.IW, start_w + args.k0);
|
||||
// const float scale = 1. / ((eh - bh) * (ew - bw));
|
||||
const float scale = 1. / (args.k0 * args.k1);
|
||||
|
||||
float res = 0;
|
||||
|
||||
for (int i = bh; i < eh; i += 1) {
|
||||
for (int j = bw; j < ew; j += 1) {
|
||||
float cur = i_ptr[i * args.IW + j];
|
||||
res += cur * scale;
|
||||
}
|
||||
}
|
||||
|
||||
o_ptr[cur_oh * args.OW + cur_ow] = res;
|
||||
}
|
||||
|
||||
|
||||
kernel void kernel_pool_1d_max_f32(
|
||||
constant ggml_metal_kargs_pool_1d & args,
|
||||
device const float * src,
|
||||
device float * dst,
|
||||
uint gid [[thread_position_in_grid]]
|
||||
) {
|
||||
|
||||
if (gid >= args.np) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int ow = (int)gid % args.OW;
|
||||
const int row = (int)gid / args.OW;
|
||||
|
||||
const int base = ow * args.s0 - args.p0;
|
||||
|
||||
float acc = -INFINITY;
|
||||
|
||||
const int src_off = row * args.IW;
|
||||
const int dst_off = row * args.OW;
|
||||
|
||||
for (int ki = 0; ki < args.k0; ++ki) {
|
||||
int j = base + ki;
|
||||
if (j < 0 || j >= args.IW){
|
||||
continue;
|
||||
}
|
||||
float v = src[src_off + j];
|
||||
acc = max(acc, v);
|
||||
}
|
||||
|
||||
dst[dst_off + ow] = acc;
|
||||
}
|
||||
|
||||
kernel void kernel_pool_1d_avg_f32(
|
||||
constant ggml_metal_kargs_pool_1d & args,
|
||||
device const float * src,
|
||||
device float * dst,
|
||||
uint gid [[thread_position_in_grid]]
|
||||
) {
|
||||
|
||||
if (gid >= args.np) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int ow = (int)gid % args.OW;
|
||||
const int row = (int)gid / args.OW;
|
||||
|
||||
const int base = ow * args.s0 - args.p0;
|
||||
|
||||
float acc = 0.0f;
|
||||
int cnt = 0;
|
||||
|
||||
const int src_off = row * args.IW;
|
||||
const int dst_off = row * args.OW;
|
||||
|
||||
for (int ki = 0; ki < args.k0; ++ki) {
|
||||
const int j = base + ki;
|
||||
if (j < 0 || j >= args.IW) {
|
||||
continue;
|
||||
}
|
||||
acc += src[src_off + j];
|
||||
cnt += 1;
|
||||
}
|
||||
|
||||
dst[dst_off + ow] = (cnt > 0) ? (acc / (float)cnt) : 0.0f;
|
||||
}
|
||||
@@ -0,0 +1,213 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
void quantize_q1_0(device const float * src, device block_q1_0 & dst) {
|
||||
float sum_abs = 0.0f;
|
||||
for (int j = 0; j < QK1_0; j++) {
|
||||
sum_abs += fabs(src[j]);
|
||||
}
|
||||
dst.d = sum_abs / QK1_0;
|
||||
|
||||
for (int j = 0; j < QK1_0 / 8; j++) {
|
||||
dst.qs[j] = 0;
|
||||
}
|
||||
for (int j = 0; j < QK1_0; j++) {
|
||||
if (src[j] >= 0.0f) {
|
||||
dst.qs[j / 8] |= (1 << (j % 8));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_0; j++) {
|
||||
const float v = src[j];
|
||||
if (amax < fabs(v)) {
|
||||
amax = fabs(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = max / -8;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dst.d = d;
|
||||
|
||||
for (int j = 0; j < QK4_0/2; ++j) {
|
||||
const float x0 = src[0 + j]*id;
|
||||
const float x1 = src[QK4_0/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
|
||||
|
||||
dst.qs[j] = xi0;
|
||||
dst.qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float min = FLT_MAX;
|
||||
float max = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < QK4_1; j++) {
|
||||
const float v = src[j];
|
||||
if (min > v) min = v;
|
||||
if (max < v) max = v;
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dst.d = d;
|
||||
dst.m = min;
|
||||
|
||||
for (int j = 0; j < QK4_1/2; ++j) {
|
||||
const float x0 = (src[0 + j] - min)*id;
|
||||
const float x1 = (src[QK4_1/2 + j] - min)*id;
|
||||
|
||||
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
|
||||
|
||||
dst.qs[j] = xi0;
|
||||
dst.qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK5_0; j++) {
|
||||
const float v = src[j];
|
||||
if (amax < fabs(v)) {
|
||||
amax = fabs(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = max / -16;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dst.d = d;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_0/2; ++j) {
|
||||
const float x0 = src[0 + j]*id;
|
||||
const float x1 = src[QK5_0/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
|
||||
const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
|
||||
|
||||
dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
|
||||
}
|
||||
|
||||
thread const uint8_t * qh8 = (thread const uint8_t *)&qh;
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
dst.qh[j] = qh8[j];
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_q5_1(device const float * src, device block_q5_1 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float max = src[0];
|
||||
float min = src[0];
|
||||
|
||||
for (int j = 1; j < QK5_1; j++) {
|
||||
const float v = src[j];
|
||||
min = v < min ? v : min;
|
||||
max = v > max ? v : max;
|
||||
}
|
||||
|
||||
const float d = (max - min) / 31;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dst.d = d;
|
||||
dst.m = min;
|
||||
|
||||
uint32_t qh = 0;
|
||||
for (int j = 0; j < QK5_1/2; ++j) {
|
||||
const float x0 = (src[0 + j] - min)*id;
|
||||
const float x1 = (src[QK5_1/2 + j] - min)*id;
|
||||
|
||||
const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
|
||||
const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
|
||||
|
||||
dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
|
||||
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
|
||||
}
|
||||
|
||||
thread const uint8_t * qh8 = (thread const uint8_t *)&qh;
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
dst.qh[j] = qh8[j];
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_q8_0(device const float * src, device block_q8_0 & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < QK8_0; j++) {
|
||||
const float v = src[j];
|
||||
amax = MAX(amax, fabs(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
dst.d = d;
|
||||
|
||||
for (int j = 0; j < QK8_0; ++j) {
|
||||
const float x0 = src[j]*id;
|
||||
|
||||
dst.qs[j] = round(x0);
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) {
|
||||
#pragma METAL fp math_mode(safe)
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < QK4_NL; j++) {
|
||||
const float v = src[j];
|
||||
if (amax < fabs(v)) {
|
||||
amax = fabs(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = max / kvalues_iq4nl_f[0];
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int j = 0; j < QK4_NL/2; ++j) {
|
||||
const float x0 = src[0 + j]*id;
|
||||
const float x1 = src[QK4_NL/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0);
|
||||
const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1);
|
||||
|
||||
dst.qs[j] = xi0 | (xi1 << 4);
|
||||
|
||||
const float v0 = kvalues_iq4nl_f[xi0];
|
||||
const float v1 = kvalues_iq4nl_f[xi1];
|
||||
const float w0 = src[0 + j]*src[0 + j];
|
||||
const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j];
|
||||
sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j];
|
||||
sumq2 += w0*v0*v0 + w1*v1*v1;
|
||||
|
||||
}
|
||||
|
||||
dst.d = sumq2 > 0 ? sumqx/sumq2 : d;
|
||||
}
|
||||
@@ -0,0 +1,389 @@
|
||||
#include "common.h"
|
||||
#include "dequantize.h"
|
||||
#include "quantize.h"
|
||||
|
||||
template<typename T0, typename T1>
|
||||
kernel void kernel_cpy_t_t(
|
||||
constant ggml_metal_kargs_cpy & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig[2];
|
||||
const int32_t i02 = tgpig[1];
|
||||
const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
|
||||
const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
|
||||
|
||||
const int32_t i3 = n/(args.ne2*args.ne1*args.ne0);
|
||||
const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
|
||||
const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
|
||||
const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
|
||||
|
||||
device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.ne00;) {
|
||||
device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
|
||||
dst_data[i00] = (T1) src[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_cpy_t_t<float, float>) kernel_cpy_t;
|
||||
|
||||
template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t<float, float>;
|
||||
template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy_t_t<float, half>;
|
||||
template [[host_name("kernel_cpy_f32_i32")]] kernel kernel_cpy_t kernel_cpy_t_t<float, int32_t>;
|
||||
template [[host_name("kernel_cpy_i32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t<int32_t, float>;
|
||||
template [[host_name("kernel_cpy_i32_i32")]] kernel kernel_cpy_t kernel_cpy_t_t<int32_t, int32_t>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_cpy_f32_bf16")]] kernel kernel_cpy_t kernel_cpy_t_t<float, bfloat>;
|
||||
#endif
|
||||
template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t<half, float>;
|
||||
template [[host_name("kernel_cpy_f16_f16")]] kernel kernel_cpy_t kernel_cpy_t_t<half, half>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t<bfloat, float>;
|
||||
template [[host_name("kernel_cpy_bf16_bf16")]] kernel kernel_cpy_t kernel_cpy_t_t<bfloat, bfloat>;
|
||||
#endif
|
||||
|
||||
template<short QK,
|
||||
typename block_q,
|
||||
void (*quantize_func)(device const float *, device block_q &)>
|
||||
kernel void kernel_cpy_f32_q(
|
||||
constant ggml_metal_kargs_cpy & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig[2];
|
||||
const int32_t i02 = tgpig[1];
|
||||
const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
|
||||
const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
|
||||
|
||||
const int32_t i3 = n / (args.ne2*args.ne1*args.ne0);
|
||||
const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0);
|
||||
const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0;
|
||||
const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK;
|
||||
|
||||
device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) {
|
||||
device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00);
|
||||
|
||||
quantize_func(src, dst_data[i00]);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_cpy_f32_q<QK8_0, block_q8_0, quantize_q8_0>) cpy_f_q_t;
|
||||
|
||||
template [[host_name("kernel_cpy_f32_q8_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK8_0, block_q8_0, quantize_q8_0>;
|
||||
template [[host_name("kernel_cpy_f32_q1_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK1_0, block_q1_0, quantize_q1_0>;
|
||||
template [[host_name("kernel_cpy_f32_q4_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK4_0, block_q4_0, quantize_q4_0>;
|
||||
template [[host_name("kernel_cpy_f32_q4_1")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK4_1, block_q4_1, quantize_q4_1>;
|
||||
template [[host_name("kernel_cpy_f32_q5_0")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK5_0, block_q5_0, quantize_q5_0>;
|
||||
template [[host_name("kernel_cpy_f32_q5_1")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK5_1, block_q5_1, quantize_q5_1>;
|
||||
template [[host_name("kernel_cpy_f32_iq4_nl")]] kernel cpy_f_q_t kernel_cpy_f32_q<QK4_NL, block_iq4_nl, quantize_iq4_nl>;
|
||||
|
||||
template<typename T4x4, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
|
||||
kernel void kernel_cpy_q_f32(
|
||||
constant ggml_metal_kargs_cpy & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig[2];
|
||||
const int32_t i02 = tgpig[1];
|
||||
const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y;
|
||||
const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0;
|
||||
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;
|
||||
|
||||
const int32_t i3 = n/(args.ne2*args.ne1*args.ne0);
|
||||
const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0);
|
||||
const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0;
|
||||
const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0);
|
||||
|
||||
device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) {
|
||||
T4x4 temp;
|
||||
dequantize_func(src_data + i00/nl, i00%nl, temp);
|
||||
dst_data[i00] = temp;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>) cpy_q_f_t;
|
||||
|
||||
template [[host_name("kernel_cpy_q1_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q1_0, 8, dequantize_q1_0>;
|
||||
template [[host_name("kernel_cpy_q4_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_cpy_q4_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_cpy_q5_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_cpy_q5_1_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_cpy_q8_0_f32")]] kernel cpy_q_f_t kernel_cpy_q_f32<float4x4, block_q8_0, 2, dequantize_q8_0>;
|
||||
|
||||
template [[host_name("kernel_cpy_q1_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q1_0, 8, dequantize_q1_0>;
|
||||
template [[host_name("kernel_cpy_q4_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_cpy_q4_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_concat(
|
||||
constant ggml_metal_kargs_concat & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int i3 = tgpig.z;
|
||||
const int i2 = tgpig.y;
|
||||
const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;
|
||||
|
||||
if (i1 >= args.ne1) {
|
||||
return;
|
||||
}
|
||||
|
||||
int o[4] = {0, 0, 0, 0};
|
||||
o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
device const T * x;
|
||||
|
||||
if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
|
||||
x = (device const T *)(src0 + (i3 )*args.nb03 + (i2 )*args.nb02 + (i1 )*args.nb01 + (i0 )*args.nb00);
|
||||
} else {
|
||||
x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
|
||||
}
|
||||
|
||||
device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_concat<float>) kernel_concat_t;
|
||||
|
||||
template [[host_name("kernel_concat_f32")]] kernel kernel_concat_t kernel_concat<float>;
|
||||
template [[host_name("kernel_concat_f16")]] kernel kernel_concat_t kernel_concat<half>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat<bfloat>;
|
||||
#endif
|
||||
template [[host_name("kernel_concat_i8")]] kernel kernel_concat_t kernel_concat<char>;
|
||||
template [[host_name("kernel_concat_i16")]] kernel kernel_concat_t kernel_concat<short>;
|
||||
template [[host_name("kernel_concat_i32")]] kernel kernel_concat_t kernel_concat<int>;
|
||||
template [[host_name("kernel_concat_i64")]] kernel kernel_concat_t kernel_concat<long>;
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows_q(
|
||||
constant ggml_metal_kargs_get_rows & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device void * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 ntg [[threads_per_threadgroup]]) {
|
||||
const int32_t iw0 = tgpig.x/args.ne10;
|
||||
const int32_t i10 = tgpig.x%args.ne10;
|
||||
const int32_t i11 = tgpig.y;
|
||||
const int32_t i12 = tgpig.z;
|
||||
|
||||
const int32_t r = ((const device int32_t *) ((const device char *) src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0];
|
||||
|
||||
const int32_t i02 = i11;
|
||||
const int32_t i03 = i12;
|
||||
|
||||
auto psrc = (device const block_q *) ((const device char *) src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01);
|
||||
auto pdst = (device float4x4 *) (( device char *) dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1);
|
||||
|
||||
for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) {
|
||||
float4x4 temp;
|
||||
dequantize_func(psrc + ind/nl, ind%nl, temp);
|
||||
pdst[ind] = temp;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T0, typename T>
|
||||
kernel void kernel_get_rows_f(
|
||||
constant ggml_metal_kargs_get_rows & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device void * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 ntg [[threads_per_threadgroup]]) {
|
||||
const int32_t iw0 = tgpig.x/args.ne10;
|
||||
const int32_t i10 = tgpig.x%args.ne10;
|
||||
const int32_t i11 = tgpig.y;
|
||||
const int32_t i12 = tgpig.z;
|
||||
|
||||
const int32_t r = ((const device int32_t *) ((const device char *) src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0];
|
||||
|
||||
const int32_t i02 = i11;
|
||||
const int32_t i03 = i12;
|
||||
|
||||
auto psrc = (const device T0 *) ((const device char *) src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01);
|
||||
auto pdst = ( device T *) (( device char *) dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1);
|
||||
|
||||
for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) {
|
||||
pdst[ind] = psrc[ind];
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TI, typename block_q, void (*quantize_func)(device const float *, device block_q &)>
|
||||
kernel void kernel_set_rows_q32(
|
||||
constant ggml_metal_kargs_set_rows & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg [[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
|
||||
const int32_t i12 = i03%args.ne12;
|
||||
const int32_t i11 = i02%args.ne11;
|
||||
|
||||
const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x;
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t i10 = i01;
|
||||
const TI i1 = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0];
|
||||
|
||||
device block_q * dst_row = ( device block_q *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3);
|
||||
const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
|
||||
|
||||
for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) {
|
||||
quantize_func(src_row + 32*ind, dst_row[ind]);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T, typename TI>
|
||||
kernel void kernel_set_rows_f(
|
||||
constant ggml_metal_kargs_set_rows & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint3 tptg [[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
|
||||
const int32_t i12 = i03%args.ne12;
|
||||
const int32_t i11 = i02%args.ne11;
|
||||
|
||||
const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x;
|
||||
if (i01 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t i10 = i01;
|
||||
const TI i1 = ((const device TI *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0];
|
||||
|
||||
device T * dst_row = ( device T *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3);
|
||||
const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
|
||||
|
||||
for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) {
|
||||
dst_row[ind] = (T) src_row[ind];
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// get rows
|
||||
//
|
||||
|
||||
typedef decltype(kernel_get_rows_f<float, float>) get_rows_f_t;
|
||||
|
||||
template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f<float, float>;
|
||||
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f<half, float>;
|
||||
template [[host_name("kernel_get_rows_i32")]] kernel get_rows_f_t kernel_get_rows_f<int32_t, int32_t>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_get_rows_bf16")]] kernel get_rows_f_t kernel_get_rows_f<bfloat, float>;
|
||||
#endif
|
||||
|
||||
typedef decltype(kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>) get_rows_q_t;
|
||||
|
||||
template [[host_name("kernel_get_rows_q1_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q1_0, 8, dequantize_q1_0>;
|
||||
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_0, 2, dequantize_q4_0>;
|
||||
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_1, 2, dequantize_q4_1>;
|
||||
template [[host_name("kernel_get_rows_q5_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_0, 2, dequantize_q5_0>;
|
||||
template [[host_name("kernel_get_rows_q5_1")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_1, 2, dequantize_q5_1>;
|
||||
template [[host_name("kernel_get_rows_q8_0")]] kernel get_rows_q_t kernel_get_rows_q<block_q8_0, 2, dequantize_q8_0>;
|
||||
template [[host_name("kernel_get_rows_mxfp4")]] kernel get_rows_q_t kernel_get_rows_q<block_mxfp4, 2, dequantize_mxfp4>;
|
||||
template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q2_K, QK_NL, dequantize_q2_K>;
|
||||
template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q3_K, QK_NL, dequantize_q3_K>;
|
||||
template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q4_K, QK_NL, dequantize_q4_K>;
|
||||
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q5_K, QK_NL, dequantize_q5_K>;
|
||||
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_q_t kernel_get_rows_q<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_q_t kernel_get_rows_q<block_iq3_s, QK_NL, dequantize_iq3_s>;
|
||||
template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_q_t kernel_get_rows_q<block_iq2_s, QK_NL, dequantize_iq2_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_q_t kernel_get_rows_q<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_q_t kernel_get_rows_q<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_q_t kernel_get_rows_q<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get_rows_q<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
|
||||
//
|
||||
// set rows
|
||||
//
|
||||
|
||||
typedef decltype(kernel_set_rows_f<float, int64_t>) set_rows_f_t;
|
||||
|
||||
template [[host_name("kernel_set_rows_f32_i64")]] kernel set_rows_f_t kernel_set_rows_f<float, int64_t>;
|
||||
template [[host_name("kernel_set_rows_f32_i32")]] kernel set_rows_f_t kernel_set_rows_f<float, int32_t>;
|
||||
template [[host_name("kernel_set_rows_f16_i64")]] kernel set_rows_f_t kernel_set_rows_f<half, int64_t>;
|
||||
template [[host_name("kernel_set_rows_f16_i32")]] kernel set_rows_f_t kernel_set_rows_f<half, int32_t>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_set_rows_bf16_i64")]] kernel set_rows_f_t kernel_set_rows_f<bfloat, int64_t>;
|
||||
template [[host_name("kernel_set_rows_bf16_i32")]] kernel set_rows_f_t kernel_set_rows_f<bfloat, int32_t>;
|
||||
#endif
|
||||
|
||||
typedef decltype(kernel_set_rows_q32<int64_t, block_q8_0, quantize_q8_0>) set_rows_q32_t;
|
||||
|
||||
template [[host_name("kernel_set_rows_q8_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32<int64_t, block_q8_0, quantize_q8_0>;
|
||||
template [[host_name("kernel_set_rows_q8_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32<int32_t, block_q8_0, quantize_q8_0>;
|
||||
template [[host_name("kernel_set_rows_q4_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32<int64_t, block_q4_0, quantize_q4_0>;
|
||||
template [[host_name("kernel_set_rows_q4_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32<int32_t, block_q4_0, quantize_q4_0>;
|
||||
template [[host_name("kernel_set_rows_q4_1_i64")]] kernel set_rows_q32_t kernel_set_rows_q32<int64_t, block_q4_1, quantize_q4_1>;
|
||||
template [[host_name("kernel_set_rows_q4_1_i32")]] kernel set_rows_q32_t kernel_set_rows_q32<int32_t, block_q4_1, quantize_q4_1>;
|
||||
template [[host_name("kernel_set_rows_q5_0_i64")]] kernel set_rows_q32_t kernel_set_rows_q32<int64_t, block_q5_0, quantize_q5_0>;
|
||||
template [[host_name("kernel_set_rows_q5_0_i32")]] kernel set_rows_q32_t kernel_set_rows_q32<int32_t, block_q5_0, quantize_q5_0>;
|
||||
template [[host_name("kernel_set_rows_q5_1_i64")]] kernel set_rows_q32_t kernel_set_rows_q32<int64_t, block_q5_1, quantize_q5_1>;
|
||||
template [[host_name("kernel_set_rows_q5_1_i32")]] kernel set_rows_q32_t kernel_set_rows_q32<int32_t, block_q5_1, quantize_q5_1>;
|
||||
template [[host_name("kernel_set_rows_iq4_nl_i64")]] kernel set_rows_q32_t kernel_set_rows_q32<int64_t, block_iq4_nl, quantize_iq4_nl>;
|
||||
template [[host_name("kernel_set_rows_iq4_nl_i32")]] kernel set_rows_q32_t kernel_set_rows_q32<int32_t, block_iq4_nl, quantize_iq4_nl>;
|
||||
@@ -0,0 +1,228 @@
|
||||
#include "common.h"
|
||||
|
||||
kernel void kernel_op_sum_f32(
|
||||
constant ggml_metal_kargs_sum & args,
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
threadgroup float * shmem_f32 [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
if (args.np == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: become function constant
|
||||
const uint nsg = (ntg.x + 31) / 32;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (uint64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
|
||||
sumf += src0[i0];
|
||||
}
|
||||
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shmem_f32[sgitg] = sumf;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
float total = 0;
|
||||
|
||||
if (sgitg == 0) {
|
||||
float v = 0;
|
||||
|
||||
if (tpitg.x < nsg) {
|
||||
v = shmem_f32[tpitg.x];
|
||||
}
|
||||
|
||||
total = simd_sum(v);
|
||||
|
||||
if (tpitg.x == 0) {
|
||||
dst[0] = total;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
constant short FC_sum_rows_op [[function_constant(FC_SUM_ROWS + 0)]];
|
||||
|
||||
template <typename T0, typename T>
|
||||
kernel void kernel_sum_rows_impl(
|
||||
constant ggml_metal_kargs_sum_rows & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
#define FC_OP FC_sum_rows_op
|
||||
|
||||
const int i3 = tgpig.z;
|
||||
const int i2 = tgpig.y;
|
||||
const int i1 = tgpig.x;
|
||||
|
||||
threadgroup T0 * shmem_t = (threadgroup T0 *) shmem;
|
||||
|
||||
if (sgitg == 0) {
|
||||
shmem_t[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
device const T0 * src_row = (device const T0 *) (src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
|
||||
device T * dst_row = (device T *) (dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3);
|
||||
|
||||
T0 sumf = T0(0.0f);
|
||||
|
||||
for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
|
||||
sumf += src_row[i0];
|
||||
}
|
||||
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shmem_t[sgitg] = sumf;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
sumf = shmem_t[tiisg];
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
if (tpitg.x == 0) {
|
||||
if (FC_OP == OP_SUM_ROWS_NUM_MEAN) {
|
||||
if (is_same<float4, T0>::value) {
|
||||
dst_row[0] = sum(sumf) / (4*args.ne00);
|
||||
} else {
|
||||
dst_row[0] = sum(sumf) / args.ne00;
|
||||
}
|
||||
} else {
|
||||
dst_row[0] = sum(sumf);
|
||||
}
|
||||
}
|
||||
|
||||
#undef FC_OP
|
||||
}
|
||||
|
||||
typedef decltype(kernel_sum_rows_impl<float, float>) kernel_sum_rows_t;
|
||||
|
||||
template [[host_name("kernel_sum_rows_f32_f32")]] kernel kernel_sum_rows_t kernel_sum_rows_impl<float, float>;
|
||||
template [[host_name("kernel_sum_rows_f32_f32_4")]] kernel kernel_sum_rows_t kernel_sum_rows_impl<float4, float>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_cumsum_blk(
|
||||
constant ggml_metal_kargs_cumsum_blk & args,
|
||||
device const char * src0,
|
||||
device char * tmp,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int ib = tgpig[0]/args.ne01;
|
||||
|
||||
const int i00 = ib*ntg.x;
|
||||
const int i01 = tgpig[0]%args.ne01;
|
||||
const int i02 = tgpig[1];
|
||||
const int i03 = tgpig[2];
|
||||
|
||||
device const float * src0_row = (device const float *) (src0 +
|
||||
args.nb01*i01 +
|
||||
args.nb02*i02 +
|
||||
args.nb03*i03);
|
||||
|
||||
threadgroup float * shmem_f32 = (threadgroup float *) shmem;
|
||||
|
||||
float v = 0.0f;
|
||||
|
||||
if (i00 + tpitg.x < args.ne00) {
|
||||
v = src0_row[i00 + tpitg.x];
|
||||
}
|
||||
|
||||
float s = simd_prefix_inclusive_sum(v);
|
||||
|
||||
if (tiisg == N_SIMDWIDTH - 1) {
|
||||
shmem_f32[sgitg] = s;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (sgitg == 0) {
|
||||
shmem_f32[tiisg] = simd_prefix_exclusive_sum(shmem_f32[tiisg]);
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
s += shmem_f32[sgitg];
|
||||
|
||||
device float * dst_row = (device float *) dst +
|
||||
args.ne00*i01 +
|
||||
args.ne00*args.ne01*i02 +
|
||||
args.ne00*args.ne01*args.ne02*i03;
|
||||
|
||||
if (i00 + tpitg.x < args.ne00) {
|
||||
dst_row[i00 + tpitg.x] = s;
|
||||
}
|
||||
|
||||
if (args.outb && tpitg.x == ntg.x - 1) {
|
||||
device float * tmp_row = (device float *) tmp +
|
||||
args.net0*i01 +
|
||||
args.net0*args.net1*i02 +
|
||||
args.net0*args.net1*args.net2*i03;
|
||||
|
||||
tmp_row[ib] = s;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_cumsum_blk<float>) kernel_cumsum_blk_t;
|
||||
|
||||
template [[host_name("kernel_cumsum_blk_f32")]] kernel kernel_cumsum_blk_t kernel_cumsum_blk<float>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_cumsum_add(
|
||||
constant ggml_metal_kargs_cumsum_add & args,
|
||||
device const char * tmp,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int ib = tgpig[0]/args.ne01;
|
||||
|
||||
if (ib == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i00 = ib*ntg.x;
|
||||
const int i01 = tgpig[0]%args.ne01;
|
||||
const int i02 = tgpig[1];
|
||||
const int i03 = tgpig[2];
|
||||
|
||||
device const float * tmp_row = (device const float *) (tmp +
|
||||
args.nbt1*i01 +
|
||||
args.nbt2*i02 +
|
||||
args.nbt3*i03);
|
||||
|
||||
device float * dst_row = (device float *) dst +
|
||||
args.ne00*i01 +
|
||||
args.ne00*args.ne01*i02 +
|
||||
args.ne00*args.ne01*args.ne02*i03;
|
||||
|
||||
if (i00 + tpitg.x < args.ne00) {
|
||||
dst_row[i00 + tpitg.x] += tmp_row[ib - 1];
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_cumsum_add<float>) kernel_cumsum_add_t;
|
||||
|
||||
template [[host_name("kernel_cumsum_add_f32")]] kernel kernel_cumsum_add_t kernel_cumsum_add<float>;
|
||||
@@ -0,0 +1,318 @@
|
||||
#include "common.h"
|
||||
|
||||
constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]];
|
||||
constant bool FC_rope_is_back [[function_constant(FC_ROPE + 1)]];
|
||||
|
||||
static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
static void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], int i0, float ext_factor, float mscale,
|
||||
thread float * cos_theta, thread float * sin_theta) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
||||
}
|
||||
*cos_theta = cos(theta) * mscale;
|
||||
*sin_theta = sin(theta) * mscale;
|
||||
if (FC_rope_is_back) {
|
||||
*sin_theta *= -1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
static float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
||||
return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base));
|
||||
}
|
||||
|
||||
static void rope_yarn_corr_dims(
|
||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
|
||||
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_norm(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
const float theta_base = (float) pos[i2];
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < args.n_dims) {
|
||||
const int ic = i0/2;
|
||||
|
||||
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[1];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_neox(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
const float theta_base = (float) pos[i2];
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < args.n_dims) {
|
||||
const int ic = i0/2;
|
||||
|
||||
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_multi(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < args.n_dims) {
|
||||
const int ic = i0/2;
|
||||
|
||||
// mrope theta calculations
|
||||
// note: the rest is the same as kernel_rope_neox
|
||||
const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3;
|
||||
const int sec_w01 = args.sect_0 + args.sect_1; // end of section 1
|
||||
const int sec_w012 = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float theta_base;
|
||||
if (FC_rope_is_imrope) {
|
||||
if (sector % 3 == 1 && sector < 3 * args.sect_1) { // h
|
||||
theta_base = (float) pos[i2 + args.ne02 * 1];
|
||||
} else if (sector % 3 == 2 && sector < 3 * args.sect_2) { // w
|
||||
theta_base = (float) pos[i2 + args.ne02 * 2];
|
||||
} else if (sector % 3 == 0 && sector < 3 * args.sect_0) { // t
|
||||
theta_base = (float) pos[i2 + args.ne02 * 0];
|
||||
} else { // e
|
||||
theta_base = (float) pos[i2 + args.ne02 * 3];
|
||||
}
|
||||
} else {
|
||||
if (sector < args.sect_0) {
|
||||
theta_base = (float) pos[i2];
|
||||
} else if (sector < sec_w01) {
|
||||
theta_base = (float) pos[i2 + args.ne02 * 1];
|
||||
} else if (sector < sec_w012) {
|
||||
theta_base = (float) pos[i2 + args.ne02 * 2];
|
||||
} else {
|
||||
theta_base = (float) pos[i2 + args.ne02 * 3];
|
||||
}
|
||||
}
|
||||
// end of mrope
|
||||
|
||||
const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
|
||||
|
||||
const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims/2];
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_rope_vision(
|
||||
constant ggml_metal_kargs_rope & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
ushort tiitg[[thread_index_in_threadgroup]],
|
||||
ushort3 tptg [[threads_per_threadgroup]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]]) {
|
||||
const int i3 = tgpig[2];
|
||||
const int i2 = tgpig[1];
|
||||
const int i1 = tgpig[0];
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
|
||||
|
||||
device const int32_t * pos = (device const int32_t *) src1;
|
||||
|
||||
const float inv_ndims = -1.f/args.n_dims;
|
||||
|
||||
float cos_theta;
|
||||
float sin_theta;
|
||||
|
||||
for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
|
||||
if (i0 < 2*args.n_dims) { // different from kernel_rope_multi
|
||||
const int ic = i0/2;
|
||||
|
||||
// mrope theta calculations (only support 2 dimensions)
|
||||
const int sect_dims = args.sect_0 + args.sect_1;
|
||||
const int sector = ic % sect_dims;
|
||||
|
||||
float p;
|
||||
float theta_base;
|
||||
if (sector < args.sect_1) {
|
||||
p = (float) sector;
|
||||
theta_base = (float) pos[i2];
|
||||
} else {
|
||||
p = (float) sector - args.sect_0;
|
||||
theta_base = (float) pos[i2 + args.ne02];
|
||||
}
|
||||
|
||||
const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
|
||||
// end of mrope
|
||||
|
||||
const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
|
||||
|
||||
rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
|
||||
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + ic*args.nb0);
|
||||
|
||||
const float x0 = src[0];
|
||||
const float x1 = src[args.n_dims]; // different from kernel_rope_multi
|
||||
|
||||
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
||||
dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi
|
||||
} else {
|
||||
device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
|
||||
device T * dst_data = (device T *)( dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_data[0] = src[0];
|
||||
dst_data[1] = src[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
|
||||
typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
|
||||
typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
|
||||
typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
|
||||
|
||||
template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
|
||||
template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
|
||||
|
||||
template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
|
||||
template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
|
||||
|
||||
template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
|
||||
template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
|
||||
|
||||
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
|
||||
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
|
||||
@@ -0,0 +1,223 @@
|
||||
#include "common.h"
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_soft_max(
|
||||
constant ggml_metal_kargs_soft_max & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
threadgroup float * buf [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
const int32_t i01 = tgpig.x;
|
||||
|
||||
const int32_t i13 = i03%args.ne13;
|
||||
const int32_t i12 = i02%args.ne12;
|
||||
const int32_t i11 = i01;
|
||||
|
||||
device const float * psrc0 = (device const float *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
|
||||
device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr;
|
||||
device const float * psrc2 = src2 != src0 ? (device const float *) (src2) : nullptr;
|
||||
device float * pdst = (device float *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (args.max_bias > 0.0f) {
|
||||
const int32_t h = i02;
|
||||
|
||||
const float base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, exp);
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float lmax = psrc2 ? psrc2[i02] : -INFINITY;
|
||||
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
|
||||
lmax = MAX(lmax, psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f));
|
||||
}
|
||||
|
||||
// find the max value in the block
|
||||
float max_val = simd_max(lmax);
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = -INFINITY;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
buf[sgitg] = max_val;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
max_val = buf[tiisg];
|
||||
max_val = simd_max(max_val);
|
||||
}
|
||||
|
||||
// parallel sum
|
||||
float lsum = 0.0f;
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
|
||||
const float exp_psrc0 = exp((psrc0[i00]*args.scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
|
||||
lsum += exp_psrc0;
|
||||
pdst[i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
// This barrier fixes a failing test
|
||||
// ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
|
||||
threadgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
float sum = simd_sum(lsum);
|
||||
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
buf[sgitg] = sum;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
sum = buf[tiisg];
|
||||
sum = simd_sum(sum);
|
||||
}
|
||||
|
||||
if (psrc2) {
|
||||
sum += exp(psrc2[i02] - max_val);
|
||||
}
|
||||
|
||||
const float inv_sum = 1.0f/sum;
|
||||
|
||||
for (int i00 = tpitg.x; i00 < args.ne00; i00 += tptg.x) {
|
||||
pdst[i00] *= inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_soft_max_4(
|
||||
constant ggml_metal_kargs_soft_max & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device const char * src2,
|
||||
device char * dst,
|
||||
threadgroup float * buf [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint3 tptg[[threads_per_threadgroup]]) {
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
const int32_t i01 = tgpig.x;
|
||||
|
||||
const int32_t i13 = i03%args.ne13;
|
||||
const int32_t i12 = i02%args.ne12;
|
||||
const int32_t i11 = i01;
|
||||
|
||||
device const float4 * psrc4 = (device const float4 *) (src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03);
|
||||
device const T * pmask = src1 != src0 ? (device const T * ) (src1 + i11*args.nb11 + i12*args.nb12 + i13*args.nb13) : nullptr;
|
||||
device const float * psrc2 = src2 != src0 ? (device const float * ) (src2) : nullptr;
|
||||
device float4 * pdst4 = (device float4 *) (dst + i01*args.nb1 + i02*args.nb2 + i03*args.nb3);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
if (args.max_bias > 0.0f) {
|
||||
const int32_t h = i02;
|
||||
|
||||
const float base = h < args.n_head_log2 ? args.m0 : args.m1;
|
||||
const int exp = h < args.n_head_log2 ? h + 1 : 2*(h - args.n_head_log2) + 1;
|
||||
|
||||
slope = pow(base, exp);
|
||||
}
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
|
||||
|
||||
for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
|
||||
}
|
||||
|
||||
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
||||
|
||||
float max_val = simd_max(lmax);
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = -INFINITY;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
buf[sgitg] = max_val;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
max_val = buf[tiisg];
|
||||
max_val = simd_max(max_val);
|
||||
}
|
||||
|
||||
// parallel sum
|
||||
float4 lsum4 = 0.0f;
|
||||
for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*args.scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
|
||||
lsum4 += exp_psrc4;
|
||||
pdst4[i00] = exp_psrc4;
|
||||
}
|
||||
|
||||
const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
|
||||
|
||||
// This barrier fixes a failing test
|
||||
// ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
|
||||
threadgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
float sum = simd_sum(lsum);
|
||||
|
||||
if (tptg.x > N_SIMDWIDTH) {
|
||||
if (sgitg == 0) {
|
||||
buf[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tiisg == 0) {
|
||||
buf[sgitg] = sum;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
sum = buf[tiisg];
|
||||
sum = simd_sum(sum);
|
||||
}
|
||||
|
||||
if (psrc2) {
|
||||
sum += exp(psrc2[i02] - max_val);
|
||||
}
|
||||
|
||||
const float inv_sum = 1.0f/sum;
|
||||
|
||||
for (int i00 = tpitg.x; i00 < args.ne00/4; i00 += tptg.x) {
|
||||
pdst4[i00] *= inv_sum;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_soft_max<float>) kernel_soft_max_t;
|
||||
typedef decltype(kernel_soft_max_4<float4>) kernel_soft_max_4_t;
|
||||
|
||||
template [[host_name("kernel_soft_max_f16")]] kernel kernel_soft_max_t kernel_soft_max<half>;
|
||||
template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kernel_soft_max<float>;
|
||||
template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<half4>;
|
||||
template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<float4>;
|
||||
@@ -0,0 +1,75 @@
|
||||
#include "common.h"
|
||||
|
||||
constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]];
|
||||
constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]];
|
||||
constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]];
|
||||
|
||||
kernel void kernel_solve_tri_f32(
|
||||
constant ggml_metal_kargs_solve_tri & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
threadgroup char * shmem [[threadgroup(0)]],
|
||||
ushort3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
constexpr short NW = N_SIMDWIDTH;
|
||||
|
||||
const short NSG = FC_solve_tri_nsg;
|
||||
const short N = FC_solve_tri_n;
|
||||
const short K = FC_solve_tri_k;
|
||||
const short NP = PAD2(N, NW);
|
||||
|
||||
const int32_t i03 = tgpig.z;
|
||||
const int32_t i02 = tgpig.y;
|
||||
const int32_t i01 = tgpig.x*NSG + sgitg;
|
||||
|
||||
threadgroup float * sh0 = (threadgroup float *) shmem;
|
||||
|
||||
device const float * src0_ptr = (device const float *)(src0 + i02 * args.nb02 + i03 * args.nb03) + sgitg*N;
|
||||
device const float * src1_ptr = (device const float *)(src1 + i02 * args.nb12 + i03 * args.nb13) + i01;
|
||||
device float * dst_ptr = (device float *)(dst + i02 * args.nb2 + i03 * args.nb3) + i01;
|
||||
|
||||
for (short rr = 0; rr < N; rr += NSG) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
{
|
||||
threadgroup float * sh0_cur = sh0 + sgitg*NP;
|
||||
|
||||
for (short t = 0; t*NW < N; ++t) {
|
||||
const short idx = t*NW + tiisg;
|
||||
sh0_cur[idx] = src0_ptr[idx];
|
||||
}
|
||||
|
||||
src0_ptr += NSG*N;
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (i01 >= args.ne10) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (short ir = 0; ir < NSG && rr + ir < N; ++ir) {
|
||||
const short r = rr + ir;
|
||||
|
||||
threadgroup float * sh0_cur = sh0 + ir*NP;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
for (short t = 0; t*NW < r; ++t) {
|
||||
const short idx = t*NW + tiisg;
|
||||
sum += sh0_cur[idx] * dst_ptr[idx*K] * (idx < r);
|
||||
}
|
||||
|
||||
sum = simd_sum(sum);
|
||||
|
||||
if (tiisg == 0) {
|
||||
const float diag = sh0_cur[r];
|
||||
|
||||
dst_ptr[r*K] = (src1_ptr[r*K] - sum) / diag;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,279 @@
|
||||
#include "common.h"
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_conv_f32
|
||||
kernel void kernel_ssm_conv_f32_f32(
|
||||
constant ggml_metal_kargs_ssm_conv & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i3 = tgpig.z;
|
||||
|
||||
const int64_t nc = args.ne10;
|
||||
//const int64_t ncs = args.ne00;
|
||||
//const int64_t nr = args.ne01;
|
||||
//const int64_t n_t = args.ne1;
|
||||
//const int64_t n_s = args.ne2;
|
||||
|
||||
device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
|
||||
device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11);
|
||||
device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
sumf += s[i0] * c[i0];
|
||||
}
|
||||
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
kernel void kernel_ssm_conv_f32_f32_4(
|
||||
constant ggml_metal_kargs_ssm_conv & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i3 = tgpig.z;
|
||||
|
||||
const int64_t nc = args.ne10;
|
||||
//const int64_t ncs = args.ne00;
|
||||
//const int64_t nr = args.ne01;
|
||||
//const int64_t n_t = args.ne1;
|
||||
//const int64_t n_s = args.ne2;
|
||||
|
||||
device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
|
||||
device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11);
|
||||
device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int64_t i0 = 0; i0 < nc/4; ++i0) {
|
||||
sumf += dot(s[i0], c[i0]);
|
||||
}
|
||||
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
constant short FC_ssm_conv_bs [[function_constant(FC_SSM_CONV + 0)]];
|
||||
|
||||
// Batched version: each threadgroup processes multiple tokens for better efficiency
|
||||
// Thread layout: each thread handles one token, threadgroup covers BATCH_SIZE tokens
|
||||
kernel void kernel_ssm_conv_f32_f32_batched(
|
||||
constant ggml_metal_kargs_ssm_conv & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
// tgpig.x = row index (ir)
|
||||
// tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
|
||||
// tgpig.z = sequence index (i3)
|
||||
// tpitg.x = thread within batch (0..BATCH_SIZE-1)
|
||||
const short BATCH_SIZE = FC_ssm_conv_bs;
|
||||
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i2_base = tgpig.y * BATCH_SIZE;
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2_off = tpitg.x;
|
||||
const int64_t i2 = i2_base + i2_off;
|
||||
|
||||
const int64_t nc = args.ne10; // conv kernel size (typically 4)
|
||||
const int64_t n_t = args.ne1; // number of tokens
|
||||
|
||||
// Bounds check for partial batches at the end
|
||||
if (i2 >= n_t) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load conv weights (shared across all tokens for this row)
|
||||
device const float * c = (device const float *) ((device const char *) src1 + ir*args.nb11);
|
||||
|
||||
// Load source for this specific token
|
||||
device const float * s = (device const float *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
|
||||
|
||||
// Output location for this token
|
||||
device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
for (int64_t i0 = 0; i0 < nc; ++i0) {
|
||||
sumf += s[i0] * c[i0];
|
||||
}
|
||||
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
kernel void kernel_ssm_conv_f32_f32_batched_4(
|
||||
constant ggml_metal_kargs_ssm_conv & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device float * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
// tgpig.x = row index (ir)
|
||||
// tgpig.y = batch of tokens (i2_base / BATCH_SIZE)
|
||||
// tgpig.z = sequence index (i3)
|
||||
// tpitg.x = thread within batch (0..BATCH_SIZE-1)
|
||||
const short BATCH_SIZE = FC_ssm_conv_bs;
|
||||
|
||||
const int64_t ir = tgpig.x;
|
||||
const int64_t i2_base = tgpig.y * BATCH_SIZE;
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2_off = tpitg.x;
|
||||
const int64_t i2 = i2_base + i2_off;
|
||||
|
||||
const int64_t nc = args.ne10; // conv kernel size (typically 4)
|
||||
const int64_t n_t = args.ne1; // number of tokens
|
||||
|
||||
// Bounds check for partial batches at the end
|
||||
if (i2 >= n_t) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Load conv weights (shared across all tokens for this row)
|
||||
device const float4 * c = (device const float4 *) ((device const char *) src1 + ir*args.nb11);
|
||||
|
||||
// Load source for this specific token
|
||||
device const float4 * s = (device const float4 *) ((device const char *) src0 + ir*args.nb01 + i2*args.nb00 + i3*args.nb02);
|
||||
|
||||
// Output location for this token
|
||||
device float * x = (device float *) ((device char *) dst + ir*args.nb0 + i2*args.nb1 + i3*args.nb2);
|
||||
|
||||
float sumf = 0.0f;
|
||||
for (int64_t i0 = 0; i0 < nc/4; ++i0) {
|
||||
sumf += dot(s[i0], c[i0]);
|
||||
}
|
||||
|
||||
x[0] = sumf;
|
||||
}
|
||||
|
||||
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32, Mamba-2 part
|
||||
// Optimized version: reduces redundant memory loads by having one thread load shared values
|
||||
kernel void kernel_ssm_scan_f32(
|
||||
constant ggml_metal_kargs_ssm_scan & args,
|
||||
device const void * src0,
|
||||
device const void * src1,
|
||||
device const void * src2,
|
||||
device const void * src3,
|
||||
device const void * src4,
|
||||
device const void * src5,
|
||||
device const void * src6,
|
||||
device float * dst,
|
||||
threadgroup float * shared [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort sgptg[[simdgroups_per_threadgroup]],
|
||||
uint3 tgpg[[threadgroups_per_grid]]) {
|
||||
constexpr short NW = N_SIMDWIDTH;
|
||||
|
||||
// Shared memory layout:
|
||||
// [0..sgptg*NW-1]: partial sums for reduction (existing)
|
||||
// [sgptg*NW..sgptg*NW+sgptg-1]: pre-computed x_dt values for each token in batch
|
||||
// [sgptg*NW+sgptg..sgptg*NW+2*sgptg-1]: pre-computed dA values for each token in batch
|
||||
threadgroup float * shared_sums = shared;
|
||||
threadgroup float * shared_x_dt = shared + sgptg * NW;
|
||||
threadgroup float * shared_dA = shared + sgptg * NW + sgptg;
|
||||
|
||||
shared_sums[tpitg.x] = 0.0f;
|
||||
|
||||
const int32_t i0 = tpitg.x;
|
||||
const int32_t i1 = tgpig.x;
|
||||
const int32_t ir = tgpig.y; // current head
|
||||
const int32_t i3 = tgpig.z; // current seq
|
||||
|
||||
const int32_t nc = args.d_state;
|
||||
const int32_t nr = args.d_inner;
|
||||
const int32_t nh = args.n_head;
|
||||
const int32_t ng = args.n_group;
|
||||
const int32_t n_t = args.n_seq_tokens;
|
||||
|
||||
const int32_t s_off = args.s_off;
|
||||
|
||||
device const int32_t * ids = (device const int32_t *) src6;
|
||||
|
||||
device const float * s0_buff = (device const float *) ((device const char *) src0 + ir*args.nb02 + ids[i3]*args.nb03);
|
||||
device float * s_buff = (device float *) ((device char *) dst + ir*args.nb02 + i3*args.nb03 + s_off);
|
||||
|
||||
const int32_t i = i0 + i1*nc;
|
||||
const int32_t g = ir / (nh / ng); // repeat_interleave
|
||||
|
||||
float s0 = s0_buff[i];
|
||||
float s = 0.0f;
|
||||
|
||||
device const float * A = (device const float *) ((device const char *) src3 + ir*args.nb31); // {ne30, nh}
|
||||
|
||||
const float A0 = A[i0%args.ne30];
|
||||
|
||||
device const float * x = (device const float *)((device const char *) src1 + i1*args.nb10 + ir*args.nb11 + i3*args.nb13); // {dim, nh, nt, ns}
|
||||
device const float * dt = (device const float *)((device const char *) src2 + ir*args.nb20 + i3*args.nb22); // {nh, nt, ns}
|
||||
device const float * B = (device const float *)((device const char *) src4 + g*args.nb41 + i3*args.nb43); // {d_state, ng, nt, ns}
|
||||
device const float * C = (device const float *)((device const char *) src5 + g*args.nb51 + i3*args.nb53); // {d_state, ng, nt, ns}
|
||||
|
||||
device float * y = dst + (i1 + ir*(nr) + i3*(n_t*nh*nr)); // {dim, nh, nt, ns}
|
||||
|
||||
for (int i2 = 0; i2 < n_t; i2 += sgptg) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// Pre-compute x_dt and dA for this batch of tokens
|
||||
// Only first sgptg threads do the loads and expensive math
|
||||
if (i0 < sgptg && i2 + i0 < n_t) {
|
||||
// ns12 and ns21 are element strides (nb12/nb10, nb21/nb20)
|
||||
device const float * x_t = x + i0 * args.ns12;
|
||||
device const float * dt_t = dt + i0 * args.ns21;
|
||||
|
||||
const float dt0 = dt_t[0];
|
||||
const float dtsp = dt0 <= 20.0f ? log(1.0f + exp(dt0)) : dt0;
|
||||
shared_x_dt[i0] = x_t[0] * dtsp;
|
||||
shared_dA[i0] = dtsp; // Store dtsp, compute exp(dtsp * A0) per-thread since A0 varies
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
for (int t = 0; t < sgptg && i2 + t < n_t; t++) {
|
||||
const float x_dt = shared_x_dt[t];
|
||||
const float dA = exp(shared_dA[t] * A0);
|
||||
|
||||
s = (s0 * dA) + (B[i0] * x_dt);
|
||||
|
||||
const float sumf = simd_sum(s * C[i0]);
|
||||
|
||||
if (tiisg == 0) {
|
||||
shared_sums[t*NW + sgitg] = sumf;
|
||||
}
|
||||
|
||||
// recurse
|
||||
s0 = s;
|
||||
|
||||
B += args.ns42;
|
||||
C += args.ns52;
|
||||
}
|
||||
|
||||
// Advance pointers for next batch
|
||||
x += sgptg * args.ns12;
|
||||
dt += sgptg * args.ns21;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const float sumf = simd_sum(shared_sums[sgitg*NW + tiisg]);
|
||||
|
||||
if (tiisg == 0 && i2 + sgitg < n_t) {
|
||||
y[sgitg*nh*nr] = sumf;
|
||||
}
|
||||
|
||||
y += sgptg*nh*nr;
|
||||
}
|
||||
|
||||
s_buff[i] = s;
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
#include "common.h"
|
||||
|
||||
template<uint32_t ttype>
|
||||
bool _ggml_vec_tri_cmp(const int i, const int r);
|
||||
|
||||
template<>
|
||||
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_LOWER */ 3>(const int i, const int r) {
|
||||
return i < r;
|
||||
}
|
||||
|
||||
template<>
|
||||
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_LOWER_DIAG */ 2>(const int i, const int r) {
|
||||
return i <= r;
|
||||
}
|
||||
|
||||
template<>
|
||||
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_UPPER */ 1>(const int i, const int r) {
|
||||
return i > r;
|
||||
}
|
||||
|
||||
template<>
|
||||
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_UPPER_DIAG */ 0>(const int i, const int r) {
|
||||
return i >= r;
|
||||
}
|
||||
|
||||
template<typename T, int ttype>
|
||||
kernel void kernel_tri(
|
||||
constant ggml_metal_kargs_tri & args,
|
||||
device const char * src0,
|
||||
device const char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i3 = tgpig.z;
|
||||
const int i2 = tgpig.y;
|
||||
const int i1 = tgpig.x;
|
||||
|
||||
if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
device const T * src_row = (device const T *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
|
||||
device T * dst_row = (device T *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3);
|
||||
|
||||
// Each thread is a single element of the row if ne00 < max threads per
|
||||
// threadgroup, so this will loop once for each index that this thread is
|
||||
// responsible for
|
||||
for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
|
||||
// Use the comparison as a mask for branchless
|
||||
dst_row[i0] = static_cast<T>(_ggml_vec_tri_cmp<ttype>(i0, i1)) * src_row[i0];
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_tri<float, 0>) kernel_tri_t;
|
||||
|
||||
template [[host_name("kernel_tri_f32_0")]] kernel kernel_tri_t kernel_tri<float, 0>;
|
||||
template [[host_name("kernel_tri_f32_1")]] kernel kernel_tri_t kernel_tri<float, 1>;
|
||||
template [[host_name("kernel_tri_f32_2")]] kernel kernel_tri_t kernel_tri<float, 2>;
|
||||
template [[host_name("kernel_tri_f32_3")]] kernel kernel_tri_t kernel_tri<float, 3>;
|
||||
template [[host_name("kernel_tri_f16_0")]] kernel kernel_tri_t kernel_tri<half, 0>;
|
||||
template [[host_name("kernel_tri_f16_1")]] kernel kernel_tri_t kernel_tri<half, 1>;
|
||||
template [[host_name("kernel_tri_f16_2")]] kernel kernel_tri_t kernel_tri<half, 2>;
|
||||
template [[host_name("kernel_tri_f16_3")]] kernel kernel_tri_t kernel_tri<half, 3>;
|
||||
#if defined(GGML_METAL_HAS_BF16)
|
||||
template [[host_name("kernel_tri_bf16_0")]] kernel kernel_tri_t kernel_tri<bfloat, 0>;
|
||||
template [[host_name("kernel_tri_bf16_1")]] kernel kernel_tri_t kernel_tri<bfloat, 1>;
|
||||
template [[host_name("kernel_tri_bf16_2")]] kernel kernel_tri_t kernel_tri<bfloat, 2>;
|
||||
template [[host_name("kernel_tri_bf16_3")]] kernel kernel_tri_t kernel_tri<bfloat, 3>;
|
||||
#endif
|
||||
@@ -0,0 +1,360 @@
|
||||
#include "common.h"
|
||||
|
||||
constant short FC_unary_op [[function_constant(FC_UNARY + 0)]];
|
||||
constant bool FC_unary_cnt[[function_constant(FC_UNARY + 1)]];
|
||||
|
||||
template <typename T0, typename T, typename TC>
|
||||
kernel void kernel_unary_impl(
|
||||
constant ggml_metal_kargs_unary & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
#define FC_OP FC_unary_op
|
||||
#define FC_CNT FC_unary_cnt
|
||||
|
||||
device const T0 * src0_ptr;
|
||||
device T * dst_ptr;
|
||||
|
||||
int i0;
|
||||
|
||||
if (FC_CNT) {
|
||||
i0 = tgpig.x;
|
||||
|
||||
src0_ptr = (device const T0 *) (src0);
|
||||
dst_ptr = (device T *) (dst);
|
||||
} else {
|
||||
const int i03 = tgpig.z;
|
||||
const int i02 = tgpig.y;
|
||||
const int k0 = tgpig.x/args.ne01;
|
||||
const int i01 = tgpig.x - k0*args.ne01;
|
||||
|
||||
i0 = k0*ntg.x + tpitg.x;
|
||||
|
||||
src0_ptr = (device const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01);
|
||||
dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 );
|
||||
}
|
||||
|
||||
{
|
||||
//threadgroup_barrier(mem_flags::mem_none);
|
||||
|
||||
if (!FC_CNT) {
|
||||
if (i0 >= args.ne0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
const TC x = (TC) src0_ptr[i0];
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SCALE) {
|
||||
dst_ptr[i0] = (T) (args.scale * x + args.bias);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_FILL) {
|
||||
dst_ptr[i0] = (T) args.val;
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_CLAMP) {
|
||||
dst_ptr[i0] = (T) clamp(x, args.min, args.max);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SQR) {
|
||||
dst_ptr[i0] = (T) (x * x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SQRT) {
|
||||
dst_ptr[i0] = (T) sqrt(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SIN) {
|
||||
dst_ptr[i0] = (T) sin(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_COS) {
|
||||
dst_ptr[i0] = (T) cos(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_LOG) {
|
||||
dst_ptr[i0] = (T) log(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_LEAKY_RELU) {
|
||||
dst_ptr[i0] = (T) (TC(x > 0)*x + TC(x <= 0)*(x * args.slope));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_TANH) {
|
||||
dst_ptr[i0] = (T) precise::tanh(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_RELU) {
|
||||
dst_ptr[i0] = (T) fmax(0, x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SIGMOID) {
|
||||
dst_ptr[i0] = (T) (1 / (1 + exp(-x)));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_GELU) {
|
||||
dst_ptr[i0] = (T) (0.5*x*(1 + precise::tanh(SQRT_2_OVER_PI*x*(1 + GELU_COEF_A*x*x))));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_GELU_ERF) {
|
||||
dst_ptr[i0] = (T) (0.5*x*(1 + erf_approx(SQRT_2_INV*x)));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_GELU_QUICK) {
|
||||
dst_ptr[i0] = (T) (x * (1/(1 + exp(GELU_QUICK_COEF*x))));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SILU) {
|
||||
dst_ptr[i0] = (T) (x / (1 + exp(-x)));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_ELU) {
|
||||
dst_ptr[i0] = (T) elu_approx(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_NEG) {
|
||||
dst_ptr[i0] = (T) -x;
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_ABS) {
|
||||
dst_ptr[i0] = (T) fabs(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SGN) {
|
||||
dst_ptr[i0] = T(x > 0) - T(x < 0);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_STEP) {
|
||||
dst_ptr[i0] = T(x > 0);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_HARDSWISH) {
|
||||
dst_ptr[i0] = (T) (x * fmax(0, fmin(1, x/6 + 0.5)));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_HARDSIGMOID) {
|
||||
dst_ptr[i0] = (T) fmax(0, fmin(1, x/6 + 0.5));
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_EXP) {
|
||||
dst_ptr[i0] = (T) exp(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_SOFTPLUS) {
|
||||
dst_ptr[i0] = (T) select(log(1 + exp(x)), x, x > 20);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_EXPM1) {
|
||||
// TODO: precise implementation
|
||||
dst_ptr[i0] = (T) (exp(x) - 1);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_FLOOR) {
|
||||
dst_ptr[i0] = (T) floor(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_CEIL) {
|
||||
dst_ptr[i0] = (T) ceil(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_ROUND) {
|
||||
dst_ptr[i0] = (T) round(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_TRUNC) {
|
||||
dst_ptr[i0] = (T) trunc(x);
|
||||
}
|
||||
|
||||
if (FC_OP == OP_UNARY_NUM_XIELU) {
|
||||
const TC xi = x;
|
||||
const TC gate = TC(xi > TC(0.0f));
|
||||
const TC clamped = fmin(xi, TC(args.val));
|
||||
const TC y_pos = TC(args.scale) * xi * xi + TC(args.bias) * xi;
|
||||
const TC y_neg = (exp(clamped) - TC(1.0f) - xi) * TC(args.slope) + TC(args.bias) * xi;
|
||||
dst_ptr[i0] = (T) (gate * y_pos + (TC(1.0f) - gate) * y_neg);
|
||||
}
|
||||
}
|
||||
|
||||
#undef FC_OP
|
||||
#undef FC_CNT
|
||||
}
|
||||
|
||||
typedef decltype(kernel_unary_impl<float, float, float>) kernel_unary_t;
|
||||
|
||||
template [[host_name("kernel_unary_f32_f32")]] kernel kernel_unary_t kernel_unary_impl<float, float, float>;
|
||||
template [[host_name("kernel_unary_f32_f32_4")]] kernel kernel_unary_t kernel_unary_impl<float4, float4, float4>;
|
||||
template [[host_name("kernel_unary_f16_f16")]] kernel kernel_unary_t kernel_unary_impl<half, half, float>;
|
||||
template [[host_name("kernel_unary_f16_f16_4")]] kernel kernel_unary_t kernel_unary_impl<half4, half4, float4>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_reglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
dst_row[i0] = (T)(x0*x1*(x0 > 0.0f));
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_reglu<float>) kernel_reglu_t;
|
||||
|
||||
template [[host_name("kernel_reglu_f32")]] kernel kernel_reglu_t kernel_reglu<float>;
|
||||
template [[host_name("kernel_reglu_f16")]] kernel kernel_reglu_t kernel_reglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0)));
|
||||
|
||||
dst_row[i0] = (T)(gelu*x1);
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_geglu<float>) kernel_geglu_t;
|
||||
|
||||
template [[host_name("kernel_geglu_f32")]] kernel kernel_geglu_t kernel_geglu<float>;
|
||||
template [[host_name("kernel_geglu_f16")]] kernel kernel_geglu_t kernel_geglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_swiglu(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float silu = x0 / (1.0f + exp(-x0));
|
||||
|
||||
dst_row[i0] = (T)(silu*x1);
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_swiglu<float>) kernel_swiglu_t;
|
||||
|
||||
template [[host_name("kernel_swiglu_f32")]] kernel kernel_swiglu_t kernel_swiglu<float>;
|
||||
template [[host_name("kernel_swiglu_f16")]] kernel kernel_swiglu_t kernel_swiglu<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_swiglu_oai(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
float x0 = src0_row[i0];
|
||||
float x1 = src1_row[i0];
|
||||
|
||||
x0 = min(x0, args.limit);
|
||||
x1 = max(min(x1, args.limit), -args.limit);
|
||||
|
||||
float out_glu = x0 / (1.0f + exp(-x0 * args.alpha));
|
||||
out_glu = out_glu * (1.0f + x1);
|
||||
|
||||
dst_row[i0] = (T)out_glu;
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_swiglu_oai<float>) kernel_swiglu_oai_t;
|
||||
|
||||
template [[host_name("kernel_swiglu_oai_f32")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai<float>;
|
||||
template [[host_name("kernel_swiglu_oai_f16")]] kernel kernel_swiglu_oai_t kernel_swiglu_oai<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu_erf(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_erf = 0.5f*x0*(1.0f+erf_approx<float>(x0*SQRT_2_INV));
|
||||
|
||||
dst_row[i0] = (T)(gelu_erf*x1);
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_geglu_erf<float>) kernel_geglu_erf_t;
|
||||
|
||||
template [[host_name("kernel_geglu_erf_f32")]] kernel kernel_geglu_erf_t kernel_geglu_erf<float>;
|
||||
template [[host_name("kernel_geglu_erf_f16")]] kernel kernel_geglu_erf_t kernel_geglu_erf<half>;
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_geglu_quick(
|
||||
constant ggml_metal_kargs_glu & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const T * src0_row = (device const T *) ((device const char *) src0 + tgpig*args.nb01) + args.i00;
|
||||
device const T * src1_row = (device const T *) ((device const char *) src1 + tgpig*args.nb11) + args.i10;
|
||||
device T * dst_row = (device T *) ((device char *) dst + tgpig*args.nb1);
|
||||
|
||||
for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) {
|
||||
const float x0 = src0_row[i0];
|
||||
const float x1 = src1_row[i0];
|
||||
|
||||
const float gelu_quick = x0*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x0)));
|
||||
|
||||
dst_row[i0] = (T)(gelu_quick*x1);
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_geglu_quick<float>) kernel_geglu_quick_t;
|
||||
|
||||
template [[host_name("kernel_geglu_quick_f32")]] kernel kernel_geglu_quick_t kernel_geglu_quick<float>;
|
||||
template [[host_name("kernel_geglu_quick_f16")]] kernel kernel_geglu_quick_t kernel_geglu_quick<half>;
|
||||
@@ -0,0 +1,179 @@
|
||||
#include "common.h"
|
||||
|
||||
constant bool FC_upscale_aa [[function_constant(FC_UPSCALE + 0)]];
|
||||
|
||||
kernel void kernel_upscale_nearest_f32(
|
||||
constant ggml_metal_kargs_upscale & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3/args.sf3;
|
||||
const int64_t i02 = i2/args.sf2;
|
||||
const int64_t i01 = i1/args.sf1;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const int64_t i00 = i0/args.sf0;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
|
||||
device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
|
||||
|
||||
dst_ptr[0] = src0_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
static inline float bilinear_tri(float x) {
|
||||
return MAX(0.0f, 1.0f - fabs(x));
|
||||
}
|
||||
|
||||
kernel void kernel_upscale_bilinear_f32(
|
||||
constant ggml_metal_kargs_upscale & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3 / args.sf3;
|
||||
const int64_t i02 = i2 / args.sf2;
|
||||
|
||||
const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
|
||||
const int64_t i01 = MAX(0, MIN(args.ne01 - 1, (int64_t)floor(f01)));
|
||||
const int64_t i01p = MAX(0, MIN(args.ne01 - 1, i01 + 1));
|
||||
const float fd1 = MAX(0.0f, MIN(1.0f, f01 - (float)i01));
|
||||
|
||||
src0 += i03*args.nb03 + i02*args.nb02;
|
||||
|
||||
device float * dst_ptr = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1);
|
||||
|
||||
if (FC_upscale_aa) {
|
||||
const float support0 = MAX(1.0f, 1.0f / args.sf0);
|
||||
const float invscale0 = 1.0f / support0;
|
||||
const float support1 = MAX(1.0f, 1.0f / args.sf1);
|
||||
const float invscale1 = 1.0f / support1;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
|
||||
|
||||
int64_t x_min = MAX((int64_t)0, (int64_t)floor(f00 - support0 + args.poffs));
|
||||
int64_t x_max = MIN(args.ne00, (int64_t)ceil (f00 + support0 + args.poffs));
|
||||
|
||||
int64_t y_min = MAX((int64_t)0, (int64_t)floor(f01 - support1 + args.poffs));
|
||||
int64_t y_max = MIN(args.ne01, (int64_t)ceil (f01 + support1 + args.poffs));
|
||||
|
||||
float sum = 0.0f;
|
||||
float wsum = 0.0f;
|
||||
|
||||
for (int64_t sy = y_min; sy < y_max; ++sy) {
|
||||
const float wy = MAX(0.0f, 1.0f - fabs((float)sy - f01) * invscale1);
|
||||
for (int64_t sx = x_min; sx < x_max; ++sx) {
|
||||
const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0);
|
||||
const float w = wx * wy;
|
||||
device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00);
|
||||
sum += (*src_ptr) * w;
|
||||
wsum += w;
|
||||
}
|
||||
}
|
||||
|
||||
const float v = (wsum > 0.0f) ? (sum / wsum) : 0.0f;
|
||||
dst_ptr[i0] = v;
|
||||
}
|
||||
} else {
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
|
||||
const int64_t i00 = MAX(0, MIN(args.ne00 - 1, (int64_t)floor(f00)));
|
||||
const int64_t i00p = MAX(0, MIN(args.ne00 - 1, i00 + 1));
|
||||
const float fd0 = MAX(0.0f, MIN(1.0f, f00 - (float)i00));
|
||||
|
||||
device const float * src00 = (device const float *)(src0 + i01*args.nb01 + i00*args.nb00);
|
||||
device const float * src10 = (device const float *)(src0 + i01*args.nb01 + i00p*args.nb00);
|
||||
device const float * src01 = (device const float *)(src0 + i01p*args.nb01 + i00*args.nb00);
|
||||
device const float * src11 = (device const float *)(src0 + i01p*args.nb01 + i00p*args.nb00);
|
||||
|
||||
const float v =
|
||||
(*src00) * (1.0f - fd0) * (1.0f - fd1) +
|
||||
(*src10) * fd0 * (1.0f - fd1) +
|
||||
(*src01) * (1.0f - fd0) * fd1 +
|
||||
(*src11) * fd0 * fd1;
|
||||
|
||||
dst_ptr[i0] = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline float bicubic_weight1(float x) {
|
||||
const float a = -0.75f;
|
||||
return ((a + 2) * x - (a + 3)) * x * x + 1;
|
||||
}
|
||||
|
||||
static inline float bicubic_weight2(float x) {
|
||||
const float a = -0.75f;
|
||||
return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a;
|
||||
}
|
||||
|
||||
kernel void kernel_upscale_bicubic_f32(
|
||||
constant ggml_metal_kargs_upscale & args,
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3 / args.sf3;
|
||||
const int64_t i02 = i2 / args.sf2;
|
||||
|
||||
const float f01 = ((float)i1 + args.poffs) / args.sf1 - args.poffs;
|
||||
const int64_t i01 = (int64_t)floor(f01);
|
||||
const float fd1 = f01 - (float)i01;
|
||||
|
||||
const float w_y0 = bicubic_weight2(fd1 + 1.0f);
|
||||
const float w_y1 = bicubic_weight1(fd1);
|
||||
const float w_y2 = bicubic_weight1(1.0f - fd1);
|
||||
const float w_y3 = bicubic_weight2(2.0f - fd1);
|
||||
|
||||
const device const char * src_slice = src0 + i03 * args.nb03 + i02 * args.nb02;
|
||||
|
||||
device float * dst_ptr = (device float *)(dst + i3 * args.nb3 + i2 * args.nb2 + i1 * args.nb1);
|
||||
|
||||
for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
|
||||
const float f00 = ((float)i0 + args.poffs) / args.sf0 - args.poffs;
|
||||
const int64_t i00 = (int64_t)floor(f00);
|
||||
const float fd0 = f00 - (float)i00;
|
||||
|
||||
const float w_x0 = bicubic_weight2(fd0 + 1.0f);
|
||||
const float w_x1 = bicubic_weight1(fd0);
|
||||
const float w_x2 = bicubic_weight1(1.0f - fd0);
|
||||
const float w_x3 = bicubic_weight2(2.0f - fd0);
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
for (int dy = -1; dy <= 2; ++dy) {
|
||||
const int64_t iy = MAX(0, MIN(args.ne01 - 1, i01 + dy));
|
||||
const float wy = (dy == -1) ? w_y0 : (dy == 0) ? w_y1 : (dy == 1) ? w_y2 : w_y3;
|
||||
|
||||
for (int dx = -1; dx <= 2; ++dx) {
|
||||
const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx));
|
||||
const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? w_x2 : w_x3;
|
||||
|
||||
device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00);
|
||||
sum += (*src_ptr) * wx * wy;
|
||||
}
|
||||
}
|
||||
|
||||
dst_ptr[i0] = sum;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,179 @@
|
||||
#include "common.h"
|
||||
|
||||
kernel void kernel_rwkv_wkv6_f32(
|
||||
device const float * k,
|
||||
device const float * v,
|
||||
device const float * r,
|
||||
device const float * tf,
|
||||
device const float * td,
|
||||
device const float * state_in,
|
||||
device float * dst,
|
||||
constant uint & B,
|
||||
constant uint & T,
|
||||
constant uint & C,
|
||||
constant uint & H,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const uint head_size = 64; // TODO: support head_size = 128
|
||||
const uint batch_id = tgpig.x / H;
|
||||
const uint head_id = tgpig.x % H;
|
||||
const uint tid = tpitg.x;
|
||||
|
||||
if (batch_id >= B || head_id >= H) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint state_size = C * head_size;
|
||||
const uint n_seq_tokens = T / B;
|
||||
|
||||
threadgroup float _k[head_size];
|
||||
threadgroup float _r[head_size];
|
||||
threadgroup float _tf[head_size];
|
||||
threadgroup float _td[head_size];
|
||||
|
||||
float state[head_size];
|
||||
|
||||
for (uint i = 0; i < head_size; i++) {
|
||||
state[i] = state_in[batch_id * state_size + head_id * head_size * head_size
|
||||
+ i * head_size + tid];
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
_tf[tid] = tf[head_id * head_size + tid];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
|
||||
const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;
|
||||
|
||||
for (uint t = start_t; t < end_t; t += C) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
_k[tid] = k[t];
|
||||
_r[tid] = r[t];
|
||||
_td[tid] = td[t];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const float v_val = v[t];
|
||||
float y = 0.0;
|
||||
|
||||
for (uint j = 0; j < head_size; j += 4) {
|
||||
float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
|
||||
float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
|
||||
float4 tf_vec = float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]);
|
||||
float4 td_vec = float4(_td[j], _td[j+1], _td[j+2], _td[j+3]);
|
||||
float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]);
|
||||
|
||||
float4 kv = k_vec * v_val;
|
||||
|
||||
float4 temp = tf_vec * kv + s_vec;
|
||||
y += dot(r_vec, temp);
|
||||
|
||||
s_vec = s_vec * td_vec + kv;
|
||||
state[j] = s_vec[0];
|
||||
state[j+1] = s_vec[1];
|
||||
state[j+2] = s_vec[2];
|
||||
state[j+3] = s_vec[3];
|
||||
}
|
||||
|
||||
dst[t] = y;
|
||||
}
|
||||
|
||||
for (uint i = 0; i < head_size; i++) {
|
||||
dst[T * C + batch_id * state_size + head_id * head_size * head_size
|
||||
+ i * head_size + tid] = state[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_rwkv_wkv7_f32(
|
||||
device const float * r,
|
||||
device const float * w,
|
||||
device const float * k,
|
||||
device const float * v,
|
||||
device const float * a,
|
||||
device const float * b,
|
||||
device const float * state_in,
|
||||
device float * dst,
|
||||
constant uint & B,
|
||||
constant uint & T,
|
||||
constant uint & C,
|
||||
constant uint & H,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const uint head_size = 64; // TODO: support head_size = 128
|
||||
const uint batch_id = tgpig.x / H;
|
||||
const uint head_id = tgpig.x % H;
|
||||
const uint tid = tpitg.x;
|
||||
|
||||
if (batch_id >= B || head_id >= H) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint state_size = C * head_size;
|
||||
const uint n_seq_tokens = T / B;
|
||||
|
||||
threadgroup float _r[head_size];
|
||||
threadgroup float _w[head_size];
|
||||
threadgroup float _k[head_size];
|
||||
threadgroup float _a[head_size];
|
||||
threadgroup float _b[head_size];
|
||||
|
||||
float state[head_size];
|
||||
|
||||
for (uint i = 0; i < head_size; i++) {
|
||||
state[i] = state_in[batch_id * state_size + head_id * head_size * head_size
|
||||
+ tid * head_size + i];
|
||||
}
|
||||
|
||||
const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
|
||||
const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;
|
||||
|
||||
for (uint t = start_t; t < end_t; t += C) {
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
_r[tid] = r[t];
|
||||
_w[tid] = w[t];
|
||||
_k[tid] = k[t];
|
||||
_a[tid] = a[t];
|
||||
_b[tid] = b[t];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const float v_val = v[t];
|
||||
float y = 0.0, sa = 0.0;
|
||||
|
||||
float4 sa_vec(0.0);
|
||||
|
||||
for (uint j = 0; j < head_size; j += 4) {
|
||||
float4 a_vec = float4(_a[j], _a[j+1], _a[j+2], _a[j+3]);
|
||||
float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]);
|
||||
sa_vec += a_vec * s_vec;
|
||||
}
|
||||
sa = sa_vec[0] + sa_vec[1] + sa_vec[2] + sa_vec[3];
|
||||
|
||||
for (uint j = 0; j < head_size; j += 4) {
|
||||
float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
|
||||
float4 w_vec = float4(_w[j], _w[j+1], _w[j+2], _w[j+3]);
|
||||
float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
|
||||
float4 b_vec = float4(_b[j], _b[j+1], _b[j+2], _b[j+3]);
|
||||
float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]);
|
||||
|
||||
float4 kv = k_vec * v_val;
|
||||
|
||||
s_vec = s_vec * w_vec + kv + sa * b_vec;
|
||||
y += dot(s_vec, r_vec);
|
||||
|
||||
state[j] = s_vec[0];
|
||||
state[j+1] = s_vec[1];
|
||||
state[j+2] = s_vec[2];
|
||||
state[j+3] = s_vec[3];
|
||||
}
|
||||
|
||||
dst[t] = y;
|
||||
}
|
||||
|
||||
for (uint i = 0; i < head_size; i++) {
|
||||
dst[T * C + batch_id * state_size + head_id * head_size * head_size
|
||||
+ tid * head_size + i] = state[i];
|
||||
}
|
||||
}
|
||||
@@ -3788,7 +3788,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) {
|
||||
ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants);
|
||||
}
|
||||
|
||||
static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu::Adapter & adapter) {
|
||||
static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
wgpu::RequestAdapterOptions options = {};
|
||||
|
||||
#ifndef __EMSCRIPTEN__
|
||||
@@ -3800,20 +3800,17 @@ static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu:
|
||||
options.nextInChain = &adapterTogglesDesc;
|
||||
#endif
|
||||
|
||||
instance.WaitAny(instance.RequestAdapter(
|
||||
&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
adapter = std::move(_adapter);
|
||||
}),
|
||||
UINT64_MAX);
|
||||
}
|
||||
|
||||
static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
|
||||
ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, ctx->webgpu_global_ctx->adapter);
|
||||
ctx->webgpu_global_ctx->instance.WaitAny(
|
||||
ctx->webgpu_global_ctx->instance.RequestAdapter(
|
||||
&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
ctx->webgpu_global_ctx->adapter = std::move(adapter);
|
||||
}),
|
||||
UINT64_MAX);
|
||||
GGML_ASSERT(ctx->webgpu_global_ctx->adapter != nullptr);
|
||||
|
||||
ctx->webgpu_global_ctx->adapter.GetLimits(&ctx->webgpu_global_ctx->capabilities.limits);
|
||||
@@ -4546,7 +4543,20 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
|
||||
// Probe for adapter support
|
||||
wgpu::Adapter adapter;
|
||||
if (ctx->webgpu_global_ctx->instance != nullptr) {
|
||||
ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, adapter);
|
||||
wgpu::RequestAdapterOptions options = {};
|
||||
|
||||
// probe for adapter support
|
||||
ctx->webgpu_global_ctx->instance.WaitAny(
|
||||
ctx->webgpu_global_ctx->instance.RequestAdapter(
|
||||
&options, wgpu::CallbackMode::AllowSpontaneous,
|
||||
[&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) {
|
||||
if (status != wgpu::RequestAdapterStatus::Success) {
|
||||
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
||||
return;
|
||||
}
|
||||
adapter = std::move(_adapter);
|
||||
}),
|
||||
UINT64_MAX);
|
||||
}
|
||||
|
||||
// WebGPU backend requires f16 support and, on native, implicit device synchronization.
|
||||
|
||||
+10
-7
@@ -600,15 +600,18 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
|
||||
// convert fname (UTF-8)
|
||||
wchar_t * wfname = ggml_mbstowcs(fname);
|
||||
if (wfname) {
|
||||
// convert mode (UTF-8)
|
||||
wchar_t * wmode = ggml_mbstowcs(mode);
|
||||
if (wmode) {
|
||||
// open file
|
||||
file = _wfopen(wfname, wmode);
|
||||
GGML_FREE(wmode);
|
||||
}
|
||||
// convert mode (ANSI)
|
||||
wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
|
||||
wchar_t * wmode_p = wmode;
|
||||
do {
|
||||
*wmode_p++ = (wchar_t)*mode;
|
||||
} while (*mode++);
|
||||
|
||||
// open file
|
||||
file = _wfopen(wfname, wmode);
|
||||
|
||||
GGML_FREE(wfname);
|
||||
GGML_FREE(wmode);
|
||||
}
|
||||
|
||||
return file;
|
||||
|
||||
+8
-9
@@ -558,15 +558,14 @@ extern "C" {
|
||||
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
||||
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
||||
|
||||
LLAMA_API int32_t llama_model_n_ctx_train (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
|
||||
|
||||
// Get the model's RoPE frequency scaling factor
|
||||
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
||||
|
||||
@@ -5,7 +5,7 @@ import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
HTTPLIB_VERSION = "refs/tags/v0.48.0"
|
||||
HTTPLIB_VERSION = "refs/tags/v0.47.0"
|
||||
|
||||
vendor = {
|
||||
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
|
||||
|
||||
@@ -1156,10 +1156,6 @@ void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
|
||||
sched_need_reserve = true;
|
||||
}
|
||||
|
||||
void llama_context::set_nextn_layer_offset(int32_t offset) {
|
||||
cparams.nextn_layer_offset = offset;
|
||||
}
|
||||
|
||||
void llama_context::set_causal_attn(bool value) {
|
||||
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
|
||||
|
||||
@@ -3703,10 +3699,6 @@ void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool valu
|
||||
ctx->set_embeddings_layer_inp(lid, value);
|
||||
}
|
||||
|
||||
void llama_set_nextn_layer_offset(llama_context * ctx, int32_t offset) {
|
||||
ctx->set_nextn_layer_offset(offset);
|
||||
}
|
||||
|
||||
llama_memory_t llama_get_memory(const struct llama_context * ctx) {
|
||||
if (!ctx) {
|
||||
return nullptr;
|
||||
|
||||
@@ -115,7 +115,6 @@ struct llama_context {
|
||||
void set_embeddings (bool value);
|
||||
void set_embeddings_nextn(bool value, bool masked);
|
||||
void set_embeddings_layer_inp(uint32_t lid, bool enable);
|
||||
void set_nextn_layer_offset(int32_t offset);
|
||||
void set_causal_attn(bool value);
|
||||
void set_warmup(bool value);
|
||||
|
||||
|
||||
@@ -18,8 +18,6 @@ struct llama_cparams {
|
||||
int32_t n_threads; // number of threads to use for generation
|
||||
int32_t n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
int32_t nextn_layer_offset = 0;
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
|
||||
|
||||
@@ -95,11 +95,6 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c
|
||||
// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits
|
||||
LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked);
|
||||
|
||||
// Select which appended NextN block the DECODER_MTP graph runs (offset past
|
||||
// the trunk: il = n_layer() + offset). Used by the speculative NextN driver to
|
||||
// chain multiple trained NextN heads. Default 0 (first head).
|
||||
LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);
|
||||
|
||||
// mirrors:
|
||||
// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
|
||||
|
||||
+2
-9
@@ -682,16 +682,9 @@ struct llm_graph_params {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448035248
|
||||
if (cparams.nextn_layer_offset != other.cparams.nextn_layer_offset) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return
|
||||
cparams.embeddings == other.cparams.embeddings &&
|
||||
cparams.embeddings_nextn == other.cparams.embeddings_nextn &&
|
||||
cparams.embeddings_nextn_masked == other.cparams.embeddings_nextn_masked &&
|
||||
cparams.causal_attn == other.cparams.causal_attn &&
|
||||
cparams.embeddings == other.cparams.embeddings &&
|
||||
cparams.causal_attn == other.cparams.causal_attn &&
|
||||
arch == other.arch &&
|
||||
gtype == other.gtype &&
|
||||
cvec == other.cvec &&
|
||||
|
||||
@@ -2312,10 +2312,6 @@ int32_t llama_model_n_layer(const llama_model * model) {
|
||||
return model->hparams.n_layer();
|
||||
}
|
||||
|
||||
int32_t llama_model_n_layer_nextn(const llama_model * model) {
|
||||
return model->hparams.n_layer_nextn;
|
||||
}
|
||||
|
||||
int32_t llama_model_n_head(const llama_model * model) {
|
||||
return model->hparams.n_head();
|
||||
}
|
||||
|
||||
+2
-2
@@ -932,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
|
||||
// copy the KV pairs from the input file
|
||||
gguf_set_kv (ctx_out.get(), ml.metadata);
|
||||
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION).c_str(), GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_GENERAL_FILE_TYPE).c_str(), ftype);
|
||||
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
|
||||
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
|
||||
|
||||
// Remove split metadata
|
||||
gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
|
||||
|
||||
@@ -101,11 +101,11 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) {
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
|
||||
|
||||
// DSA indexer
|
||||
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags | TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED);
|
||||
layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags);
|
||||
layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags);
|
||||
layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags);
|
||||
layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags);
|
||||
layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
|
||||
if (i < (int) hparams.n_layer_dense_lead) {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
|
||||
|
||||
@@ -156,8 +156,6 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
|
||||
|
||||
// MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
|
||||
@@ -179,8 +179,6 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p
|
||||
|
||||
// MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass.
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
res->t_layer_inp[il] = inpL;
|
||||
|
||||
ggml_tensor * inpSA = inpL;
|
||||
|
||||
cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
|
||||
|
||||
+28
-27
@@ -112,7 +112,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
|
||||
};
|
||||
|
||||
auto load_block_mtp = [&](int i) {
|
||||
auto load_block_mtp = [&](int i, bool is_first_mtp) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
const uint32_t n_head_l = hparams.n_head(i);
|
||||
@@ -121,12 +121,15 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
|
||||
|
||||
// The MTP block is a full Step3p5 decoder layer (mtp_block) plus the
|
||||
// NextN-specific wiring (enorm/hnorm/eh_proj + optional shared head).
|
||||
// Multi-block MTP: every declared MTP block is required (the draft chain
|
||||
// runs all n_layer_nextn heads), so each block uses the captured
|
||||
// `mtp_flags` directly — already NOT_REQUIRED for a trunk-only GGUF,
|
||||
// which keeps that path correct.
|
||||
// `mtp_flags` becomes NOT_REQUIRED when the GGUF is trunk-only.
|
||||
//
|
||||
// Only the FIRST MTP block (i == n_main) is required for the
|
||||
// single-block MTP runtime; trailing MTP blocks are always tolerated
|
||||
// as missing so pruned GGUFs (block 0 only) load cleanly. Override
|
||||
// mtp_flags to NOT_REQUIRED for those.
|
||||
const int eff_mtp_flags = is_first_mtp ? mtp_flags : (mtp_flags | TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, mtp_flags);
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, eff_mtp_flags);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
@@ -137,12 +140,12 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
|
||||
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, mtp_flags);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, mtp_flags);
|
||||
create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, eff_mtp_flags);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, eff_mtp_flags);
|
||||
|
||||
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, mtp_flags);
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, eff_mtp_flags);
|
||||
|
||||
// dense MLP (leading dense blocks) — present if the MTP block isn't MoE
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
||||
@@ -162,9 +165,9 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
|
||||
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
// NextN-specific tensors that define the MTP block.
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, mtp_flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, mtp_flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, mtp_flags);
|
||||
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, eff_mtp_flags);
|
||||
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, eff_mtp_flags);
|
||||
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, eff_mtp_flags);
|
||||
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
|
||||
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED);
|
||||
@@ -173,11 +176,13 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) {
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
load_block_trunk(i, trunk_flags);
|
||||
}
|
||||
// All n_layer_nextn MTP blocks are required — the multi-block draft chain
|
||||
// runs every head (head k at offset k). The GGUF declares the count via
|
||||
// step35.nextn_predict_layers.
|
||||
// Only the first MTP block (i == n_main) is required at runtime — the
|
||||
// single-block-MTP graph in build_arch_graph always uses that one.
|
||||
// Trailing MTP blocks are loaded if present (so an un-pruned GGUF with
|
||||
// all MTP layers still works) but tolerated when absent via the pruning
|
||||
// path. See scripts/prune_step35_extra_mtp.py for the pruner.
|
||||
for (int i = n_layer; i < n_layer_all; ++i) {
|
||||
load_block_mtp(i);
|
||||
load_block_mtp(i, /*is_first_mtp=*/ i == n_layer);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -367,14 +372,13 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
|
||||
: llm_graph_context(params) {
|
||||
GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0");
|
||||
|
||||
// Multi-block MTP: the DECODER_MTP graph runs the MTP head selected by
|
||||
// cparams.nextn_layer_offset (0 = first trained head). The speculative driver
|
||||
// bumps the offset per draft step to chain heads 45->46->47. offset 0 keeps
|
||||
// single-block behavior identical to before.
|
||||
const int il = hparams.n_layer() + cparams.nextn_layer_offset;
|
||||
GGML_ASSERT(cparams.nextn_layer_offset >= 0 &&
|
||||
cparams.nextn_layer_offset < (int) hparams.n_layer_nextn &&
|
||||
"nextn_layer_offset out of range [0, n_layer_nextn)");
|
||||
// Single-block MTP only: always run the first trained MTP block (Qwen
|
||||
// MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to
|
||||
// be a much deeper refactor than this PR justifies; the trailing MTP
|
||||
// blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just
|
||||
// block 0) also work — see load_arch_tensors below and
|
||||
// scripts/prune_step35_extra_mtp.py.
|
||||
const int il = hparams.n_layer();
|
||||
const auto & layer = model.layers[il];
|
||||
|
||||
GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj");
|
||||
@@ -532,9 +536,6 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "mtp_post_ffn", il);
|
||||
|
||||
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
|
||||
// Pre-norm hidden state: used by the AR draft loop to seed the next MTP step.
|
||||
cb(cur, "h_nextn", -1);
|
||||
res->t_h_nextn = cur;
|
||||
|
||||
@@ -129,86 +129,8 @@ void test_gbnf_generation(testing &t) {
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= until-0
|
||||
root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])* ("<" | "</" | "</t" | "</ta" | "</tag")?
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
until-0 ::= | [<] until-0-01 | [^<] until-0
|
||||
until-0-01 ::= | [<] until-0-01 | [/] until-0-02 | [^/<] until-0
|
||||
until-0-02 ::= | [<] until-0-01 | [t] until-0-03 | [^<t] until-0
|
||||
until-0-03 ::= | [<] until-0-01 | [a] until-0-04 | [^<a] until-0
|
||||
until-0-04 ::= | [<] until-0-01 | [g] until-0-05 | [^<g] until-0
|
||||
until-0-05 ::= | [<] until-0-01 | [^<>] until-0
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
t.test("until grammar overlapping delimiter", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
return p.until("\n</parameter>\n");
|
||||
});
|
||||
|
||||
auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
|
||||
parser.build_grammar(builder);
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= until-0
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
until-0 ::= | [\n] until-0-01 | [^\n] until-0
|
||||
until-0-01 ::= | [\n] until-0-01 | [<] until-0-02 | [^\n<] until-0
|
||||
until-0-02 ::= | [\n] until-0-01 | [/] until-0-03 | [^\n/] until-0
|
||||
until-0-03 ::= | [\n] until-0-01 | [p] until-0-04 | [^\np] until-0
|
||||
until-0-04 ::= | [\n] until-0-01 | [a] until-0-05 | [^\na] until-0
|
||||
until-0-05 ::= | [\n] until-0-01 | [r] until-0-06 | [^\nr] until-0
|
||||
until-0-06 ::= | [\n] until-0-01 | [a] until-0-07 | [^\na] until-0
|
||||
until-0-07 ::= | [\n] until-0-01 | [m] until-0-08 | [^\nm] until-0
|
||||
until-0-08 ::= | [\n] until-0-01 | [e] until-0-09 | [^\ne] until-0
|
||||
until-0-09 ::= | [\n] until-0-01 | [t] until-0-10 | [^\nt] until-0
|
||||
until-0-10 ::= | [\n] until-0-01 | [e] until-0-11 | [^\ne] until-0
|
||||
until-0-11 ::= | [\n] until-0-01 | [r] until-0-12 | [^\nr] until-0
|
||||
until-0-12 ::= | [\n] until-0-01 | [>] until-0-13 | [^\n>] until-0
|
||||
until-0-13 ::= | [^\n] until-0
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
// DeepSeek-V3.2 tag prefix. The DSML token (|DSML|) embeds U+FF5C,
|
||||
// so the delimiter mixes ASCII and multi-byte codepoints.
|
||||
t.test("until grammar unicode delimiter", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
return p.until("<|DSML|");
|
||||
});
|
||||
|
||||
auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
|
||||
parser.build_grammar(builder);
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= until-0
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
until-0 ::= | [<] until-0-01 | [^<] until-0
|
||||
until-0-01 ::= | [<] until-0-01 | [\uFF5C] until-0-02 | [^<\uFF5C] until-0
|
||||
until-0-02 ::= | [<] until-0-01 | [D] until-0-03 | [^<D] until-0
|
||||
until-0-03 ::= | [<] until-0-01 | [S] until-0-04 | [^<S] until-0
|
||||
until-0-04 ::= | [<] until-0-01 | [M] until-0-05 | [^<M] until-0
|
||||
until-0-05 ::= | [<] until-0-01 | [L] until-0-06 | [^<L] until-0
|
||||
until-0-06 ::= | [<] until-0-01 | [^<\uFF5C] until-0
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
t.test("until grammar multiple delimiters", [](testing &t) {
|
||||
auto parser = build_peg_parser([](common_peg_parser_builder & p) {
|
||||
return p.until_one_of({"ab", "cd", "ef"});
|
||||
});
|
||||
|
||||
auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
|
||||
parser.build_grammar(builder);
|
||||
});
|
||||
|
||||
assert_gbnf_equal(t, R"""(
|
||||
root ::= until-0
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
until-0 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^ace] until-0
|
||||
until-0-01 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^abce] until-0
|
||||
until-0-03 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^acde] until-0
|
||||
until-0-05 ::= | [a] until-0-01 | [c] until-0-03 | [e] until-0-05 | [^acef] until-0
|
||||
)""", gbnf);
|
||||
});
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#undef NDEBUG
|
||||
#include <cassert>
|
||||
|
||||
static void test(void) {
|
||||
int main(void) {
|
||||
common_params params;
|
||||
|
||||
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
|
||||
@@ -210,13 +210,3 @@ static void test(void) {
|
||||
|
||||
printf("test-arg-parser: all tests OK\n\n");
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
try {
|
||||
test();
|
||||
} catch (std::exception & e) {
|
||||
fprintf(stderr, "test-arg-parser: exception: %s\n", e.what());
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
+2
-2
@@ -5022,14 +5022,14 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
|
||||
tst.test(
|
||||
"```json\n\"42\"\n```")
|
||||
"```json\n\"42\" \n```")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.json_schema(const_schema)
|
||||
.expect_content(R"("42")")
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
"\"42\"\n")
|
||||
"\"42\" \n")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.json_schema(const_schema)
|
||||
.expect_content(R"("42")")
|
||||
|
||||
@@ -995,32 +995,6 @@ static void test_macros(testing & t) {
|
||||
json::object(),
|
||||
"Hello, John Smith,Hi, Jane Doe"
|
||||
);
|
||||
|
||||
test_template(t, "macro with caller",
|
||||
"\
|
||||
{%- macro nest_dict(o, i, ff='') %}\n\
|
||||
{{- caller(ff) }}\n\
|
||||
{%- for k, v in o|items %}\n\
|
||||
{{- i + k + ': ' }}\n\
|
||||
{%- if v is mapping %}\n\
|
||||
{{- '{' }}\n\
|
||||
{% call(f) nest_dict(v, i + ' ') %}\n\
|
||||
{{- 'fail' if ff is undefined }}\n\
|
||||
{%- endcall %}\n\
|
||||
{{- i + '}' }}\n\
|
||||
{% else %}\n\
|
||||
{{- v|string }}\n\
|
||||
{% endif %}\n\
|
||||
{%- endfor %}\n\
|
||||
{%- endmacro %}\n\
|
||||
{%- call(f) nest_dict({'root1': 1, 'root2': {'nest1': 1, 'nest2': {'nest3': 2}}}, ' ', 'Dict') %}\n\
|
||||
{{- 'fail' if ff is defined }}\n\
|
||||
{{- f + ' {' }}\n\
|
||||
{% endcall %}\n\
|
||||
{{- '}' }}",
|
||||
json::object(),
|
||||
"Dict {\n root1: 1\n root2: {\n nest1: 1\n nest2: {\n nest3: 2\n }\n }\n}"
|
||||
);
|
||||
}
|
||||
|
||||
static void test_namespace(testing & t) {
|
||||
|
||||
@@ -92,7 +92,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": 0
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([0] | [1-9] [0-9]{0,15})
|
||||
root ::= ([0] | [1-9] [0-9]{0,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -105,7 +105,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": 1
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([1-9] [0-9]{0,15})
|
||||
root ::= ([1-9] [0-9]{0,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -118,7 +118,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": 3
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15})
|
||||
root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -131,7 +131,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": 9
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15})
|
||||
root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -144,7 +144,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": 10
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15})
|
||||
root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -157,7 +157,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": 25
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15})
|
||||
root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -170,7 +170,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 30
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0"))
|
||||
root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -183,7 +183,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": -5
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15})
|
||||
root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -196,7 +196,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minimum": -123
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15})
|
||||
root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -209,7 +209,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": -5
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15}))
|
||||
root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -222,7 +222,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 1
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" [1-9] [0-9]{0,15} | [0-1])
|
||||
root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -235,7 +235,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 100
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100")
|
||||
root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -249,7 +249,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 23
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([0-9] | ([1] [0-9] | [2] [0-3]))
|
||||
root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -263,7 +263,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 300
|
||||
})""",
|
||||
R"""(
|
||||
root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00"))
|
||||
root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -277,7 +277,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 30
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ([5-9] | ([1-2] [0-9] | [3] "0"))
|
||||
root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -291,7 +291,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 42
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2]))
|
||||
root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -305,7 +305,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maximum": 10
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("-" ([0-9] | "10") | [0-9] | "10")
|
||||
root ::= ("-" ([0-9] | "10") | [0-9] | "10") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -333,17 +333,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"empty schema (object)",
|
||||
"{}",
|
||||
R"""(
|
||||
array ::= "[" space ( value ("," space value)* )? space "]"
|
||||
boolean ::= ("true" | "false")
|
||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
null ::= "null"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
|
||||
null ::= "null" space
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
||||
root ::= object
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
value ::= object | array | string | number | boolean | null
|
||||
)"""
|
||||
});
|
||||
@@ -361,17 +361,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
date ::= [0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )
|
||||
date-string ::= "\"" date "\""
|
||||
date-string ::= "\"" date "\"" space
|
||||
date-time ::= date "T" time
|
||||
date-time-string ::= "\"" date-time "\""
|
||||
root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 space "]"
|
||||
date-time-string ::= "\"" date-time "\"" space
|
||||
root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )
|
||||
time-string ::= "\"" time "\""
|
||||
time-string ::= "\"" time "\"" space
|
||||
tuple-0 ::= date-string
|
||||
tuple-2 ::= time-string
|
||||
tuple-3 ::= date-time-string
|
||||
uuid ::= "\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""
|
||||
uuid ::= "\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -383,7 +383,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "\"" char* "\""
|
||||
root ::= "\"" char* "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -397,7 +397,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "\"" char+ "\""
|
||||
root ::= "\"" char+ "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -411,7 +411,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "\"" char{3,} "\""
|
||||
root ::= "\"" char{3,} "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -425,7 +425,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "\"" char{0,3} "\""
|
||||
root ::= "\"" char{0,3} "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -440,7 +440,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "\"" char{1,4} "\""
|
||||
root ::= "\"" char{1,4} "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -452,7 +452,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"type": "boolean"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("true" | "false")
|
||||
root ::= ("true" | "false") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -465,7 +465,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
root ::= ("-"? integral-part)
|
||||
root ::= ("-"? integral-part) space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -477,7 +477,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"const": "foo"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"foo\""
|
||||
root ::= "\"foo\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -489,7 +489,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"const": 123
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "123"
|
||||
root ::= "123" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -501,7 +501,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"enum": ["red", "amber", "green", null, 42, ["foo"]]
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]")
|
||||
root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -515,9 +515,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "[" space (string ("," space string)*)? space "]"
|
||||
root ::= "[" space (string ("," space string)*)? "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -529,12 +529,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"prefixItems": { "type": "string" }
|
||||
})""",
|
||||
R"""(
|
||||
alternative-0 ::= "[" space (string ("," space string)*)? space "]"
|
||||
alternative-0 ::= "[" space (string ("," space string)*)? "]" space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
null ::= "null"
|
||||
null ::= "null" space
|
||||
root ::= alternative-0 | null
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -546,9 +546,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "[" space string space "]"
|
||||
root ::= "[" space string "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -562,10 +562,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "[" space string "," space number space "]"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "[" space string "," space number "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -577,18 +577,18 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"items": {}
|
||||
})""",
|
||||
R"""(
|
||||
array ::= "[" space ( value ("," space value)* )? space "]"
|
||||
boolean ::= ("true" | "false")
|
||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
item ::= object
|
||||
null ::= "null"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
|
||||
root ::= "[" space (item ("," space item)*)? space "]"
|
||||
null ::= "null" space
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
||||
root ::= "[" space (item ("," space item)*)? "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
value ::= object | array | string | number | boolean | null
|
||||
)"""
|
||||
});
|
||||
@@ -602,18 +602,18 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"prefixItems": { "type": "string" }
|
||||
})""",
|
||||
R"""(
|
||||
array ::= "[" space ( value ("," space value)* )? space "]"
|
||||
boolean ::= ("true" | "false")
|
||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
item ::= object
|
||||
null ::= "null"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
|
||||
root ::= "[" space (item ("," space item)*)? space "]"
|
||||
null ::= "null" space
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
||||
root ::= "[" space (item ("," space item)*)? "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
value ::= object | array | string | number | boolean | null
|
||||
)"""
|
||||
});
|
||||
@@ -627,7 +627,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
R"""(
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -642,8 +642,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"minItems": 2
|
||||
})""",
|
||||
R"""(
|
||||
boolean ::= ("true" | "false")
|
||||
root ::= "[" space boolean ("," space boolean)+ space "]"
|
||||
boolean ::= ("true" | "false") space
|
||||
root ::= "[" space boolean ("," space boolean)+ "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -658,8 +658,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maxItems": 0
|
||||
})""",
|
||||
R"""(
|
||||
boolean ::= ("true" | "false")
|
||||
root ::= "[" space space "]"
|
||||
boolean ::= ("true" | "false") space
|
||||
root ::= "[" space "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -674,8 +674,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maxItems": 1
|
||||
})""",
|
||||
R"""(
|
||||
boolean ::= ("true" | "false")
|
||||
root ::= "[" space boolean? space "]"
|
||||
boolean ::= ("true" | "false") space
|
||||
root ::= "[" space boolean? "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -690,8 +690,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maxItems": 2
|
||||
})""",
|
||||
R"""(
|
||||
boolean ::= ("true" | "false")
|
||||
root ::= "[" space (boolean ("," space boolean)?)? space "]"
|
||||
boolean ::= ("true" | "false") space
|
||||
root ::= "[" space (boolean ("," space boolean)?)? "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -708,11 +708,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integer ::= ("-"? integral-part)
|
||||
integer ::= ("-"? integral-part) space
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
item ::= number | integer
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "[" space item ("," space item){2,4} space "]"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "[" space item ("," space item){2,4} "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -730,8 +730,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maxItems": 5
|
||||
})""",
|
||||
R"""(
|
||||
item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7]))
|
||||
root ::= "[" space item ("," space item){2,4} space "]"
|
||||
item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
|
||||
root ::= "[" space item ("," space item){2,4} "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -749,8 +749,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"maxItems": 5
|
||||
})""",
|
||||
R"""(
|
||||
item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7]))
|
||||
root ::= "[" space item ("," space item){2,4} space "]"
|
||||
item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space
|
||||
root ::= "[" space item ("," space item){2,4} "]" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -763,7 +763,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"pattern": "^abc?d*efg+(hij)?kl$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\""
|
||||
root ::= "\"" ("ab" "c"? "d"* "ef" "g"+ ("hij")? "kl") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -776,7 +776,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" ("[]{}()|+*?") "\""
|
||||
root ::= "\"" ("[]{}()|+*?") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -789,7 +789,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"pattern": "^\"$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" ("\"") "\""
|
||||
root ::= "\"" ("\"") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -802,7 +802,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"pattern": "^A|B|C|D$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" ("A" | "B" | "C" | "D") "\""
|
||||
root ::= "\"" ("A" | "B" | "C" | "D") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -816,7 +816,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
dot ::= [^\x0A\x0D]
|
||||
root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\""
|
||||
root ::= "\"" (("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot) "\"" space
|
||||
root-1 ::= [0-9]
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
@@ -845,9 +845,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
b-kv ::= "\"b\"" space ":" space string
|
||||
c-kv ::= "\"c\"" space ":" space string
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "{" space b-kv "," space c-kv "," space a-kv space "}"
|
||||
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -865,9 +865,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
R"""(
|
||||
a-kv ::= "\"a\"" space ":" space string
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "{" space (a-kv )? space "}"
|
||||
root ::= "{" space (a-kv )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -889,9 +889,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
b-rest ::= ( "," space c-kv )?
|
||||
c-kv ::= "\"c\"" space ":" space string
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? space "}"
|
||||
root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -915,9 +915,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
d-kv ::= "\"d\"" space ":" space string
|
||||
d-rest ::= ( "," space c-kv )?
|
||||
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? space "}"
|
||||
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -930,14 +930,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
additional-kv ::= string ":" space additional-value
|
||||
additional-value ::= "[" space (number ("," space number)*)? space "]"
|
||||
additional-value ::= "[" space (number ("," space number)*)? "]" space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "{" space (additional-kv ( "," space additional-kv )* )? space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "{" space (additional-kv ( "," space additional-kv )* )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -949,17 +949,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"additionalProperties": true
|
||||
})""",
|
||||
R"""(
|
||||
array ::= "[" space ( value ("," space value)* )? space "]"
|
||||
boolean ::= ("true" | "false")
|
||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
null ::= "null"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
|
||||
null ::= "null" space
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
||||
root ::= object
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
value ::= object | array | string | number | boolean | null
|
||||
)"""
|
||||
});
|
||||
@@ -971,17 +971,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"type": "object"
|
||||
})""",
|
||||
R"""(
|
||||
array ::= "[" space ( value ("," space value)* )? space "]"
|
||||
boolean ::= ("true" | "false")
|
||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
null ::= "null"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
|
||||
null ::= "null" space
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
||||
root ::= object
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
value ::= object | array | string | number | boolean | null
|
||||
)"""
|
||||
});
|
||||
@@ -994,7 +994,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"additionalProperties": false
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "{" space space "}"
|
||||
root ::= "{" space "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1012,15 +1012,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
a-kv ::= "\"a\"" space ":" space number
|
||||
additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["]
|
||||
additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
|
||||
additional-kv ::= additional-k ":" space string
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -1037,13 +1037,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
R"""(
|
||||
a-kv ::= "\"a\"" space ":" space number
|
||||
a-rest ::= ( "," space additional-kv )*
|
||||
additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["]
|
||||
additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
|
||||
additional-kv ::= additional-k ":" space number
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "{" space (a-kv a-rest | additional-kv ( "," space additional-kv )* )? space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "{" space (a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1061,7 +1061,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"additionalProperties": {"type": "number"}
|
||||
})""",
|
||||
R"""(
|
||||
additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["]
|
||||
additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
|
||||
additional-kv ::= additional-k ":" space number
|
||||
also-kv ::= "\"also\"" space ":" space number
|
||||
also-rest ::= ( "," space additional-kv )*
|
||||
@@ -1069,8 +1069,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1090,13 +1090,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
-rest ::= ( "," space a-kv )? a-rest
|
||||
a-kv ::= "\"a\"" space ":" space integer
|
||||
a-rest ::= ( "," space additional-kv )*
|
||||
additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["]
|
||||
additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["] space
|
||||
additional-kv ::= additional-k ":" space integer
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
integer ::= ("-"? integral-part)
|
||||
integer ::= ("-"? integral-part) space
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
root ::= ("-"? integral-part)
|
||||
root0 ::= "{" space (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? space "}"
|
||||
root ::= ("-"? integral-part) space
|
||||
root0 ::= "{" space (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1116,12 +1116,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
a-rest ::= ( "," space aa-kv )? aa-rest
|
||||
aa-kv ::= "\"aa\"" space ":" space integer
|
||||
aa-rest ::= ( "," space additional-kv )*
|
||||
additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["]
|
||||
additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["] space
|
||||
additional-kv ::= additional-k ":" space integer
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
integer ::= ("-"? integral-part)
|
||||
integer ::= ("-"? integral-part) space
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
root ::= "{" space (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? space "}"
|
||||
root ::= "{" space (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1141,12 +1141,12 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
ab-rest ::= ( "," space ac-kv )? ac-rest
|
||||
ac-kv ::= "\"ac\"" space ":" space integer
|
||||
ac-rest ::= ( "," space additional-kv )*
|
||||
additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["]
|
||||
additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["] space
|
||||
additional-kv ::= additional-k ":" space integer
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
integer ::= ("-"? integral-part)
|
||||
integer ::= ("-"? integral-part) space
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
root ::= "{" space (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? space "}"
|
||||
root ::= "{" space (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1173,11 +1173,11 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
})""",
|
||||
R"""(
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
ref-definitions-foo ::= "{" space ref-definitions-foo-a-kv space "}"
|
||||
ref-definitions-foo ::= "{" space ref-definitions-foo-a-kv "}" space
|
||||
ref-definitions-foo-a-kv ::= "\"a\"" space ":" space string
|
||||
root ::= ref-definitions-foo
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -1204,10 +1204,10 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
alternative-1 ::= ref-definitions-bar
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
ref-definitions-bar ::= "{" space (ref-definitions-bar-b-kv )? space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
ref-definitions-bar ::= "{" space (ref-definitions-bar-b-kv )? "}" space
|
||||
ref-definitions-bar-b-kv ::= "\"b\"" space ":" space number
|
||||
ref-definitions-foo ::= "{" space (ref-definitions-foo-a-kv )? space "}"
|
||||
ref-definitions-foo ::= "{" space (ref-definitions-foo-a-kv )? "}" space
|
||||
ref-definitions-foo-a-kv ::= "\"a\"" space ":" space number
|
||||
root ::= alternative-0 | alternative-1
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
@@ -1241,14 +1241,14 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
b ::= b-0 | boolean
|
||||
b-0 ::= string
|
||||
b-kv ::= "\"b\"" space ":" space b
|
||||
boolean ::= ("true" | "false")
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "{" space (a-kv a-rest | b-kv )? space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "{" space (a-kv a-rest | b-kv )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
)"""
|
||||
});
|
||||
|
||||
@@ -1290,8 +1290,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
d-rest ::= ( "," space c-kv )?
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1311,7 +1311,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
}
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("\"a\"" | "\"b\"")
|
||||
root ::= ("\"a\"" | "\"b\"") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1336,7 +1336,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
}
|
||||
})""",
|
||||
R"""(
|
||||
root ::= ("\"b\"" | "\"c\"")
|
||||
root ::= ("\"b\"" | "\"c\"") space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1378,13 +1378,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
R"""(
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
number- ::= "{" space number-number-kv space "}"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
number- ::= "{" space number-number-kv "}" space
|
||||
number-kv ::= "\"number\"" space ":" space number-
|
||||
number-number ::= "{" space number-number-root-kv space "}"
|
||||
number-number ::= "{" space number-number-root-kv "}" space
|
||||
number-number-kv ::= "\"number\"" space ":" space number-number
|
||||
number-number-root-kv ::= "\"root\"" space ":" space number
|
||||
root ::= "{" space number-kv space "}"
|
||||
root ::= "{" space number-kv "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1394,17 +1394,17 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"description only (no type) treated as unconstrained",
|
||||
R"""({"description": "The 0-based index of the last line to be retrieved (inclusive). If None, read until the end of the file."})""",
|
||||
R"""(
|
||||
array ::= "[" space ( value ("," space value)* )? space "]"
|
||||
boolean ::= ("true" | "false")
|
||||
array ::= "[" space ( value ("," space value)* )? "]" space
|
||||
boolean ::= ("true" | "false") space
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
|
||||
decimal-part ::= [0-9]{1,16}
|
||||
integral-part ::= [0] | [1-9] [0-9]{0,15}
|
||||
null ::= "null"
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? space "}"
|
||||
null ::= "null" space
|
||||
number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
|
||||
object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
|
||||
root ::= value
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
string ::= "\"" char* "\""
|
||||
string ::= "\"" char* "\"" space
|
||||
value ::= object | array | string | number | boolean | null
|
||||
)"""
|
||||
});
|
||||
@@ -1428,9 +1428,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
||||
"type": "object"
|
||||
})""",
|
||||
R"""(
|
||||
code ::= "\" \\r \\n \\\" \\\\ \""
|
||||
code ::= "\" \\r \\n \\\" \\\\ \"" space
|
||||
code-kv ::= "\"code\"" space ":" space code
|
||||
root ::= "{" space code-kv space "}"
|
||||
root ::= "{" space code-kv "}" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)"""
|
||||
});
|
||||
@@ -1547,7 +1547,7 @@ int main() {
|
||||
"pattern": "^(?:foo|bar)baz$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" (("foo" | "bar") "baz") "\""
|
||||
root ::= "\"" (("foo" | "bar") "baz") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""",
|
||||
});
|
||||
@@ -1560,7 +1560,7 @@ int main() {
|
||||
"pattern": "^(?:(?:ab)+c)?d$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" ((("ab")+ "c")? "d") "\""
|
||||
root ::= "\"" ((("ab")+ "c")? "d") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""",
|
||||
});
|
||||
|
||||
+1
-2
@@ -161,7 +161,7 @@
|
||||
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
|
||||
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
|
||||
| `--image, --audio, --video FILE` | path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files |
|
||||
| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files |
|
||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
||||
@@ -174,7 +174,6 @@
|
||||
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
|
||||
| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
|
||||
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
|
||||
| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
|
||||
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
|
||||
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
||||
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
||||
|
||||
+1
-1
@@ -202,7 +202,7 @@ struct cli_context {
|
||||
|
||||
// TODO: support remote files in the future (http, https, etc)
|
||||
std::string load_input_file(const std::string & fname, bool is_media) {
|
||||
std::ifstream file = fs_open_ifstream(fname, std::ios::binary);
|
||||
std::ifstream file(fname, std::ios::binary);
|
||||
if (!file) {
|
||||
return "";
|
||||
}
|
||||
|
||||
@@ -13,14 +13,6 @@
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <fstream>
|
||||
|
||||
#ifdef _WIN32
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// Internal header for clip.cpp
|
||||
|
||||
@@ -669,22 +661,6 @@ struct clip_image_f32_batch {
|
||||
// common utils
|
||||
//
|
||||
|
||||
#ifdef _WIN32
|
||||
static std::ifstream open_ifstream_binary(const std::string & fname) {
|
||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0);
|
||||
if (!wlen) {
|
||||
throw std::runtime_error("failed to convert filename to UTF-16: " + fname);
|
||||
}
|
||||
std::vector<wchar_t> wfname(wlen);
|
||||
(void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen);
|
||||
return std::ifstream(wfname.data(), std::ios::binary);
|
||||
}
|
||||
#else
|
||||
static std::ifstream open_ifstream_binary(const std::string & fname) {
|
||||
return std::ifstream(fname, std::ios::binary);
|
||||
}
|
||||
#endif
|
||||
|
||||
static std::string string_format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
|
||||
+26
-89
@@ -1045,17 +1045,8 @@ struct clip_model_loader {
|
||||
bool has_vision = false;
|
||||
bool has_audio = false;
|
||||
|
||||
mtmd_progress_callback progress_callback = nullptr;
|
||||
void * progress_callback_user_data = nullptr;
|
||||
|
||||
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_model
|
||||
clip_model_loader(const char * fname,
|
||||
bool skip_tensors = false,
|
||||
mtmd_progress_callback progress_cb = nullptr,
|
||||
void * progress_user_data = nullptr)
|
||||
: fname(fname),
|
||||
progress_callback(progress_cb),
|
||||
progress_callback_user_data(progress_user_data) {
|
||||
clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) {
|
||||
struct ggml_context * meta = nullptr;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
@@ -1684,9 +1675,6 @@ struct clip_model_loader {
|
||||
// note: some models having hparams.image_size == 0, which means the image size is dynamic
|
||||
throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
|
||||
}
|
||||
if (hparams.image_size > 65536) {
|
||||
throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size));
|
||||
}
|
||||
if (hparams.patch_size <= 0) {
|
||||
throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
|
||||
}
|
||||
@@ -1735,19 +1723,6 @@ struct clip_model_loader {
|
||||
LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft);
|
||||
LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len);
|
||||
LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len);
|
||||
|
||||
// GEMMA4UA is encoder-free: it uses n_mel_bins as a raw-waveform frame size (640) and has no FFT/filterbank, so the mel-range and FFT
|
||||
// checks below do not apply to it.
|
||||
const bool fft_based = model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
|
||||
|
||||
// Validate audio hparams loaded from GGUF metadata
|
||||
if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
|
||||
throw std::runtime_error(string_format("%s: n_mel_bins (%d) must be in range [1, 256]\n", __func__, hparams.n_mel_bins));
|
||||
}
|
||||
if (fft_based && (hparams.audio_sample_rate <= 0 || hparams.audio_n_fft <= 0 || hparams.audio_hop_len <= 0 || hparams.audio_window_len <= 0)) {
|
||||
throw std::runtime_error(string_format("%s: audio hparams invalid: sample_rate=%d n_fft=%d window_len=%d hop_len=%d\n",
|
||||
__func__, hparams.audio_sample_rate, hparams.audio_n_fft, hparams.audio_window_len, hparams.audio_hop_len));
|
||||
}
|
||||
}
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
@@ -1761,7 +1736,7 @@ struct clip_model_loader {
|
||||
std::map<std::string, size_t> tensor_offset;
|
||||
std::vector<ggml_tensor *> tensors_to_load;
|
||||
|
||||
auto fin = open_ifstream_binary(fname);
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
|
||||
}
|
||||
@@ -2796,60 +2771,37 @@ struct clip_model_loader {
|
||||
}
|
||||
|
||||
// load data
|
||||
{
|
||||
if (!ctx_clip.no_alloc) {
|
||||
std::vector<uint8_t> read_buf;
|
||||
|
||||
// start loading event
|
||||
if (progress_callback){
|
||||
progress_callback(0.0, progress_callback_user_data);
|
||||
}
|
||||
|
||||
// compute total tensor data size for progress reporting
|
||||
size_t total_data_size = 0;
|
||||
for (auto & t : tensors_to_load) {
|
||||
total_data_size += ggml_nbytes(t);
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
|
||||
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
|
||||
ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
||||
// read the weight from file
|
||||
if (!ctx_clip.no_alloc) {
|
||||
size_t data_loaded = 0;
|
||||
for (auto & t : tensors_to_load) {
|
||||
ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
|
||||
GGML_ASSERT(cur && "tensor not found in ctx_data");
|
||||
auto it_off = tensor_offset.find(t->name);
|
||||
GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
|
||||
const size_t offset = it_off->second;
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
|
||||
}
|
||||
size_t num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
data_loaded += num_bytes;
|
||||
if (progress_callback && total_data_size > 0) {
|
||||
const float progress = (float)data_loaded / (float)total_data_size;
|
||||
if (!progress_callback(progress, progress_callback_user_data)) {
|
||||
throw std::runtime_error(string_format("%s: model loading cancelled by progress_callback\n", __func__));
|
||||
}
|
||||
}
|
||||
for (auto & t : tensors_to_load) {
|
||||
ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
|
||||
GGML_ASSERT(cur && "tensor not found in ctx_data");
|
||||
auto it_off = tensor_offset.find(t->name);
|
||||
GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
|
||||
const size_t offset = it_off->second;
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
|
||||
}
|
||||
size_t num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
|
||||
} else {
|
||||
LOG_DBG("%s: no_alloc is set, skipping tensor data loading (%zu tensors)\n", __func__, tensors_to_load.size());
|
||||
}
|
||||
fin.close();
|
||||
|
||||
LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str());
|
||||
}
|
||||
|
||||
}
|
||||
@@ -2879,12 +2831,6 @@ struct clip_model_loader {
|
||||
img.set_size({sz, sz}, false, false);
|
||||
LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz);
|
||||
} else {
|
||||
// GEMMA4UA uses n_mel_bins as a raw-waveform frame size (640), not a mel-bin count,
|
||||
// so the [1, 256] bound only applies to FFT-based models.
|
||||
const bool fft_based = ctx_clip.model.proj_type != PROJECTOR_TYPE_GEMMA4UA;
|
||||
if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) {
|
||||
throw std::runtime_error(string_format("%s: invalid n_mel_bins (%d), must be in [1, 256]\n", __func__, hparams.n_mel_bins));
|
||||
}
|
||||
img.set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false);
|
||||
LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size);
|
||||
}
|
||||
@@ -3048,13 +2994,7 @@ struct clip_model_loader {
|
||||
}
|
||||
return;
|
||||
}
|
||||
const uint32_t val = gguf_get_val_u32(ctx_gguf.get(), i);
|
||||
// sanity check
|
||||
if (val > (uint32_t) INT32_MAX) {
|
||||
throw std::runtime_error(string_format("%s: value %u for key '%s' exceeds INT32_MAX\n",
|
||||
__func__, val, key.c_str()));
|
||||
}
|
||||
output = (int) val;
|
||||
output = gguf_get_val_u32(ctx_gguf.get(), i);
|
||||
}
|
||||
|
||||
void get_f32(const std::string & key, float & output, bool required = true) const {
|
||||
@@ -3137,10 +3077,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
|
||||
clip_ctx * ctx_audio = nullptr;
|
||||
|
||||
try {
|
||||
clip_model_loader loader(fname,
|
||||
/* skip_tensors */ false,
|
||||
ctx_params.progress_callback,
|
||||
ctx_params.progress_callback_user_data);
|
||||
clip_model_loader loader(fname);
|
||||
bool skip_audio = false;
|
||||
|
||||
if (loader.has_vision) {
|
||||
|
||||
@@ -24,9 +24,6 @@ struct clip_image_size {
|
||||
return !(*this == other);
|
||||
}
|
||||
int area() const {
|
||||
// avoid overflow when computing area
|
||||
GGML_ASSERT(width >= 0 && width <= 46000);
|
||||
GGML_ASSERT(height >= 0 && height <= 46000);
|
||||
return width * height;
|
||||
}
|
||||
};
|
||||
@@ -54,8 +51,6 @@ struct clip_context_params {
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
bool no_alloc;
|
||||
mtmd_progress_callback progress_callback;
|
||||
void * progress_callback_user_data;
|
||||
};
|
||||
|
||||
struct clip_init_result {
|
||||
|
||||
+63
-76
@@ -32,8 +32,8 @@ void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
|
||||
}
|
||||
}
|
||||
|
||||
void mtmd_audio_cache::fill_mel_filterbank_matrix(int64_t n_mel,
|
||||
int64_t n_fft,
|
||||
void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
int sample_rate,
|
||||
float fmin,
|
||||
float fmax,
|
||||
@@ -86,16 +86,11 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int64_t n_mel,
|
||||
hz_pts[i] = mel_to_hz(mel_pts[i]);
|
||||
}
|
||||
|
||||
const int64_t n_fft_bins = n_fft / 2 + 1;
|
||||
|
||||
// Validate allocation size
|
||||
if ((size_t)n_mel * (size_t)n_fft_bins > SIZE_MAX) {
|
||||
GGML_ASSERT(false && "mel filterbank allocation too large");
|
||||
}
|
||||
const int n_fft_bins = n_fft / 2 + 1;
|
||||
|
||||
// filterbank
|
||||
std::vector<float> out((size_t)n_mel * (size_t)n_fft_bins, 0);
|
||||
for (int64_t m = 0; m < n_mel; ++m) {
|
||||
std::vector<float> out(n_mel * n_fft_bins, 0);
|
||||
for (int m = 0; m < n_mel; ++m) {
|
||||
const double f_left = hz_pts[m];
|
||||
const double f_center = hz_pts[m + 1];
|
||||
const double f_right = hz_pts[m + 2];
|
||||
@@ -271,8 +266,8 @@ static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out)
|
||||
}
|
||||
|
||||
struct filter_params {
|
||||
int64_t n_mel;
|
||||
int64_t n_fft_bins;
|
||||
int32_t n_mel;
|
||||
int32_t n_fft_bins;
|
||||
int32_t hann_window_size;
|
||||
int32_t hop_length;
|
||||
int32_t sample_rate;
|
||||
@@ -298,8 +293,8 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
std::vector<float> fft_in(frame_size * 2, 0.0);
|
||||
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
|
||||
|
||||
int64_t n_fft_bins = params.n_fft_bins;
|
||||
int64_t i = ith;
|
||||
int n_fft_bins = params.n_fft_bins;
|
||||
int i = ith;
|
||||
|
||||
const auto & filters = cache.filters;
|
||||
|
||||
@@ -307,18 +302,17 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2));
|
||||
GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size());
|
||||
// calculate FFT only when fft_in are not all zero
|
||||
for (; i < std::min((int64_t)(n_samples / frame_step + 1), out.n_len); i += n_threads) {
|
||||
const int64_t offset = i * frame_step;
|
||||
for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) {
|
||||
const int offset = i * frame_step;
|
||||
|
||||
// apply Hann window (~10% faster)
|
||||
const int valid_len = std::min(frame_size, std::max(0, n_samples - (int)offset));
|
||||
for (int j = 0; j < valid_len; j++) {
|
||||
for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
|
||||
fft_in[j] = hann[j] * samples[offset + j];
|
||||
}
|
||||
|
||||
// fill the rest with zeros
|
||||
if (valid_len < frame_size) {
|
||||
std::fill(fft_in.begin() + valid_len, fft_in.end(), 0.0);
|
||||
if (n_samples - offset < frame_size) {
|
||||
std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
|
||||
}
|
||||
|
||||
// FFT
|
||||
@@ -331,7 +325,7 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
}
|
||||
|
||||
// mel spectrogram
|
||||
for (int64_t j = 0; j < out.n_mel; j++) {
|
||||
for (int j = 0; j < out.n_mel; j++) {
|
||||
double sum = 0.0;
|
||||
// unroll loop (suggested by GH user @lunixbochs)
|
||||
int k = 0;
|
||||
@@ -345,21 +339,21 @@ static void log_mel_spectrogram_worker_thread(int ith,
|
||||
}
|
||||
// handle n_fft remainder
|
||||
for (; k < n_fft_bins; k++) {
|
||||
sum += fft_out[k] * filters.data[(size_t)j * n_fft_bins + k];
|
||||
sum += fft_out[k] * filters.data[j * n_fft_bins + k];
|
||||
}
|
||||
sum = std::max(sum, (double)params.mel_floor);
|
||||
sum = params.use_natural_log
|
||||
? log(sum)
|
||||
: log10(sum);
|
||||
out.data[(size_t)j * out.n_len + i] = sum;
|
||||
out.data[j * out.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise fft_out are all zero
|
||||
double sum = params.use_natural_log ? log(1e-10) : log10(1e-10);
|
||||
for (; i < out.n_len; i += n_threads) {
|
||||
for (int64_t j = 0; j < out.n_mel; j++) {
|
||||
out.data[(size_t)j * out.n_len + i] = sum;
|
||||
for (int j = 0; j < out.n_mel; j++) {
|
||||
out.data[j * out.n_len + i] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -443,21 +437,16 @@ static bool log_mel_spectrogram(
|
||||
GGML_ASSERT(params.hop_length > 0);
|
||||
out.n_mel = params.n_mel;
|
||||
out.n_len = (n_samples - frame_size) / frame_step + 1;
|
||||
// Validate dimensions before allocation to prevent integer overflow
|
||||
if (out.n_mel <= 0 || out.n_len <= 0) {
|
||||
LOG_ERR("%s: invalid mel dimensions n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
|
||||
return false;
|
||||
}
|
||||
const size_t total_size = (size_t)out.n_mel * (size_t)out.n_len;
|
||||
if (total_size > SIZE_MAX / sizeof(float)) {
|
||||
LOG_ERR("%s: size overflow: n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len);
|
||||
// TODO: handle these checks better
|
||||
if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) {
|
||||
LOG_ERR("%s: size overflow\n", __func__);
|
||||
return false;
|
||||
}
|
||||
if (n_samples < frame_size) {
|
||||
LOG_ERR("%s: not enough samples after padding\n", __func__);
|
||||
return false;
|
||||
}
|
||||
out.data.resize(total_size);
|
||||
out.data.resize(out.n_mel * out.n_len);
|
||||
|
||||
{
|
||||
std::vector<std::thread> workers(n_threads - 1);
|
||||
@@ -475,39 +464,38 @@ static bool log_mel_spectrogram(
|
||||
}
|
||||
}
|
||||
|
||||
const int64_t effective_n_len = n_samples_in / frame_step;
|
||||
const int effective_n_len = n_samples_in / frame_step;
|
||||
if (params.norm_per_feature) {
|
||||
GGML_ASSERT(effective_n_len > 1);
|
||||
for (int64_t i = 0; i < out.n_mel; i++) {
|
||||
for (int i = 0; i < out.n_mel; i++) {
|
||||
double mean = 0;
|
||||
for (int64_t j = 0; j < effective_n_len; ++j) {
|
||||
mean += out.data[(size_t)i * out.n_len + j];
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
mean += out.data[i * out.n_len + j];
|
||||
}
|
||||
mean /= effective_n_len;
|
||||
|
||||
double var = 0.0;
|
||||
for (int64_t j = 0; j < effective_n_len; ++j) {
|
||||
const double value = out.data[(size_t)i * out.n_len + j] - mean;
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
const double value = out.data[i * out.n_len + j] - mean;
|
||||
var += value * value;
|
||||
}
|
||||
var /= effective_n_len - 1; // unbiased
|
||||
const double mstd = std::sqrt(var + 1e-5);
|
||||
|
||||
for (int64_t j = 0; j < effective_n_len; ++j) {
|
||||
auto &value = out.data[(size_t)i * out.n_len + j];
|
||||
for (int j = 0; j < effective_n_len; ++j) {
|
||||
auto &value = out.data[i * out.n_len + j];
|
||||
value = (value - mean) / mstd;
|
||||
}
|
||||
|
||||
// pad the rest with zeros
|
||||
for (int64_t j = effective_n_len; j < out.n_len; ++j) {
|
||||
out.data[(size_t)i * out.n_len + j] = 0.0;
|
||||
for (int j = effective_n_len; j < out.n_len; ++j) {
|
||||
out.data[i * out.n_len + j] = 0.0;
|
||||
}
|
||||
}
|
||||
} else if (!params.no_padding) {
|
||||
// Whisper-style clamping and normalization (NOT used by Gemma4)
|
||||
double mmax = -1e20;
|
||||
const size_t mel_size = (size_t)out.n_mel * (size_t)out.n_len;
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
for (int i = 0; i < out.n_mel*out.n_len; i++) {
|
||||
if (out.data[i] > mmax) {
|
||||
mmax = out.data[i];
|
||||
}
|
||||
@@ -515,7 +503,7 @@ static bool log_mel_spectrogram(
|
||||
|
||||
mmax -= 8.0;
|
||||
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
for (int i = 0; i < out.n_mel*out.n_len; i++) {
|
||||
if (out.data[i] < mmax) {
|
||||
out.data[i] = mmax;
|
||||
}
|
||||
@@ -594,13 +582,13 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
// because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
|
||||
// we always expect the mel to have 3000 silent frames at the end
|
||||
if (DEBUG) {
|
||||
printf("output: n_mel = %d, n_len = %d\n", (int) out_full.n_mel, (int) out_full.n_len);
|
||||
printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len);
|
||||
}
|
||||
const size_t frames_per_chunk = 3000;
|
||||
GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk);
|
||||
for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
|
||||
int64_t n_len = std::min((int64_t)frames_per_chunk, out_full.n_len - (int64_t)off);
|
||||
if (n_len < (int64_t)frames_per_chunk) {
|
||||
int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
|
||||
if ((size_t) n_len < frames_per_chunk) {
|
||||
break; // last incomplete chunk will always be a padded chunk, safe to ignore
|
||||
}
|
||||
|
||||
@@ -608,10 +596,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
|
||||
out_chunk.n_len = n_len;
|
||||
out_chunk.n_mel = out_full.n_mel;
|
||||
out_chunk.n_len_org = out_full.n_mel; // unused
|
||||
out_chunk.data.reserve((size_t)out_chunk.n_mel * (size_t)out_chunk.n_len);
|
||||
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
|
||||
|
||||
for (int64_t i = 0; i < out_full.n_mel; i++) {
|
||||
auto src = out_full.data.begin() + (size_t)i * out_full.n_len + off;
|
||||
for (int i = 0; i < out_full.n_mel; i++) {
|
||||
auto src = out_full.data.begin() + i * out_full.n_len + off;
|
||||
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
|
||||
}
|
||||
|
||||
@@ -693,8 +681,8 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa
|
||||
|
||||
// The effective frame count: center-padded STFT gives ~n_samples/hop_length frames.
|
||||
// We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames.
|
||||
const int64_t n_eff = std::min(mel_full.n_len,
|
||||
(int64_t)(n_samples / hparams.audio_hop_len) + 1);
|
||||
const int n_eff = std::min(mel_full.n_len,
|
||||
(int)(n_samples / hparams.audio_hop_len) + 1);
|
||||
|
||||
// Split into inference windows matching n_window_infer=800 from model config.
|
||||
// Each window is padded to the next multiple of chunk_size for the cgraph.
|
||||
@@ -702,18 +690,18 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa
|
||||
const int chunk_size = 100; // conv sub-chunk size (n_window * 2, n_window=50)
|
||||
const int window_size = 800; // mel frames per forward pass (n_window_infer=800)
|
||||
|
||||
for (int64_t off = 0; off < n_eff; off += window_size) {
|
||||
const int64_t win_eff = std::min((int64_t)window_size, n_eff - off);
|
||||
const int64_t n_chunks = (win_eff + chunk_size - 1) / chunk_size;
|
||||
const int64_t n_padded = n_chunks * chunk_size;
|
||||
for (int off = 0; off < n_eff; off += window_size) {
|
||||
const int win_eff = std::min(window_size, n_eff - off);
|
||||
const int n_chunks = (win_eff + chunk_size - 1) / chunk_size;
|
||||
const int n_padded = n_chunks * chunk_size;
|
||||
|
||||
mtmd_audio_mel out;
|
||||
out.n_mel = mel_full.n_mel;
|
||||
out.n_len = n_padded;
|
||||
out.n_len_org = win_eff;
|
||||
out.data.assign((size_t)out.n_mel * (size_t)out.n_len, 0.0f);
|
||||
for (int64_t m = 0; m < out.n_mel; m++) {
|
||||
const int64_t copy_len = std::min((int64_t)win_eff, mel_full.n_len - off);
|
||||
out.data.assign(out.n_mel * out.n_len, 0.0f);
|
||||
for (int m = 0; m < out.n_mel; m++) {
|
||||
const int copy_len = std::min(win_eff, mel_full.n_len - off);
|
||||
if (copy_len > 0) {
|
||||
std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off,
|
||||
mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len,
|
||||
@@ -835,38 +823,37 @@ bool mtmd_audio_preprocessor_granite_speech::preprocess(const float *
|
||||
}
|
||||
|
||||
double mmax = -1e20;
|
||||
const size_t mel_size = (size_t)mel.n_mel * (size_t)mel.n_len;
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
|
||||
if (mel.data[i] > mmax) {
|
||||
mmax = mel.data[i];
|
||||
}
|
||||
}
|
||||
mmax -= 8.0;
|
||||
|
||||
for (size_t i = 0; i < mel_size; i++) {
|
||||
for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
|
||||
if (mel.data[i] < mmax) {
|
||||
mel.data[i] = mmax;
|
||||
}
|
||||
mel.data[i] = (mel.data[i] + 4.0) / 4.0;
|
||||
}
|
||||
|
||||
int64_t n_frames = mel.n_len;
|
||||
int n_frames = mel.n_len;
|
||||
if (n_frames % 2 == 1) {
|
||||
n_frames--;
|
||||
}
|
||||
const int64_t n_mel = mel.n_mel;
|
||||
const int64_t n_stacked = n_frames / 2;
|
||||
const int n_mel = mel.n_mel;
|
||||
const int n_stacked = n_frames / 2;
|
||||
|
||||
mtmd_audio_mel stacked;
|
||||
stacked.n_mel = 2 * n_mel;
|
||||
stacked.n_len = n_stacked;
|
||||
stacked.n_len_org = (int64_t)n_samples;
|
||||
stacked.data.resize((size_t)2 * (size_t)n_mel * (size_t)n_stacked);
|
||||
stacked.n_len_org = (int)n_samples;
|
||||
stacked.data.resize(2 * n_mel * n_stacked);
|
||||
|
||||
for (int64_t t = 0; t < n_stacked; t++) {
|
||||
for (int64_t m = 0; m < n_mel; m++) {
|
||||
stacked.data[(size_t)m * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t];
|
||||
stacked.data[(size_t)(m + n_mel) * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t + 1];
|
||||
for (int t = 0; t < n_stacked; t++) {
|
||||
for (int m = 0; m < n_mel; m++) {
|
||||
stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t];
|
||||
stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -934,8 +921,8 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s
|
||||
const int hop = hparams.audio_hop_len;
|
||||
const int n_with_left = (int)chunk_len + pad_left;
|
||||
// PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform
|
||||
const int64_t pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
|
||||
const int64_t n_padded_needed = (pt_frames - 1) * hop + fft_size;
|
||||
const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1;
|
||||
const int n_padded_needed = (pt_frames - 1) * hop + fft_size;
|
||||
const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left);
|
||||
std::vector<float> padded_samples(total_pad + chunk_len, 0.0f);
|
||||
std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left);
|
||||
|
||||
@@ -10,16 +10,16 @@
|
||||
#define MTMD_INTERNAL_HEADER
|
||||
|
||||
struct mtmd_audio_mel {
|
||||
int64_t n_len;
|
||||
int64_t n_len_org;
|
||||
int64_t n_mel;
|
||||
int n_len;
|
||||
int n_len_org;
|
||||
int n_mel;
|
||||
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
struct mtmd_audio_mel_filters {
|
||||
int64_t n_mel;
|
||||
int64_t n_fft;
|
||||
int32_t n_mel;
|
||||
int32_t n_fft;
|
||||
|
||||
std::vector<float> data;
|
||||
};
|
||||
@@ -39,8 +39,8 @@ struct mtmd_audio_cache {
|
||||
|
||||
// Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
|
||||
// n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
|
||||
void fill_mel_filterbank_matrix(int64_t n_mel,
|
||||
int64_t n_fft,
|
||||
void fill_mel_filterbank_matrix(int n_mel,
|
||||
int n_fft,
|
||||
int sample_rate, // e.g. 16000
|
||||
float fmin = 0.0f, // e.g. 0.0
|
||||
float fmax = -1.0f, // e.g. sr/2; pass -1 for auto
|
||||
|
||||
@@ -396,9 +396,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
// Ctrl+C handling
|
||||
{
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
|
||||
@@ -582,29 +582,13 @@ mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx,
|
||||
}
|
||||
|
||||
mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) {
|
||||
#ifdef _WIN32
|
||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
|
||||
if (!wlen) {
|
||||
LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
std::vector<wchar_t> wfname(wlen);
|
||||
wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wfname.data(), wlen);
|
||||
if (!wlen) {
|
||||
LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname);
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
FILE * f = _wfopen(wfname.data(), L"rb");
|
||||
#else
|
||||
std::vector<unsigned char> buf;
|
||||
FILE * f = fopen(fname, "rb");
|
||||
#endif
|
||||
if (!f) {
|
||||
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
||||
return {nullptr, nullptr};
|
||||
}
|
||||
|
||||
std::vector<unsigned char> buf;
|
||||
|
||||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
|
||||
+2
-12
@@ -251,8 +251,6 @@ mtmd_context_params mtmd_context_params_default() {
|
||||
/* cb_eval */ nullptr,
|
||||
/* cb_eval_user_data */ nullptr,
|
||||
/* batch_max_tokens */ 1024,
|
||||
/* progress_callback */ nullptr,
|
||||
/* progress_callback_user_data */ nullptr,
|
||||
};
|
||||
return params;
|
||||
}
|
||||
@@ -347,8 +345,6 @@ struct mtmd_context {
|
||||
/* cb_eval */ ctx_params.cb_eval,
|
||||
/* cb_eval_user_data */ ctx_params.cb_eval_user_data,
|
||||
/* no_alloc */ no_alloc,
|
||||
/* progress_callback */ ctx_params.progress_callback,
|
||||
/* progress_callback_user_data */ ctx_params.progress_callback_user_data,
|
||||
};
|
||||
|
||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||
@@ -1299,12 +1295,9 @@ struct mtmd_tokenizer {
|
||||
for (auto & mel_spec : mel_spec_chunks) {
|
||||
const bool is_placeholder = mel_spec.data.empty();
|
||||
|
||||
// Validate dimensions fit in clip_image_size (int)
|
||||
GGML_ASSERT(mel_spec.n_len <= INT32_MAX && mel_spec.n_len >= 0);
|
||||
GGML_ASSERT(mel_spec.n_mel <= INT32_MAX && mel_spec.n_mel >= 0);
|
||||
clip_image_f32 mel_f32;
|
||||
mel_f32.set_size(
|
||||
{(int)mel_spec.n_len, (int)mel_spec.n_mel},
|
||||
{mel_spec.n_len, mel_spec.n_mel},
|
||||
is_placeholder, /* is_audio */ true);
|
||||
mel_f32.cpy_buf(mel_spec.data);
|
||||
|
||||
@@ -2137,12 +2130,9 @@ std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(const char * mmproj_f
|
||||
mtmd::context_ptr ctx;
|
||||
auto saved_log_callback = g_logger_state.log_callback;
|
||||
auto saved_log_user_data = g_logger_state.log_callback_user_data;
|
||||
|
||||
ctx_params.progress_callback = nullptr;
|
||||
|
||||
try {
|
||||
mtmd_log_set(stub_log_callback, nullptr); // suppress logging
|
||||
ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params, true));
|
||||
ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params));
|
||||
mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback
|
||||
std::map<ggml_backend_dev_t, size_t> total_mem;
|
||||
auto merge = [&](const struct clip_ctx * c) {
|
||||
|
||||
@@ -83,8 +83,6 @@ typedef struct mtmd_input_chunks mtmd_input_chunks;
|
||||
typedef struct mtmd_input_text mtmd_input_text;
|
||||
typedef struct mtmd_batch mtmd_batch;
|
||||
|
||||
typedef bool (*mtmd_progress_callback)(float progress, void * user_data);
|
||||
|
||||
struct mtmd_context_params {
|
||||
bool use_gpu;
|
||||
bool print_timings;
|
||||
@@ -106,12 +104,6 @@ struct mtmd_context_params {
|
||||
int32_t batch_max_tokens; // maximum number of output tokens in a batch
|
||||
// (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit)
|
||||
// (default: 1024)
|
||||
|
||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
// If the provided progress_callback returns true, model loading continues.
|
||||
// If it returns false, model loading is immediately aborted.
|
||||
mtmd_progress_callback progress_callback;
|
||||
void * progress_callback_user_data;
|
||||
};
|
||||
|
||||
MTMD_API const char * mtmd_default_marker(void);
|
||||
|
||||
@@ -180,17 +180,6 @@ That requires `JSON.stringify` when formatted to message content:
|
||||
}
|
||||
```
|
||||
|
||||
### Router mode: how child <--> router communicates
|
||||
|
||||
Upon spawning a new child process using `subprocess`, both child and router listen to the stdout/stderr (combined)
|
||||
|
||||
For the direction from child to router:
|
||||
- Generic messages are logs, it will be forwarded to router's stdout
|
||||
- Special state update messages are prefixed by `cmd_child_to_router:state:`, followed by a JSON. See `server_models::handle_child_state` for more
|
||||
|
||||
For the direction from router to child:
|
||||
- When server sends `cmd_router_to_child:exit`, the child should exit gracefully --> if after `DEFAULT_STOP_TIMEOUT` and the child is still running, force-kill it
|
||||
|
||||
### Model management API (router mode)
|
||||
|
||||
Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)
|
||||
|
||||
+13
-35
@@ -175,12 +175,13 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `-np, --parallel N` | number of server slots (default: -1, -1 = auto)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
||||
| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint<br/>(env: LLAMA_ARG_TALKER_MODEL) |
|
||||
| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer<br/>(env: LLAMA_ARG_CODE2WAV_MODEL) |
|
||||
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
|
||||
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
|
||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||
| `--mtmd-batch-max-tokens N` | maximum number of image tokens per batch when encoding images (default: 1024)<br/>(env: LLAMA_ARG_MTMD_BATCH_MAX_TOKENS) |
|
||||
| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
|
||||
| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) |
|
||||
@@ -189,21 +190,23 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
|
||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
||||
| `--ui-config, --webui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
|
||||
| `--ui-config-file, --webui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
|
||||
| `--ui-mcp-proxy, --webui-mcp-proxy, --no-ui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
|
||||
| `--webui-config JSON` | [DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
|
||||
| `--ui-config JSON` | JSON that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG) |
|
||||
| `--webui-config-file PATH` | [DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
|
||||
| `--ui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)<br/>(env: LLAMA_ARG_UI_CONFIG_FILE) |
|
||||
| `--webui-mcp-proxy, --no-webui-mcp-proxy` | [DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy<br/>(env: LLAMA_ARG_WEBUI_MCP_PROXY) |
|
||||
| `--ui-mcp-proxy, --no-ui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_UI_MCP_PROXY) |
|
||||
| `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)<br/>specify "all" to enable all tools<br/>available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime<br/>(env: LLAMA_ARG_TOOLS) |
|
||||
| `-ag, --agent, -no-ag, --no-agent` | whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)<br/>(env: LLAMA_ARG_AGENT) |
|
||||
| `--ui, --webui, --no-ui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
|
||||
| `--webui, --no-webui` | [DEPRECATED: use --ui/--no-ui] whether to enable the Web UI<br/>(env: LLAMA_ARG_WEBUI) |
|
||||
| `--ui, --no-ui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_UI) |
|
||||
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||
| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
|
||||
| `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)<br/>(env: LLAMA_API_KEY) |
|
||||
| `--api-key-file FNAME` | path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
|
||||
| `--api-key-file FNAME` | path to file containing API keys (default: none)<br/>(env: LLAMA_ARG_API_KEY_FILE) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
|
||||
| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) |
|
||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 3600)<br/>(env: LLAMA_ARG_TIMEOUT) |
|
||||
| `--sse-ping-interval N` | server SSE ping interval in seconds (-1 = disabled, default: 30)<br/>(env: LLAMA_ARG_SSE_PING_INTERVAL) |
|
||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||
| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)<br/>(env: LLAMA_ARG_CACHE_PROMPT) |
|
||||
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
|
||||
@@ -228,7 +231,6 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
|
||||
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
||||
| `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) |
|
||||
| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) |
|
||||
| `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) |
|
||||
| `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
|
||||
| `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
|
||||
@@ -1859,33 +1861,9 @@ Example events:
|
||||
|
||||
{
|
||||
"model": "...",
|
||||
"event": "model_status",
|
||||
"event": "download_finished",
|
||||
"data": {
|
||||
"status": "loading",
|
||||
"progress": {
|
||||
"stage": "fit_params",
|
||||
"value": 0.5 // from 0.0 to 1.0 ; note: not all stages have this "value"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
"model": "...",
|
||||
"event": "model_status",
|
||||
"data": {
|
||||
"status": "loaded",
|
||||
"info": {
|
||||
// note: only include info on first load
|
||||
// waking up from sleep doesn't have this
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
"model": "...",
|
||||
"event": "model_status",
|
||||
"data": {
|
||||
"status": "sleeping"
|
||||
"status": "loading"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user