mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-25 15:17:41 +02:00
Compare commits
63 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6bd12ce409 | |||
| 5442939fcc | |||
| 56411a950f | |||
| 2b737caae1 | |||
| ee3dff6b8e | |||
| edc29433fa | |||
| 8b99e2aa66 | |||
| 271ff3fc44 | |||
| e2b065071c | |||
| 0548a4187f | |||
| 9335b969e8 | |||
| c41767154e | |||
| 74b239b3d5 | |||
| 852aafb163 | |||
| 0136966daf | |||
| 10b1e45876 | |||
| 197c00681b | |||
| 95f84d5ce8 | |||
| 5487593bc7 | |||
| 1d8fca72ae | |||
| 62bfef5194 | |||
| eaf6e03174 | |||
| d6ef0e77dd | |||
| dff451cfa1 | |||
| d298382ad9 | |||
| 32a28217f4 | |||
| c429b33beb | |||
| 9146d36fe7 | |||
| b9adcbbf92 | |||
| 9588f196b1 | |||
| 3cbd23ed88 | |||
| 00c6390793 | |||
| faa0e6979a | |||
| 9791f40258 | |||
| 902184dd3a | |||
| 57684331fc | |||
| b83bab15a5 | |||
| d041d2ceaa | |||
| 27891f6db0 | |||
| fbca2f27fc | |||
| 0df0aa8e43 | |||
| 74f33adf5f | |||
| 1debe72737 | |||
| 007489e895 | |||
| 8b94e799df | |||
| 3015851c5a | |||
| 55ac3b7aea | |||
| dacfcebd60 | |||
| 9b82476ee9 | |||
| a61a94e543 | |||
| 152da28ae5 | |||
| d48c88cbd5 | |||
| e84b71c2c6 | |||
| 1b1e27cb49 | |||
| fbf777d2b9 | |||
| cd93a28cb1 | |||
| 1e374365d1 | |||
| 197ff91462 | |||
| 6ff13987ad | |||
| 38c03478a3 | |||
| b18532a4ef | |||
| fcda1128bc | |||
| 03d8900ebe |
@@ -0,0 +1,50 @@
|
||||
name: Low Severity Bugs
|
||||
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "low severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
@@ -0,0 +1,50 @@
|
||||
name: Medium Severity Bug
|
||||
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "medium severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
@@ -0,0 +1,50 @@
|
||||
name: High Severity Bug
|
||||
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "high severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
@@ -0,0 +1,50 @@
|
||||
name: Critical Severity Bug
|
||||
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "critical severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
@@ -0,0 +1,51 @@
|
||||
name: Enhancement
|
||||
description: Used to request enhancements for llama.cpp
|
||||
title: "Feature Request: "
|
||||
labels: ["enhancement"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
|
||||
|
||||
- type: checkboxes
|
||||
id: prerequisites
|
||||
attributes:
|
||||
label: Prerequisites
|
||||
description: Please confirm the following before submitting your enhancement request.
|
||||
options:
|
||||
- label: I am running the latest code. Mention the version if possible as well.
|
||||
required: true
|
||||
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
required: true
|
||||
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
|
||||
required: true
|
||||
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: feature-description
|
||||
attributes:
|
||||
label: Feature Description
|
||||
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
||||
placeholder: Detailed description of the enhancement
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: motivation
|
||||
attributes:
|
||||
label: Motivation
|
||||
description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
||||
placeholder: Explanation of why this feature is needed and its benefits
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: possible-implementation
|
||||
attributes:
|
||||
label: Possible Implementation
|
||||
description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
||||
placeholder: Detailed description of potential implementation
|
||||
validations:
|
||||
required: false
|
||||
@@ -0,0 +1,38 @@
|
||||
name: Question
|
||||
description: Used to ask questions about llama.cpp
|
||||
title: "Question: "
|
||||
labels: ["question"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
[Please search your question first in Discussion if you got a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a)
|
||||
|
||||
- type: checkboxes
|
||||
id: prerequisites
|
||||
attributes:
|
||||
label: Prerequisites
|
||||
description: Please confirm the following before submitting your question.
|
||||
options:
|
||||
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
|
||||
required: true
|
||||
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions.
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: background-description
|
||||
attributes:
|
||||
label: Background Description
|
||||
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an question.
|
||||
placeholder: Detailed description of your question
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: possible-answer
|
||||
attributes:
|
||||
label: Possible Answer
|
||||
description: If you have some idea of possible answers you want to confirm, that would also be appreciated.
|
||||
placeholder: Your idea of possible answers
|
||||
validations:
|
||||
required: false
|
||||
@@ -0,0 +1,28 @@
|
||||
name: Refactor (Maintainers)
|
||||
description: Used to track refactoring opportunities
|
||||
title: "Refactor: "
|
||||
labels: ["refactor"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
|
||||
Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
|
||||
|
||||
- type: textarea
|
||||
id: background-description
|
||||
attributes:
|
||||
label: Background Description
|
||||
description: Please provide a detailed written description of the pain points you are trying to solve.
|
||||
placeholder: Detailed description behind your motivation to request refactor
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: possible-approaches
|
||||
attributes:
|
||||
label: Possible Refactor Approaches
|
||||
description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
|
||||
placeholder: Your idea of possible refactoring opportunity/approaches
|
||||
validations:
|
||||
required: false
|
||||
@@ -1,11 +0,0 @@
|
||||
---
|
||||
name: Bug template
|
||||
about: Used to report bugs in llama.cpp
|
||||
labels: ["bug-unconfirmed"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
|
||||
|
||||
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
|
||||
@@ -1,28 +0,0 @@
|
||||
---
|
||||
name: Enhancement template
|
||||
about: Used to request enhancements for llama.cpp
|
||||
labels: ["enhancement"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# Prerequisites
|
||||
|
||||
Please answer the following questions for yourself before submitting an issue.
|
||||
|
||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
||||
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
||||
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
|
||||
|
||||
# Feature Description
|
||||
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
||||
|
||||
# Motivation
|
||||
|
||||
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
||||
|
||||
# Possible Implementation
|
||||
|
||||
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
||||
+18
-1
@@ -1,5 +1,16 @@
|
||||
# https://github.com/actions/labeler
|
||||
|
||||
Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-kompute.h
|
||||
- ggml-kompute.cpp
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-metal.h
|
||||
- ggml-metal.cpp
|
||||
- README-metal.md
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -9,6 +20,7 @@ SYCL:
|
||||
Nvidia GPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-cuda.h
|
||||
- ggml-cuda/**
|
||||
Vulkan:
|
||||
- changed-files:
|
||||
@@ -62,6 +74,8 @@ server:
|
||||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml.c
|
||||
- ggml.h
|
||||
- ggml-*.c
|
||||
- ggml-*.h
|
||||
- ggml-cuda/**
|
||||
@@ -71,3 +85,6 @@ nix:
|
||||
- "**/*.nix"
|
||||
- .github/workflows/nix-*.yml
|
||||
- .devops/nix/nixpkgs-instances.nix
|
||||
embedding:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/embedding/
|
||||
|
||||
@@ -42,8 +42,9 @@ jobs:
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
# TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
|
||||
#- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
#- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
name: Zig CI
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
runs-on: [ubuntu-latest, macos-latest, windows-latest]
|
||||
runs-on: ${{ matrix.runs-on }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
fetch-depth: 0
|
||||
- uses: goto-bus-stop/setup-zig@v2
|
||||
with:
|
||||
version: 0.11.0
|
||||
- name: Build Summary
|
||||
run: zig build --summary all -freference-trace
|
||||
+10
-5
@@ -72,6 +72,7 @@ else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
option(LLAMA_SVE "llama: enable SVE" OFF)
|
||||
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
@@ -124,7 +125,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_RPC "llama: use RPC" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
|
||||
@@ -384,10 +384,6 @@ if (LLAMA_LLAMAFILE)
|
||||
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
|
||||
endif()
|
||||
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
|
||||
set(LLAMA_CUDA ON)
|
||||
@@ -505,6 +501,12 @@ if (LLAMA_VULKAN)
|
||||
|
||||
add_compile_definitions(GGML_USE_VULKAN)
|
||||
|
||||
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
|
||||
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
|
||||
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
|
||||
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN_CHECK_RESULTS)
|
||||
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
|
||||
endif()
|
||||
@@ -1039,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
if (LLAMA_SVE)
|
||||
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
||||
endif()
|
||||
endif()
|
||||
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
|
||||
+6
-2
@@ -1,4 +1,4 @@
|
||||
{
|
||||
{
|
||||
"version": 4,
|
||||
"configurePresets": [
|
||||
{
|
||||
@@ -40,6 +40,10 @@
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
|
||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
|
||||
]
|
||||
}
|
||||
|
||||
@@ -389,10 +389,6 @@ else
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifdef LLAMA_QKK_64
|
||||
MK_CPPFLAGS += -DGGML_QKK_64
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac OS - include Accelerate framework.
|
||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||
@@ -445,6 +441,9 @@ endif # JETSON_EOL_MODULE_DETECT
|
||||
ifdef LLAMA_DEBUG
|
||||
MK_NVCCFLAGS += -lineinfo
|
||||
endif # LLAMA_DEBUG
|
||||
ifdef LLAMA_CUDA_DEBUG
|
||||
MK_NVCCFLAGS += --device-debug
|
||||
endif # LLAMA_CUDA_DEBUG
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
|
||||
else
|
||||
|
||||
@@ -127,6 +127,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
- [x] [OLMo](https://allenai.org/olmo)
|
||||
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||
|
||||
@@ -140,6 +141,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
||||
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
||||
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
|
||||
|
||||
**HTTP server**
|
||||
|
||||
@@ -201,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
**Tools:**
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
|
||||
---
|
||||
|
||||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||
|
||||
@@ -1,172 +0,0 @@
|
||||
// Compatible with Zig Version 0.11.0
|
||||
const std = @import("std");
|
||||
const ArrayList = std.ArrayList;
|
||||
const Compile = std.Build.Step.Compile;
|
||||
const ConfigHeader = std.Build.Step.ConfigHeader;
|
||||
const Mode = std.builtin.Mode;
|
||||
const CrossTarget = std.zig.CrossTarget;
|
||||
|
||||
const Maker = struct {
|
||||
builder: *std.build.Builder,
|
||||
target: CrossTarget,
|
||||
optimize: Mode,
|
||||
enable_lto: bool,
|
||||
|
||||
include_dirs: ArrayList([]const u8),
|
||||
cflags: ArrayList([]const u8),
|
||||
cxxflags: ArrayList([]const u8),
|
||||
objs: ArrayList(*Compile),
|
||||
|
||||
fn addInclude(m: *Maker, dir: []const u8) !void {
|
||||
try m.include_dirs.append(dir);
|
||||
}
|
||||
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
|
||||
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
|
||||
}
|
||||
fn addCFlag(m: *Maker, flag: []const u8) !void {
|
||||
try m.cflags.append(flag);
|
||||
}
|
||||
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
|
||||
try m.cxxflags.append(flag);
|
||||
}
|
||||
fn addFlag(m: *Maker, flag: []const u8) !void {
|
||||
try m.addCFlag(flag);
|
||||
try m.addCxxFlag(flag);
|
||||
}
|
||||
|
||||
fn init(builder: *std.build.Builder) !Maker {
|
||||
const target = builder.standardTargetOptions(.{});
|
||||
const zig_version = @import("builtin").zig_version_string;
|
||||
const commit_hash = try std.ChildProcess.exec(
|
||||
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
|
||||
);
|
||||
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
|
||||
\\int LLAMA_BUILD_NUMBER = {};
|
||||
\\char const *LLAMA_COMMIT = "{s}";
|
||||
\\char const *LLAMA_COMPILER = "Zig {s}";
|
||||
\\char const *LLAMA_BUILD_TARGET = "{s}";
|
||||
\\
|
||||
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
|
||||
var m = Maker{
|
||||
.builder = builder,
|
||||
.target = target,
|
||||
.optimize = builder.standardOptimizeOption(.{}),
|
||||
.enable_lto = false,
|
||||
.include_dirs = ArrayList([]const u8).init(builder.allocator),
|
||||
.cflags = ArrayList([]const u8).init(builder.allocator),
|
||||
.cxxflags = ArrayList([]const u8).init(builder.allocator),
|
||||
.objs = ArrayList(*Compile).init(builder.allocator),
|
||||
};
|
||||
|
||||
try m.addCFlag("-std=c11");
|
||||
try m.addCxxFlag("-std=c++11");
|
||||
try m.addProjectInclude(&.{});
|
||||
try m.addProjectInclude(&.{"common"});
|
||||
return m;
|
||||
}
|
||||
|
||||
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
|
||||
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
||||
if (o.target.getAbi() != .msvc)
|
||||
o.defineCMacro("_GNU_SOURCE", null);
|
||||
|
||||
if (std.mem.endsWith(u8, src, ".c")) {
|
||||
o.addCSourceFiles(&.{src}, m.cflags.items);
|
||||
o.linkLibC();
|
||||
} else {
|
||||
o.addCSourceFiles(&.{src}, m.cxxflags.items);
|
||||
if (o.target.getAbi() == .msvc) {
|
||||
o.linkLibC(); // need winsdk + crt
|
||||
} else {
|
||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||
o.linkLibCpp();
|
||||
}
|
||||
}
|
||||
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
||||
o.want_lto = m.enable_lto;
|
||||
return o;
|
||||
}
|
||||
|
||||
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
|
||||
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
||||
e.addCSourceFiles(&.{src}, m.cxxflags.items);
|
||||
for (deps) |d| e.addObject(d);
|
||||
for (m.objs.items) |o| e.addObject(o);
|
||||
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
|
||||
|
||||
// https://github.com/ziglang/zig/issues/15448
|
||||
if (e.target.getAbi() == .msvc) {
|
||||
e.linkLibC(); // need winsdk + crt
|
||||
} else {
|
||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
||||
e.linkLibCpp();
|
||||
}
|
||||
m.builder.installArtifact(e);
|
||||
e.want_lto = m.enable_lto;
|
||||
return e;
|
||||
}
|
||||
};
|
||||
|
||||
pub fn build(b: *std.build.Builder) !void {
|
||||
var make = try Maker.init(b);
|
||||
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
|
||||
|
||||
const ggml = make.obj("ggml", "ggml.c");
|
||||
const sgemm = make.obj("sgemm", "sgemm.cpp");
|
||||
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
||||
const unicode = make.obj("unicode", "unicode.cpp");
|
||||
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
|
||||
const llama = make.obj("llama", "llama.cpp");
|
||||
const buildinfo = make.obj("common", "common/build-info.cpp");
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("console", "common/console.cpp");
|
||||
const sampling = make.obj("sampling", "common/sampling.cpp");
|
||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
||||
const llava = make.obj("llava", "examples/llava/llava.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
|
||||
if (server.target.isWindows()) {
|
||||
server.linkSystemLibrary("ws2_32");
|
||||
}
|
||||
|
||||
const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
|
||||
for (server_assets) |asset| {
|
||||
const input_path = b.fmt("examples/server/public/{s}", .{asset});
|
||||
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
|
||||
|
||||
// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
|
||||
|
||||
const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
|
||||
defer b.allocator.free(input);
|
||||
|
||||
var buf = std.ArrayList(u8).init(b.allocator);
|
||||
defer buf.deinit();
|
||||
|
||||
for (input) |byte| {
|
||||
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
|
||||
}
|
||||
|
||||
var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
|
||||
defer b.allocator.free(name);
|
||||
std.mem.replaceScalar(u8, name, '.', '_');
|
||||
|
||||
try std.fs.cwd().writeFile(output_path, b.fmt(
|
||||
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
|
||||
.{ name, buf.items, name, input.len },
|
||||
));
|
||||
|
||||
std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
|
||||
}
|
||||
}
|
||||
@@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
|
||||
}
|
||||
|
||||
function gg_get_model {
|
||||
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
||||
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_3b ]]; then
|
||||
echo -n "$gguf_3b"
|
||||
elif [[ -s $gguf_7b ]]; then
|
||||
echo -n "$gguf_7b"
|
||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_0 ]]; then
|
||||
echo -n "$gguf_0"
|
||||
elif [[ -s $gguf_1 ]]; then
|
||||
echo -n "$gguf_1"
|
||||
elif [[ -s $gguf_2 ]]; then
|
||||
echo -n "$gguf_2"
|
||||
else
|
||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||
exit 1
|
||||
@@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_3b_v2
|
||||
|
||||
function gg_run_open_llama_3b_v2 {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||
|
||||
path_models="../models-mnt/open-llama/3B-v2"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_open_llama_3b_v2 {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
@@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
@@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# pythia_1.4b
|
||||
|
||||
function gg_run_pythia_1_4b {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||
|
||||
path_models="../models-mnt/pythia/1.4B"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_pythia_1_4b {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Pythia 1.4B:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# pythia_2_8b
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
function gg_run_pythia_2_8b {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
|
||||
path_models="../models-mnt/pythia/2.8B"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_pythia_2_8b {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Pythia 2.8B:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# bge-small
|
||||
|
||||
function gg_run_embd_bge_small {
|
||||
@@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models}
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
@@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
|
||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
||||
test $ret -eq 0 && gg_run pythia_1_4b
|
||||
else
|
||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
test $ret -eq 0 && gg_run pythia_2_8b
|
||||
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||
|
||||
+649
-699
File diff suppressed because it is too large
Load Diff
+48
-43
@@ -27,7 +27,7 @@
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
@@ -35,14 +35,18 @@
|
||||
|
||||
// build info
|
||||
extern int LLAMA_BUILD_NUMBER;
|
||||
extern char const *LLAMA_COMMIT;
|
||||
extern char const *LLAMA_COMPILER;
|
||||
extern char const *LLAMA_BUILD_TARGET;
|
||||
extern char const * LLAMA_COMMIT;
|
||||
extern char const * LLAMA_COMPILER;
|
||||
extern char const * LLAMA_BUILD_TARGET;
|
||||
|
||||
struct llama_control_vector_load_info;
|
||||
|
||||
int get_math_cpu_count();
|
||||
int32_t get_num_physical_cores();
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
|
||||
int32_t cpu_get_num_physical_cores();
|
||||
int32_t cpu_get_num_math();
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
@@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
|
||||
struct gpt_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||
|
||||
int32_t n_threads = get_math_cpu_count();
|
||||
int32_t n_threads = cpu_get_num_math();
|
||||
int32_t n_threads_draft = -1;
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_threads_batch_draft = -1;
|
||||
@@ -142,6 +146,7 @@ struct gpt_params {
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
||||
bool special = false; // enable special token output
|
||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
@@ -179,33 +184,34 @@ struct gpt_params {
|
||||
|
||||
void gpt_params_handle_model_default(gpt_params & params);
|
||||
|
||||
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
||||
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
|
||||
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
||||
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||
|
||||
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
||||
|
||||
std::string get_system_info(const gpt_params & params);
|
||||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng);
|
||||
|
||||
void process_escapes(std::string& input);
|
||||
|
||||
bool validate_file_name(const std::string & filename);
|
||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||
|
||||
//
|
||||
// String utils
|
||||
//
|
||||
|
||||
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
|
||||
std::vector<std::string> string_split(std::string input, char separator);
|
||||
|
||||
std::string string_strip(const std::string & str);
|
||||
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
|
||||
std::string string_get_sortable_timestamp();
|
||||
std::string string_random_prompt(std::mt19937 & rng);
|
||||
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
bool fs_validate_filename(const std::string & filename);
|
||||
bool fs_create_directory_with_parents(const std::string & path);
|
||||
|
||||
std::string fs_get_cache_directory();
|
||||
|
||||
//
|
||||
// Model utils
|
||||
@@ -276,30 +282,15 @@ std::string llama_detokenize_bpe(
|
||||
// defaults to true when model type is SPM, otherwise false.
|
||||
bool llama_should_add_bos_token(const llama_model * model);
|
||||
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
|
||||
bool create_directory_with_parents(const std::string & path);
|
||||
std::string get_cache_directory();
|
||||
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
|
||||
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
|
||||
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||
std::string get_sortable_timestamp();
|
||||
|
||||
void dump_non_result_info_yaml(
|
||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
||||
//
|
||||
// KV cache utils
|
||||
//
|
||||
|
||||
// Dump the KV cache view with the number of sequences per cell.
|
||||
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||
void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
|
||||
|
||||
// Dump the KV cache view showing individual sequences in each cell (long output).
|
||||
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||
void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
|
||||
|
||||
//
|
||||
// Embedding utils
|
||||
@@ -333,6 +324,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
||||
//
|
||||
// Split utils
|
||||
//
|
||||
|
||||
static const char * const LLM_KV_SPLIT_NO = "split.no";
|
||||
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
|
||||
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
|
||||
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
|
||||
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
|
||||
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||
|
||||
void yaml_dump_non_result_info(
|
||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
||||
|
||||
+82
-1
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||
std::string result = "CFG -> Penalties ";
|
||||
if (params.mirostat == 0) {
|
||||
for (auto sampler_type : params.samplers_sequence) {
|
||||
const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
|
||||
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
|
||||
if (!sampler_type_name.empty()) {
|
||||
result += "-> " + sampler_type_name + " ";
|
||||
}
|
||||
@@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
||||
switch (sampler_type) {
|
||||
case llama_sampler_type::TOP_K: return "top_k";
|
||||
case llama_sampler_type::TFS_Z: return "tfs_z";
|
||||
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
||||
case llama_sampler_type::TOP_P: return "top_p";
|
||||
case llama_sampler_type::MIN_P: return "min_p";
|
||||
case llama_sampler_type::TEMPERATURE: return "temperature";
|
||||
default : return "";
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
|
||||
{"top_k", llama_sampler_type::TOP_K},
|
||||
{"top_p", llama_sampler_type::TOP_P},
|
||||
{"typical_p", llama_sampler_type::TYPICAL_P},
|
||||
{"min_p", llama_sampler_type::MIN_P},
|
||||
{"tfs_z", llama_sampler_type::TFS_Z},
|
||||
{"temperature", llama_sampler_type::TEMPERATURE}
|
||||
};
|
||||
|
||||
// since samplers names are written multiple ways
|
||||
// make it ready for both system names and input names
|
||||
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
|
||||
{"top-k", llama_sampler_type::TOP_K},
|
||||
{"top-p", llama_sampler_type::TOP_P},
|
||||
{"nucleus", llama_sampler_type::TOP_P},
|
||||
{"typical-p", llama_sampler_type::TYPICAL_P},
|
||||
{"typical", llama_sampler_type::TYPICAL_P},
|
||||
{"min-p", llama_sampler_type::MIN_P},
|
||||
{"tfs-z", llama_sampler_type::TFS_Z},
|
||||
{"tfs", llama_sampler_type::TFS_Z},
|
||||
{"temp", llama_sampler_type::TEMPERATURE}
|
||||
};
|
||||
|
||||
std::vector<llama_sampler_type> sampler_types;
|
||||
sampler_types.reserve(names.size());
|
||||
for (const auto & name : names)
|
||||
{
|
||||
auto sampler_item = sampler_canonical_name_map.find(name);
|
||||
if (sampler_item != sampler_canonical_name_map.end())
|
||||
{
|
||||
sampler_types.push_back(sampler_item->second);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (allow_alt_names)
|
||||
{
|
||||
sampler_item = sampler_alt_name_map.find(name);
|
||||
if (sampler_item != sampler_alt_name_map.end())
|
||||
{
|
||||
sampler_types.push_back(sampler_item->second);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return sampler_types;
|
||||
}
|
||||
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
|
||||
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
||||
{'k', llama_sampler_type::TOP_K},
|
||||
{'p', llama_sampler_type::TOP_P},
|
||||
{'y', llama_sampler_type::TYPICAL_P},
|
||||
{'m', llama_sampler_type::MIN_P},
|
||||
{'f', llama_sampler_type::TFS_Z},
|
||||
{'t', llama_sampler_type::TEMPERATURE}
|
||||
};
|
||||
|
||||
std::vector<llama_sampler_type> sampler_types;
|
||||
sampler_types.reserve(names_string.size());
|
||||
for (const auto & c : names_string) {
|
||||
const auto sampler_item = sampler_name_map.find(c);
|
||||
if (sampler_item != sampler_name_map.end()) {
|
||||
sampler_types.push_back(sampler_item->second);
|
||||
}
|
||||
}
|
||||
return sampler_types;
|
||||
}
|
||||
|
||||
// no reasons to expose this function in header
|
||||
static void sampler_queue(
|
||||
struct llama_context * ctx_main,
|
||||
|
||||
@@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
|
||||
// Print sampling order into a string
|
||||
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
||||
|
||||
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
|
||||
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
|
||||
|
||||
// this is a common sampling function used across the examples for convenience
|
||||
// it can serve as a starting point for implementing your own sampling function
|
||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||
|
||||
+2
-2
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
|
||||
|
||||
params.custom_n_ctx = false;
|
||||
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_checkpointing = true;
|
||||
|
||||
params.sample_start = "";
|
||||
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
|
||||
|
||||
void finish_processing_train_args(struct train_params_common * params) {
|
||||
if (params->escape) {
|
||||
process_escapes(params->sample_start);
|
||||
string_process_escapes(params->sample_start);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -81,6 +81,7 @@ models = [
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
+289
-7
@@ -313,11 +313,10 @@ class Model:
|
||||
data = data.astype(np.float32)
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
|
||||
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
||||
|
||||
# reverse shape to make it similar to the internal ggml dimension order
|
||||
shape_str = f"""{{{', '.join(str(n) for n in reversed(
|
||||
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
|
||||
)}}}"""
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
|
||||
|
||||
# n_dims is implicit in the shape
|
||||
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
@@ -474,6 +473,9 @@ class Model:
|
||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||
res = "smaug-bpe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -673,6 +675,44 @@ class GPTNeoXModel(Model):
|
||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
|
||||
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||
qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||
data_torch = torch.cat(
|
||||
(
|
||||
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
|
||||
),
|
||||
dim=0,
|
||||
)
|
||||
logger.info("re-format attention.linear_qkv.weight")
|
||||
elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
|
||||
qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
|
||||
data_torch = torch.cat(
|
||||
(
|
||||
qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||
qkv_bias[:, 2, :].reshape((n_embed,)),
|
||||
),
|
||||
dim=0,
|
||||
)
|
||||
logger.info("re-format attention.linear_qkv.bias")
|
||||
|
||||
tensors.append((self.map_tensor_name(name), data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
|
||||
@Model.register("BloomForCausalLM")
|
||||
class BloomModel(Model):
|
||||
@@ -1277,6 +1317,17 @@ class LlamaModel(Model):
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
if "add_prefix_space" in tokenizer_config_json:
|
||||
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
|
||||
|
||||
# Apply to granite small models only
|
||||
if self.hparams.get("vocab_size", 32000) == 49152:
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
@@ -1291,9 +1342,9 @@ class LlamaModel(Model):
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
if name.endswith(("q_proj.weight", "q_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
if name.endswith(("k_proj.weight", "k_proj.bias")):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
@@ -2355,7 +2406,8 @@ class CommandR2Model(Model):
|
||||
|
||||
# max_position_embeddings = 8192 in config.json but model was actually
|
||||
# trained on 128k context length
|
||||
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
|
||||
# aya-23 models don't have model_max_length specified
|
||||
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
@@ -2428,6 +2480,236 @@ class JinaBertV2Model(BertModel):
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
|
||||
@Model.register("ArcticForCausalLM")
|
||||
class ArcticModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.ARCTIC
|
||||
|
||||
def set_vocab(self):
|
||||
# The reason for using a custom implementation here is that the
|
||||
# snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
|
||||
# tokenizer.model and used them as BOS and EOS instead of adding new tokens.
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
logger.error(f'Error: Missing {tokenizer_path}')
|
||||
sys.exit(1)
|
||||
|
||||
# Read the whole vocabulary from the tokenizer.model file
|
||||
tokenizer = SentencePieceProcessor()
|
||||
tokenizer.LoadFromFile(str(tokenizer_path))
|
||||
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
|
||||
piece = tokenizer.IdToPiece(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.GetScore(token_id)
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if tokenizer.IsUnknown(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif tokenizer.IsControl(token_id):
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif tokenizer.IsUnused(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.IsByte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens[token_id] = text
|
||||
scores[token_id] = score
|
||||
toktypes[token_id] = toktype
|
||||
|
||||
# Use the added_tokens_decoder field from tokeniser_config.json as the source
|
||||
# of information about added/redefined tokens and modify them accordingly.
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
|
||||
if "added_tokens_decoder" in tokenizer_config_json:
|
||||
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
|
||||
for token_id, token_json in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
if (token_id >= vocab_size):
|
||||
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
token_content = token_json["content"]
|
||||
token_type = SentencePieceTokenTypes.USER_DEFINED
|
||||
token_score = -10000.0
|
||||
|
||||
# Map unk_token to UNKNOWN, other special tokens to CONTROL
|
||||
# Set the score to 0.0 as in the original tokenizer.model
|
||||
if ("special" in token_json) and token_json["special"]:
|
||||
if token_content == tokenizer_config_json["unk_token"]:
|
||||
token_type = SentencePieceTokenTypes.UNKNOWN
|
||||
else:
|
||||
token_type = SentencePieceTokenTypes.CONTROL
|
||||
token_score = 0.0
|
||||
|
||||
logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
|
||||
tokens[token_id] = token_content.encode("utf-8")
|
||||
toktypes[token_id] = token_type
|
||||
scores[token_id] = token_score
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("block_sparse_moe.experts") != -1:
|
||||
n_experts = self.hparams["num_local_experts"]
|
||||
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for wid in ["w1", "w2", "w3"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("DeepseekV2ForCausalLM")
|
||||
class DeepseekV2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
|
||||
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
|
||||
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
||||
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_value_length(hparams["v_head_dim"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
||||
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
|
||||
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
|
||||
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
|
||||
if self.hparams["rope_scaling"].get("type") == "yarn":
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
|
||||
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# process the experts separately
|
||||
if name.find("mlp.experts") != -1:
|
||||
n_experts = self.hparams["n_routed_experts"]
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "Hello my name is";
|
||||
}
|
||||
|
||||
process_escapes(params.prompt);
|
||||
string_process_escapes(params.prompt);
|
||||
|
||||
// init LLM
|
||||
|
||||
|
||||
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
|
||||
|
||||
params.samples_start_after_nl = false;
|
||||
params.use_adam = true;
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_scratch = true;
|
||||
|
||||
// only adam
|
||||
|
||||
@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
// split the prompt into lines
|
||||
|
||||
@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
bool OK = run(ctx, params);
|
||||
|
||||
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
|
||||
struct ggml_tensor * t16;
|
||||
if (enable_flash_attn) {
|
||||
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
||||
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
||||
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
||||
} else {
|
||||
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
||||
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
||||
|
||||
@@ -598,7 +598,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
sparams.dataset = params.prompt_file;
|
||||
@@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
|
||||
|
||||
@@ -50,9 +50,9 @@ static void write_logfile(
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
const std::string timestamp = string_get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
@@ -70,7 +70,7 @@ static void write_logfile(
|
||||
fprintf(logfile, "binary: infill\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
@@ -78,8 +78,8 @@ static void write_logfile(
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
GGML_ASSERT(llama_add_eos_token(model) != 1);
|
||||
@@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (params.escape) {
|
||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||
process_escapes(params.input_prefix);
|
||||
process_escapes(params.input_suffix);
|
||||
string_process_escapes(params.input_prefix);
|
||||
string_process_escapes(params.input_suffix);
|
||||
}
|
||||
suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
|
||||
@@ -195,12 +195,12 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
||||
/* n_prompt */ {512},
|
||||
/* n_gen */ {128},
|
||||
/* n_pg */ {{512, 128}},
|
||||
/* n_pg */ {},
|
||||
/* n_batch */ {2048},
|
||||
/* n_ubatch */ {512},
|
||||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {get_math_cpu_count()},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
/* main_gpu */ {0},
|
||||
|
||||
@@ -7,8 +7,6 @@ android {
|
||||
namespace = "com.example.llama"
|
||||
compileSdk = 34
|
||||
|
||||
ndkVersion = "26.1.10909125"
|
||||
|
||||
defaultConfig {
|
||||
applicationId = "com.example.llama"
|
||||
minSdk = 33
|
||||
@@ -20,17 +18,6 @@ android {
|
||||
vectorDrawables {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
@@ -55,17 +42,6 @@ android {
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion = "1.5.1"
|
||||
}
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path = file("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
@@ -78,6 +54,7 @@ dependencies {
|
||||
implementation("androidx.compose.ui:ui-graphics")
|
||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||
implementation("androidx.compose.material3:material3")
|
||||
implementation(project(":llama"))
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.llama.cpp.LLamaAndroid
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
|
||||
import kotlinx.coroutines.flow.catch
|
||||
import kotlinx.coroutines.launch
|
||||
|
||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val NanosPerSecond = 1_000_000_000.0
|
||||
@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.unload()
|
||||
llamaAndroid.unload()
|
||||
} catch (exc: IllegalStateException) {
|
||||
messages += exc.message!!
|
||||
}
|
||||
@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
messages += ""
|
||||
|
||||
viewModelScope.launch {
|
||||
llm.send(text)
|
||||
llamaAndroid.send(text)
|
||||
.catch {
|
||||
Log.e(tag, "send() failed", it)
|
||||
messages += it.message!!
|
||||
@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
val start = System.nanoTime()
|
||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
||||
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
|
||||
val end = System.nanoTime()
|
||||
|
||||
messages += warmupResult
|
||||
@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
return@launch
|
||||
}
|
||||
|
||||
messages += llm.bench(512, 128, 1, 3)
|
||||
messages += llamaAndroid.bench(512, 128, 1, 3)
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "bench() failed", exc)
|
||||
messages += exc.message!!
|
||||
@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
fun load(pathToModel: String) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.load(pathToModel)
|
||||
llamaAndroid.load(pathToModel)
|
||||
messages += "Loaded $pathToModel"
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "load() failed", exc)
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
plugins {
|
||||
id("com.android.application") version "8.2.0" apply false
|
||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||
id("com.android.library") version "8.2.0" apply false
|
||||
}
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
/build
|
||||
+1
-1
@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
@@ -0,0 +1,68 @@
|
||||
plugins {
|
||||
id("com.android.library")
|
||||
id("org.jetbrains.kotlin.android")
|
||||
}
|
||||
|
||||
android {
|
||||
namespace = "android.llama.cpp"
|
||||
compileSdk = 34
|
||||
|
||||
defaultConfig {
|
||||
minSdk = 33
|
||||
|
||||
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
|
||||
consumerProguardFiles("consumer-rules.pro")
|
||||
ndk {
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
|
||||
cppFlags("")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
release {
|
||||
isMinifyEnabled = false
|
||||
proguardFiles(
|
||||
getDefaultProguardFile("proguard-android-optimize.txt"),
|
||||
"proguard-rules.pro"
|
||||
)
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||
targetCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = "1.8"
|
||||
}
|
||||
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation("androidx.core:core-ktx:1.12.0")
|
||||
implementation("androidx.appcompat:appcompat:1.6.1")
|
||||
implementation("com.google.android.material:material:1.11.0")
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
}
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
||||
+24
@@ -0,0 +1,24 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import androidx.test.platform.app.InstrumentationRegistry
|
||||
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||
|
||||
import org.junit.Test
|
||||
import org.junit.runner.RunWith
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Instrumented test, which will execute on an Android device.
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
@RunWith(AndroidJUnit4::class)
|
||||
class ExampleInstrumentedTest {
|
||||
@Test
|
||||
fun useAppContext() {
|
||||
// Context of the app under test.
|
||||
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
|
||||
assertEquals("android.llama.cpp.test", appContext.packageName)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
|
||||
</manifest>
|
||||
@@ -0,0 +1,49 @@
|
||||
# For more information about using CMake with Android Studio, read the
|
||||
# documentation: https://d.android.com/studio/projects/add-native-code.html.
|
||||
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
|
||||
|
||||
# Sets the minimum CMake version required for this project.
|
||||
cmake_minimum_required(VERSION 3.22.1)
|
||||
|
||||
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
|
||||
# Since this is the top level CMakeLists.txt, the project name is also accessible
|
||||
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
# You can define multiple libraries, and CMake builds them for you.
|
||||
# Gradle automatically packages shared libraries with your APK.
|
||||
#
|
||||
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
|
||||
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
|
||||
# is preferred for the same purpose.
|
||||
#
|
||||
# In order to load a library into your app from Java/Kotlin, you must call
|
||||
# System.loadLibrary() and pass the name of the library defined here;
|
||||
# for GameActivity/NativeActivity derived applications, the same library name must be
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
# build script, prebuilt third-party libraries, or Android system libraries.
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
# List libraries link to the target library
|
||||
llama
|
||||
common
|
||||
android
|
||||
log)
|
||||
+14
-14
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||
|
||||
if (!model) {
|
||||
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
llama_free(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
|
||||
llama_log_set(log_callback, NULL);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_bench_1model(
|
||||
Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
|
||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||
|
||||
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
||||
llama_backend_init();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
|
||||
return env->NewStringUTF(llama_print_system_info());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jint JNICALL
|
||||
Java_com_example_llama_Llm_completion_1init(
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_completion_1loop(
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||
JNIEnv * env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
+4
-4
@@ -1,4 +1,4 @@
|
||||
package com.example.llama
|
||||
package android.llama.cpp
|
||||
|
||||
import android.util.Log
|
||||
import kotlinx.coroutines.CoroutineDispatcher
|
||||
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
|
||||
import java.util.concurrent.Executors
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
class Llm {
|
||||
class LLamaAndroid {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
||||
@@ -165,8 +165,8 @@ class Llm {
|
||||
}
|
||||
|
||||
// Enforce only one instance of Llm.
|
||||
private val _instance: Llm = Llm()
|
||||
private val _instance: LLamaAndroid = LLamaAndroid()
|
||||
|
||||
fun instance(): Llm = _instance
|
||||
fun instance(): LLamaAndroid = _instance
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import org.junit.Test
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Example local unit test, which will execute on the development machine (host).
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
class ExampleUnitTest {
|
||||
@Test
|
||||
fun addition_isCorrect() {
|
||||
assertEquals(4, 2 + 2)
|
||||
}
|
||||
}
|
||||
@@ -15,3 +15,4 @@ dependencyResolutionManagement {
|
||||
|
||||
rootProject.name = "LlamaAndroid"
|
||||
include(":app")
|
||||
include(":llama")
|
||||
|
||||
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
|
||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
gpt_print_usage(argc, argv, params);
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
|
||||
// debug
|
||||
if (dump_kv_cache) {
|
||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||
}
|
||||
|
||||
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||
|
||||
@@ -121,7 +121,7 @@ int main(int argc, char ** argv){
|
||||
// debug
|
||||
if (dump_kv_cache) {
|
||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||
}
|
||||
|
||||
// print current draft sequence
|
||||
|
||||
+22
-14
@@ -60,9 +60,9 @@ static void write_logfile(
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
const std::string timestamp = string_get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
@@ -80,7 +80,7 @@ static void write_logfile(
|
||||
fprintf(logfile, "binary: main\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
@@ -88,8 +88,8 @@ static void write_logfile(
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
@@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
@@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
std::string path_session = params.path_prompt_cache;
|
||||
@@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
const char * control_message;
|
||||
if (params.multiline_input) {
|
||||
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
|
||||
control_message = " - To return control to the AI, end your input with '\\'.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n";
|
||||
} else {
|
||||
control_message = " - Press Return to return control to LLaMa.\n"
|
||||
control_message = " - Press Return to return control to the AI.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
|
||||
printf("%s", token_str.c_str());
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||
|
||||
// Console/Stream Output
|
||||
fprintf(stdout, "%s", token_str.c_str());
|
||||
|
||||
// Record Displayed Tokens To Log
|
||||
// Note: Generated tokens are created one by one hence this check
|
||||
if (embd.size() > 1) {
|
||||
// Incoming Requested Tokens
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
// Outgoing Generated Tokens
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
@@ -879,7 +887,7 @@ int main(int argc, char ** argv) {
|
||||
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
|
||||
}
|
||||
if (params.escape) {
|
||||
process_escapes(buffer);
|
||||
string_process_escapes(buffer);
|
||||
}
|
||||
|
||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||
|
||||
@@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
|
||||
while (true) {
|
||||
if (dump_kv_cache) {
|
||||
llama_kv_cache_view_update(ctx, &kvc_view);
|
||||
dump_kv_cache_view_seqs(kvc_view, 40);
|
||||
llama_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
@@ -44,9 +44,9 @@ static void write_logfile(
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
const std::string timestamp = string_get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
@@ -64,7 +64,7 @@ static void write_logfile(
|
||||
fprintf(logfile, "binary: main\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
||||
yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
@@ -72,9 +72,9 @@ static void write_logfile(
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_vector_float_yaml(logfile, "logits", results.logits);
|
||||
yaml_dump_vector_float(logfile, "logits", results.logits);
|
||||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
||||
dump_vector_float_yaml(logfile, "probs", results.probs);
|
||||
yaml_dump_vector_float(logfile, "probs", results.probs);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
@@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
params.prompt = string_random_prompt(rng);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
@@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
struct results_perplexity results;
|
||||
|
||||
@@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
||||
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
|
||||
|
||||
@@ -11,7 +11,7 @@ struct retrieval_params {
|
||||
};
|
||||
|
||||
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
|
||||
gpt_print_usage(argc, argv, gpt_params);
|
||||
gpt_params_print_usage(argc, argv, gpt_params);
|
||||
printf("retrieval options:\n");
|
||||
printf(" --context-file FNAME file containing context to embed.\n");
|
||||
printf(" specify multiple files by providing --context-file option multiple times.\n");
|
||||
@@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
// max batch size
|
||||
|
||||
@@ -594,7 +594,7 @@
|
||||
message = html`<${Probabilities} data=${data} />`
|
||||
} else {
|
||||
const text = isArrayMessage ?
|
||||
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
||||
data.map(msg => msg.content).join('') :
|
||||
data;
|
||||
message = isCompletionMode ?
|
||||
text :
|
||||
@@ -877,19 +877,30 @@
|
||||
|
||||
// poor mans markdown replacement
|
||||
const Markdownish = (params) => {
|
||||
const md = params.text
|
||||
.replace(/&/g, '&')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
|
||||
.replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||
.replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||
.replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||
.replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
||||
.replace(/`(.*?)`/g, '<code>$1</code>')
|
||||
.replace(/\n/gim, '<br />');
|
||||
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
|
||||
const chunks = params.text.split('```');
|
||||
|
||||
for (let i = 0; i < chunks.length; i++) {
|
||||
if (i % 2 === 0) { // outside code block
|
||||
chunks[i] = chunks[i]
|
||||
.replace(/&/g, '&')
|
||||
.replace(/</g, '<')
|
||||
.replace(/>/g, '>')
|
||||
.replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
|
||||
.replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||
.replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
|
||||
.replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||
.replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
|
||||
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
||||
.replace(/`(.*?)`/g, '<code>$1</code>')
|
||||
.replace(/\n/gim, '<br />');
|
||||
} else { // inside code block
|
||||
chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
|
||||
}
|
||||
}
|
||||
|
||||
const restoredText = chunks.join('');
|
||||
|
||||
return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
|
||||
};
|
||||
|
||||
const ModelGenerationInfo = (params) => {
|
||||
@@ -903,6 +914,7 @@
|
||||
`
|
||||
}
|
||||
|
||||
|
||||
// simple popover impl
|
||||
const Popover = (props) => {
|
||||
const isOpen = useSignal(false);
|
||||
@@ -1054,4 +1066,3 @@
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>SimpleChat LlamaCppEtal </title>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<meta name="message" content="Save Nature Save Earth" />
|
||||
<meta name="description" content="SimpleChat: trigger LLM web service endpoints /chat/completions and /completions, single/multi chat sessions" />
|
||||
<meta name="author" content="by Humans for All" />
|
||||
<meta http-equiv="Cache-Control" content="no-cache, no-store, must-revalidate" />
|
||||
<script src="simplechat.js" defer></script>
|
||||
<link rel="stylesheet" href="simplechat.css" />
|
||||
</head>
|
||||
<body>
|
||||
<div class="samecolumn" id="fullbody">
|
||||
|
||||
<div class="sameline">
|
||||
<p class="heading flex-grow" > <b> SimpleChat </b> </p>
|
||||
<div class="sameline">
|
||||
<label for="api-ep">Mode:</label>
|
||||
<select name="api-ep" id="api-ep">
|
||||
<option value="chat" selected>Chat</option>
|
||||
<option value="completion">Completion</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="sessions-div" class="sameline"></div>
|
||||
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<label for="system-in">System</label>
|
||||
<input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div id="chat-div">
|
||||
<p> You need to have javascript enabled.</p>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
|
||||
<button id="user-btn">submit</button>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,201 @@
|
||||
|
||||
# SimpleChat
|
||||
|
||||
by Humans for All.
|
||||
|
||||
|
||||
## overview
|
||||
|
||||
This simple web frontend, allows triggering/testing the server's /completions or /chat/completions endpoints
|
||||
in a simple way with minimal code from a common code base. Inturn additionally it tries to allow single or
|
||||
multiple independent back and forth chatting to an extent, with the ai llm model at a basic level, with their
|
||||
own system prompts.
|
||||
|
||||
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
||||
enough manner, in general.
|
||||
|
||||
Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
|
||||
console.
|
||||
|
||||
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
||||
culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
|
||||
form of old messages culling can be achieved.
|
||||
|
||||
NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants
|
||||
they can update the js file or equivalent member in gMe as needed.
|
||||
|
||||
|
||||
## usage
|
||||
|
||||
One could run this web frontend directly using server itself or if anyone is thinking of adding a built in web
|
||||
frontend to configure the server over http(s) or so, then run this web frontend using something like python's
|
||||
http module.
|
||||
|
||||
### running using examples/server
|
||||
|
||||
bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
|
||||
|
||||
### running using python3's server module
|
||||
|
||||
first run examples/server
|
||||
* bin/server -m path/model.gguf
|
||||
|
||||
next run this web front end in examples/server/public_simplechat
|
||||
* cd ../examples/server/public_simplechat
|
||||
* python3 -m http.server PORT
|
||||
|
||||
### using the front end
|
||||
|
||||
Open this simple web front end from your local browser
|
||||
|
||||
* http://127.0.0.1:PORT/index.html
|
||||
|
||||
Once inside
|
||||
|
||||
* Select between chat and completion mode. By default it is set to chat mode.
|
||||
|
||||
* In completion mode
|
||||
* logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
|
||||
If the model requires any prefix wrt user role messages, then the end user has to
|
||||
explicitly add the needed prefix, when they enter their chat message.
|
||||
Similarly if the model requires any prefix to trigger assistant/ai-model response,
|
||||
then the end user needs to enter the same.
|
||||
This keeps the logic simple, while still giving flexibility to the end user to
|
||||
manage any templating/tagging requirement wrt their messages to the model.
|
||||
* the logic doesnt insert newline at the begining and end wrt the prompt message generated.
|
||||
However if the chat being sent to /completions end point has more than one role's message,
|
||||
then insert newline when moving from one role's message to the next role's message, so
|
||||
that it can be clearly identified/distinguished.
|
||||
* given that /completions endpoint normally doesnt add additional chat-templating of its
|
||||
own, the above ensures that end user can create a custom single/multi message combo with
|
||||
any tags/special-tokens related chat templating to test out model handshake. Or enduser
|
||||
can use it just for normal completion related/based query.
|
||||
|
||||
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
||||
Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
|
||||
responses with a suitable system prompt.
|
||||
* if chat.add_system_begin is used
|
||||
* you cant change the system prompt, after it is has been submitted once along with user query.
|
||||
* you cant set a system prompt, after you have submitted any user query
|
||||
* if chat.add_system_anytime is used
|
||||
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
||||
* inturn the updated/changed system prompt will be inserted into the chat session.
|
||||
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
||||
|
||||
* Enter your query and either press enter or click on the submit button.
|
||||
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
||||
|
||||
* Wait for the logic to communicate with the server and get the response.
|
||||
* the user is not allowed to enter any fresh query during this time.
|
||||
* the user input box will be disabled and a working message will be shown in it.
|
||||
|
||||
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
||||
|
||||
* Using NewChat one can start independent chat sessions.
|
||||
* two independent chat sessions are setup by default.
|
||||
|
||||
|
||||
## Devel note
|
||||
|
||||
### Reason behind this
|
||||
|
||||
The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
|
||||
by developers who may not be from web frontend background (so inturn may not be familiar with template /
|
||||
end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
|
||||
|
||||
And given that the idea is also to help explore/experiment for developers, some flexibility is provided
|
||||
to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
|
||||
to explore some of the end points and ideas/implications around them.
|
||||
|
||||
|
||||
### General
|
||||
|
||||
Me/gMe consolidates the settings which control the behaviour into one object.
|
||||
One can see the current settings, as well as change/update them using browsers devel-tool/console.
|
||||
|
||||
bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
|
||||
communicating with the server or only sends the latest user query/message.
|
||||
|
||||
bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
|
||||
messages that get inserted into prompt field wrt /Completion endpoint.
|
||||
|
||||
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
||||
irrespective of whether /chat/completions or /completions endpoint.
|
||||
|
||||
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||
modify the existing options value or remove them, for now you can update this global var
|
||||
using browser's development-tools/console.
|
||||
|
||||
iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
|
||||
This is disabled by default. However if enabled, then in addition to latest system message, only
|
||||
the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
|
||||
from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
|
||||
only user messages after the latest system message/prompt will be considered.
|
||||
|
||||
This specified sliding window user message count also includes the latest user query.
|
||||
<0 : Send entire chat history to server
|
||||
0 : Send only the system message if any to the server
|
||||
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||
|
||||
|
||||
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
||||
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way.
|
||||
|
||||
|
||||
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
|
||||
may not be visible. Also remember that just refreshing/reloading page in browser or for that
|
||||
matter clearing site data, dont directly override site caching in all cases. Worst case you may
|
||||
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
|
||||
|
||||
|
||||
Concept of multiple chat sessions with different servers, as well as saving and restoring of
|
||||
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
|
||||
its instances relatively easily, however given the current goal of keeping this simple, it has
|
||||
not been added, for now.
|
||||
|
||||
|
||||
By switching between chat.add_system_begin/anytime, one can control whether one can change
|
||||
the system prompt, anytime during the conversation or only at the beginning.
|
||||
|
||||
|
||||
read_json_early, is to experiment with reading json response data early on, if available,
|
||||
so that user can be shown generated data, as and when it is being generated, rather than
|
||||
at the end when full data is available.
|
||||
|
||||
the server flow doesnt seem to be sending back data early, atleast for request (inc options)
|
||||
that is currently sent.
|
||||
|
||||
if able to read json data early on in future, as and when ai model is generating data, then
|
||||
this helper needs to indirectly update the chat div with the recieved data, without waiting
|
||||
for the overall data to be available.
|
||||
|
||||
|
||||
### Default setup
|
||||
|
||||
By default things are setup to try and make the user experience a bit better, if possible.
|
||||
However a developer when testing the server of ai-model may want to change these value.
|
||||
|
||||
Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be
|
||||
just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
|
||||
full chat history. This way if there is any response with garbage/repeatation, it doesnt
|
||||
mess with things beyond the next question/request/query, in some ways.
|
||||
|
||||
Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space
|
||||
available wrt next query-response. However dont forget that the server when started should
|
||||
also be started with a model context size of 1k or more, to be on safe side.
|
||||
|
||||
The /completions endpoint of examples/server doesnt take max_tokens, instead it takes the
|
||||
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||
to /completions endpoint handling code on server side.
|
||||
|
||||
Frequency and presence penalty fields are set to 1.2 in the set of fields sent to server
|
||||
along with the user query. So that the model is partly set to try avoid repeating text in
|
||||
its response.
|
||||
|
||||
A end-user can change these behaviour by editing gMe from browser's devel-tool/console.
|
||||
|
||||
|
||||
## At the end
|
||||
|
||||
Also a thank you to all open source and open model developers, who strive for the common good.
|
||||
@@ -0,0 +1,68 @@
|
||||
/**
|
||||
* the styling of the simplechat web frontend
|
||||
* by Humans for All
|
||||
*/
|
||||
|
||||
#fullbody {
|
||||
height: 98vh;
|
||||
}
|
||||
|
||||
.heading {
|
||||
background-color: lightgray;
|
||||
}
|
||||
|
||||
.session-selected {
|
||||
background-color: lightblue;
|
||||
}
|
||||
|
||||
.role-system {
|
||||
background-color: lightblue;
|
||||
}
|
||||
.role-user {
|
||||
background-color: lightgray;
|
||||
}
|
||||
|
||||
.flex-grow {
|
||||
flex-grow: 1;
|
||||
}
|
||||
.float-right {
|
||||
float: right;
|
||||
}
|
||||
|
||||
#chat-div {
|
||||
overflow: scroll;
|
||||
flex-grow: 1;
|
||||
flex-shrink: 1;
|
||||
min-height: 40vh;
|
||||
}
|
||||
button {
|
||||
min-width: 8vw;
|
||||
}
|
||||
|
||||
.sameline {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
}
|
||||
.samecolumn {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.ul1 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
.ul2 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0.6vmin;
|
||||
}
|
||||
|
||||
@media print {
|
||||
|
||||
#fullbody {
|
||||
height: auto;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,629 @@
|
||||
// @ts-check
|
||||
// A simple completions and chat/completions test related web front end logic
|
||||
// by Humans for All
|
||||
|
||||
class Roles {
|
||||
static System = "system";
|
||||
static User = "user";
|
||||
static Assistant = "assistant";
|
||||
}
|
||||
|
||||
class ApiEP {
|
||||
static Chat = "chat";
|
||||
static Completion = "completion";
|
||||
}
|
||||
|
||||
let gUsageMsg = `
|
||||
<p class="role-system">Usage</p>
|
||||
<ul class="ul1">
|
||||
<li> Set system prompt above, to try control ai response charactersitic, if model supports same.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode normally wont have a system prompt.</li>
|
||||
</ul>
|
||||
<li> Enter your query to ai assistant below.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode doesnt insert user/role: prefix implicitly.</li>
|
||||
<li> Use shift+enter for inserting enter/newline.</li>
|
||||
</ul>
|
||||
<li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
|
||||
<ul class="ul2">
|
||||
<li> experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
|
||||
</ul>
|
||||
</ul>
|
||||
`;
|
||||
|
||||
/** @typedef {{role: string, content: string}[]} ChatMessages */
|
||||
|
||||
class SimpleChat {
|
||||
|
||||
constructor() {
|
||||
/**
|
||||
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||
* @type {ChatMessages}
|
||||
*/
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recent chat messages.
|
||||
* If iRecentUserMsgCnt < 0
|
||||
* Then return the full chat history
|
||||
* Else
|
||||
* Return chat messages from latest going back till the last/latest system prompt.
|
||||
* While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
|
||||
* @param {number} iRecentUserMsgCnt
|
||||
*/
|
||||
recent_chat(iRecentUserMsgCnt) {
|
||||
if (iRecentUserMsgCnt < 0) {
|
||||
return this.xchat;
|
||||
}
|
||||
if (iRecentUserMsgCnt == 0) {
|
||||
console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
|
||||
}
|
||||
/** @type{ChatMessages} */
|
||||
let rchat = [];
|
||||
let sysMsg = this.get_system_latest();
|
||||
if (sysMsg.length != 0) {
|
||||
rchat.push({role: Roles.System, content: sysMsg});
|
||||
}
|
||||
let iUserCnt = 0;
|
||||
let iStart = this.xchat.length;
|
||||
for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
|
||||
if (iUserCnt >= iRecentUserMsgCnt) {
|
||||
break;
|
||||
}
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.User) {
|
||||
iStart = i;
|
||||
iUserCnt += 1;
|
||||
}
|
||||
}
|
||||
for(let i = iStart; i < this.xchat.length; i++) {
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.System) {
|
||||
continue;
|
||||
}
|
||||
rchat.push({role: msg.role, content: msg.content});
|
||||
}
|
||||
return rchat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an entry into xchat
|
||||
* @param {string} role
|
||||
* @param {string|undefined|null} content
|
||||
*/
|
||||
add(role, content) {
|
||||
if ((content == undefined) || (content == null) || (content == "")) {
|
||||
return false;
|
||||
}
|
||||
this.xchat.push( {role: role, content: content} );
|
||||
if (role == Roles.System) {
|
||||
this.iLastSys = this.xchat.length - 1;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Show the contents in the specified div
|
||||
* @param {HTMLDivElement} div
|
||||
* @param {boolean} bClear
|
||||
*/
|
||||
show(div, bClear=true) {
|
||||
if (bClear) {
|
||||
div.replaceChildren();
|
||||
}
|
||||
let last = undefined;
|
||||
for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
let entry = document.createElement("p");
|
||||
entry.className = `role-${x.role}`;
|
||||
entry.innerText = `${x.role}: ${x.content}`;
|
||||
div.appendChild(entry);
|
||||
last = entry;
|
||||
}
|
||||
if (last !== undefined) {
|
||||
last.scrollIntoView(false);
|
||||
} else {
|
||||
if (bClear) {
|
||||
div.innerHTML = gUsageMsg;
|
||||
gMe.show_info(div);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
* The needed fields/options are picked from a global object.
|
||||
* Convert the json into string.
|
||||
* @param {Object} obj
|
||||
*/
|
||||
request_jsonstr(obj) {
|
||||
for(let k in gMe.chatRequestOptions) {
|
||||
obj[k] = gMe.chatRequestOptions[k];
|
||||
}
|
||||
return JSON.stringify(obj);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string form of json object suitable for chat/completions
|
||||
*/
|
||||
request_messages_jsonstr() {
|
||||
let req = {
|
||||
messages: this.recent_chat(gMe.iRecentUserMsgCnt),
|
||||
}
|
||||
return this.request_jsonstr(req);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string form of json object suitable for /completions
|
||||
* @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
|
||||
*/
|
||||
request_prompt_jsonstr(bInsertStandardRolePrefix) {
|
||||
let prompt = "";
|
||||
let iCnt = 0;
|
||||
for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
iCnt += 1;
|
||||
if (iCnt > 1) {
|
||||
prompt += "\n";
|
||||
}
|
||||
if (bInsertStandardRolePrefix) {
|
||||
prompt += `${chat.role}: `;
|
||||
}
|
||||
prompt += `${chat.content}`;
|
||||
}
|
||||
let req = {
|
||||
prompt: prompt,
|
||||
}
|
||||
return this.request_jsonstr(req);
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow setting of system prompt, but only at begining.
|
||||
* @param {string} sysPrompt
|
||||
* @param {string} msgTag
|
||||
*/
|
||||
add_system_begin(sysPrompt, msgTag) {
|
||||
if (this.xchat.length == 0) {
|
||||
if (sysPrompt.length > 0) {
|
||||
return this.add(Roles.System, sysPrompt);
|
||||
}
|
||||
} else {
|
||||
if (sysPrompt.length > 0) {
|
||||
if (this.xchat[0].role !== Roles.System) {
|
||||
console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`);
|
||||
} else {
|
||||
if (this.xchat[0].content !== sysPrompt) {
|
||||
console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow setting of system prompt, at any time.
|
||||
* @param {string} sysPrompt
|
||||
* @param {string} msgTag
|
||||
*/
|
||||
add_system_anytime(sysPrompt, msgTag) {
|
||||
if (sysPrompt.length <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (this.iLastSys < 0) {
|
||||
return this.add(Roles.System, sysPrompt);
|
||||
}
|
||||
|
||||
let lastSys = this.xchat[this.iLastSys].content;
|
||||
if (lastSys !== sysPrompt) {
|
||||
return this.add(Roles.System, sysPrompt);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the latest system prompt.
|
||||
*/
|
||||
get_system_latest() {
|
||||
if (this.iLastSys == -1) {
|
||||
return "";
|
||||
}
|
||||
let sysPrompt = this.xchat[this.iLastSys].content;
|
||||
return sysPrompt;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
let gBaseURL = "http://127.0.0.1:8080";
|
||||
let gChatURL = {
|
||||
'chat': `${gBaseURL}/chat/completions`,
|
||||
'completion': `${gBaseURL}/completions`,
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Set the class of the children, based on whether it is the idSelected or not.
|
||||
* @param {HTMLDivElement} elBase
|
||||
* @param {string} idSelected
|
||||
* @param {string} classSelected
|
||||
* @param {string} classUnSelected
|
||||
*/
|
||||
function el_children_config_class(elBase, idSelected, classSelected, classUnSelected="") {
|
||||
for(let child of elBase.children) {
|
||||
if (child.id == idSelected) {
|
||||
child.className = classSelected;
|
||||
} else {
|
||||
child.className = classUnSelected;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create button and set it up.
|
||||
* @param {string} id
|
||||
* @param {(this: HTMLButtonElement, ev: MouseEvent) => any} callback
|
||||
* @param {string | undefined} name
|
||||
* @param {string | undefined} innerText
|
||||
*/
|
||||
function el_create_button(id, callback, name=undefined, innerText=undefined) {
|
||||
if (!name) {
|
||||
name = id;
|
||||
}
|
||||
if (!innerText) {
|
||||
innerText = id;
|
||||
}
|
||||
let btn = document.createElement("button");
|
||||
btn.id = id;
|
||||
btn.name = name;
|
||||
btn.innerText = innerText;
|
||||
btn.addEventListener("click", callback);
|
||||
return btn;
|
||||
}
|
||||
|
||||
|
||||
class MultiChatUI {
|
||||
|
||||
constructor() {
|
||||
/** @type {Object<string, SimpleChat>} */
|
||||
this.simpleChats = {};
|
||||
/** @type {string} */
|
||||
this.curChatId = "";
|
||||
|
||||
// the ui elements
|
||||
this.elInSystem = /** @type{HTMLInputElement} */(document.getElementById("system-in"));
|
||||
this.elDivChat = /** @type{HTMLDivElement} */(document.getElementById("chat-div"));
|
||||
this.elBtnUser = /** @type{HTMLButtonElement} */(document.getElementById("user-btn"));
|
||||
this.elInUser = /** @type{HTMLInputElement} */(document.getElementById("user-in"));
|
||||
this.elSelectApiEP = /** @type{HTMLSelectElement} */(document.getElementById("api-ep"));
|
||||
this.elDivSessions = /** @type{HTMLDivElement} */(document.getElementById("sessions-div"));
|
||||
|
||||
this.validate_element(this.elInSystem, "system-in");
|
||||
this.validate_element(this.elDivChat, "chat-div");
|
||||
this.validate_element(this.elInUser, "user-in");
|
||||
this.validate_element(this.elSelectApiEP, "api-ep");
|
||||
this.validate_element(this.elDivChat, "sessions-div");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the element got
|
||||
* @param {HTMLElement | null} el
|
||||
* @param {string} msgTag
|
||||
*/
|
||||
validate_element(el, msgTag) {
|
||||
if (el == null) {
|
||||
throw Error(`ERRR:SimpleChat:MCUI:${msgTag} element missing in html...`);
|
||||
} else {
|
||||
console.debug(`INFO:SimpleChat:MCUI:${msgTag} Id[${el.id}] Name[${el["name"]}]`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset user input ui.
|
||||
* * clear user input
|
||||
* * enable user input
|
||||
* * set focus to user input
|
||||
*/
|
||||
ui_reset_userinput() {
|
||||
this.elInUser.value = "";
|
||||
this.elInUser.disabled = false;
|
||||
this.elInUser.focus();
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup the needed callbacks wrt UI, curChatId to defaultChatId and
|
||||
* optionally switch to specified defaultChatId.
|
||||
* @param {string} defaultChatId
|
||||
* @param {boolean} bSwitchSession
|
||||
*/
|
||||
setup_ui(defaultChatId, bSwitchSession=false) {
|
||||
|
||||
this.curChatId = defaultChatId;
|
||||
if (bSwitchSession) {
|
||||
this.handle_session_switch(this.curChatId);
|
||||
}
|
||||
|
||||
this.elBtnUser.addEventListener("click", (ev)=>{
|
||||
if (this.elInUser.disabled) {
|
||||
return;
|
||||
}
|
||||
this.handle_user_submit(this.curChatId, this.elSelectApiEP.value).catch((/** @type{Error} */reason)=>{
|
||||
let msg = `ERRR:SimpleChat\nMCUI:HandleUserSubmit:${this.curChatId}\n${reason.name}:${reason.message}`;
|
||||
console.debug(msg.replace("\n", ":"));
|
||||
alert(msg);
|
||||
this.ui_reset_userinput();
|
||||
});
|
||||
});
|
||||
|
||||
this.elInUser.addEventListener("keyup", (ev)=> {
|
||||
// allow user to insert enter into their message using shift+enter.
|
||||
// while just pressing enter key will lead to submitting.
|
||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||
let value = this.elInUser.value;
|
||||
this.elInUser.value = value.substring(0,value.length-1);
|
||||
this.elBtnUser.click();
|
||||
ev.preventDefault();
|
||||
}
|
||||
});
|
||||
|
||||
this.elInSystem.addEventListener("keyup", (ev)=> {
|
||||
// allow user to insert enter into the system prompt using shift+enter.
|
||||
// while just pressing enter key will lead to setting the system prompt.
|
||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||
let chat = this.simpleChats[this.curChatId];
|
||||
chat.add_system_anytime(this.elInSystem.value, this.curChatId);
|
||||
chat.show(this.elDivChat);
|
||||
ev.preventDefault();
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Setup a new chat session and optionally switch to it.
|
||||
* @param {string} chatId
|
||||
* @param {boolean} bSwitchSession
|
||||
*/
|
||||
new_chat_session(chatId, bSwitchSession=false) {
|
||||
this.simpleChats[chatId] = new SimpleChat();
|
||||
if (bSwitchSession) {
|
||||
this.handle_session_switch(chatId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try read json response early, if available.
|
||||
* @param {Response} resp
|
||||
*/
|
||||
async read_json_early(resp) {
|
||||
if (!resp.body) {
|
||||
throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
|
||||
}
|
||||
let tdUtf8 = new TextDecoder("utf-8");
|
||||
let rr = resp.body.getReader();
|
||||
let gotBody = "";
|
||||
while(true) {
|
||||
let { value: cur, done: done} = await rr.read();
|
||||
let curBody = tdUtf8.decode(cur);
|
||||
console.debug("DBUG:SC:PART:", curBody);
|
||||
gotBody += curBody;
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return JSON.parse(gotBody);
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle user query submit request, wrt specified chat session.
|
||||
* @param {string} chatId
|
||||
* @param {string} apiEP
|
||||
*/
|
||||
async handle_user_submit(chatId, apiEP) {
|
||||
|
||||
let chat = this.simpleChats[chatId];
|
||||
|
||||
// In completion mode, if configured, clear any previous chat history.
|
||||
// So if user wants to simulate a multi-chat based completion query,
|
||||
// they will have to enter the full thing, as a suitable multiline
|
||||
// user input/query.
|
||||
if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
|
||||
chat.clear();
|
||||
}
|
||||
|
||||
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||
|
||||
let content = this.elInUser.value;
|
||||
if (!chat.add(Roles.User, content)) {
|
||||
console.debug(`WARN:SimpleChat:MCUI:${chatId}:HandleUserSubmit:Ignoring empty user input...`);
|
||||
return;
|
||||
}
|
||||
chat.show(this.elDivChat);
|
||||
|
||||
let theBody;
|
||||
let theUrl = gChatURL[apiEP]
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
theBody = chat.request_messages_jsonstr();
|
||||
} else {
|
||||
theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
|
||||
}
|
||||
|
||||
this.elInUser.value = "working...";
|
||||
this.elInUser.disabled = true;
|
||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:${theUrl}:ReqBody:${theBody}`);
|
||||
let resp = await fetch(theUrl, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: theBody,
|
||||
});
|
||||
|
||||
let respBody = await resp.json();
|
||||
//let respBody = await this.read_json_early(resp);
|
||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||
let assistantMsg;
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
assistantMsg = respBody["choices"][0]["message"]["content"];
|
||||
} else {
|
||||
try {
|
||||
assistantMsg = respBody["choices"][0]["text"];
|
||||
} catch {
|
||||
assistantMsg = respBody["content"];
|
||||
}
|
||||
}
|
||||
chat.add(Roles.Assistant, assistantMsg);
|
||||
if (chatId == this.curChatId) {
|
||||
chat.show(this.elDivChat);
|
||||
} else {
|
||||
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||
}
|
||||
this.ui_reset_userinput();
|
||||
}
|
||||
|
||||
/**
|
||||
* Show buttons for NewChat and available chat sessions, in the passed elDiv.
|
||||
* If elDiv is undefined/null, then use this.elDivSessions.
|
||||
* Take care of highlighting the selected chat-session's btn.
|
||||
* @param {HTMLDivElement | undefined} elDiv
|
||||
*/
|
||||
show_sessions(elDiv=undefined) {
|
||||
if (!elDiv) {
|
||||
elDiv = this.elDivSessions;
|
||||
}
|
||||
elDiv.replaceChildren();
|
||||
// Btn for creating new chat session
|
||||
let btnNew = el_create_button("New CHAT", (ev)=> {
|
||||
if (this.elInUser.disabled) {
|
||||
console.error(`ERRR:SimpleChat:MCUI:NewChat:Current session [${this.curChatId}] awaiting response, ignoring request...`);
|
||||
alert("ERRR:SimpleChat\nMCUI:NewChat\nWait for response to pending query, before starting new chat session");
|
||||
return;
|
||||
}
|
||||
let chatId = `Chat${Object.keys(this.simpleChats).length}`;
|
||||
let chatIdGot = prompt("INFO:SimpleChat\nMCUI:NewChat\nEnter id for new chat session", chatId);
|
||||
if (!chatIdGot) {
|
||||
console.error("ERRR:SimpleChat:MCUI:NewChat:Skipping based on user request...");
|
||||
return;
|
||||
}
|
||||
this.new_chat_session(chatIdGot, true);
|
||||
this.create_session_btn(elDiv, chatIdGot);
|
||||
el_children_config_class(elDiv, chatIdGot, "session-selected", "");
|
||||
});
|
||||
elDiv.appendChild(btnNew);
|
||||
// Btns for existing chat sessions
|
||||
let chatIds = Object.keys(this.simpleChats);
|
||||
for(let cid of chatIds) {
|
||||
let btn = this.create_session_btn(elDiv, cid);
|
||||
if (cid == this.curChatId) {
|
||||
btn.className = "session-selected";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
create_session_btn(elDiv, cid) {
|
||||
let btn = el_create_button(cid, (ev)=>{
|
||||
let target = /** @type{HTMLButtonElement} */(ev.target);
|
||||
console.debug(`DBUG:SimpleChat:MCUI:SessionClick:${target.id}`);
|
||||
if (this.elInUser.disabled) {
|
||||
console.error(`ERRR:SimpleChat:MCUI:SessionClick:${target.id}:Current session [${this.curChatId}] awaiting response, ignoring switch...`);
|
||||
alert("ERRR:SimpleChat\nMCUI:SessionClick\nWait for response to pending query, before switching");
|
||||
return;
|
||||
}
|
||||
this.handle_session_switch(target.id);
|
||||
el_children_config_class(elDiv, target.id, "session-selected", "");
|
||||
});
|
||||
elDiv.appendChild(btn);
|
||||
return btn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Switch ui to the specified chatId and set curChatId to same.
|
||||
* @param {string} chatId
|
||||
*/
|
||||
async handle_session_switch(chatId) {
|
||||
let chat = this.simpleChats[chatId];
|
||||
if (chat == undefined) {
|
||||
console.error(`ERRR:SimpleChat:MCUI:HandleSessionSwitch:${chatId} missing...`);
|
||||
return;
|
||||
}
|
||||
this.elInSystem.value = chat.get_system_latest();
|
||||
this.elInUser.value = "";
|
||||
chat.show(this.elDivChat);
|
||||
this.elInUser.focus();
|
||||
this.curChatId = chatId;
|
||||
console.log(`INFO:SimpleChat:MCUI:HandleSessionSwitch:${chatId} entered...`);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
class Me {
|
||||
|
||||
constructor() {
|
||||
this.defaultChatIds = [ "Default", "Other" ];
|
||||
this.multiChat = new MultiChatUI();
|
||||
this.bCompletionFreshChatAlways = true;
|
||||
this.bCompletionInsertStandardRolePrefix = false;
|
||||
this.iRecentUserMsgCnt = 2;
|
||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
this.chatRequestOptions = {
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 1024,
|
||||
"frequency_penalty": 1.2,
|
||||
"presence_penalty": 1.2,
|
||||
"n_predict": 1024
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {HTMLDivElement} elDiv
|
||||
*/
|
||||
show_info(elDiv) {
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = "Settings (devel-tools-console gMe)";
|
||||
p.className = "role-system";
|
||||
elDiv.appendChild(p);
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** @type {Me} */
|
||||
let gMe;
|
||||
|
||||
function startme() {
|
||||
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||
gMe = new Me();
|
||||
for (let cid of gMe.defaultChatIds) {
|
||||
gMe.multiChat.new_chat_session(cid);
|
||||
}
|
||||
gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
|
||||
gMe.multiChat.show_sessions();
|
||||
}
|
||||
|
||||
document.addEventListener("DOMContentLoaded", startme);
|
||||
@@ -1019,7 +1019,7 @@ struct server_context {
|
||||
sampler_names.emplace_back(sampler_name);
|
||||
}
|
||||
}
|
||||
slot.sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
|
||||
slot.sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
|
||||
} else {
|
||||
slot.sparams.samplers_sequence = default_sparams.samplers_sequence;
|
||||
}
|
||||
@@ -1256,7 +1256,7 @@ struct server_context {
|
||||
std::vector<std::string> samplers_sequence;
|
||||
samplers_sequence.reserve(slot.sparams.samplers_sequence.size());
|
||||
for (const auto & sampler_type : slot.sparams.samplers_sequence) {
|
||||
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
|
||||
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -2852,7 +2852,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if (!parse_kv_override(argv[i], params.kv_overrides)) {
|
||||
if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
|
||||
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
@@ -3310,7 +3310,7 @@ int main(int argc, char ** argv) {
|
||||
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||
json request_data = json::parse(req.body);
|
||||
std::string filename = request_data.at("filename");
|
||||
if (!validate_file_name(filename)) {
|
||||
if (!fs_validate_filename(filename)) {
|
||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
@@ -3340,7 +3340,7 @@ int main(int argc, char ** argv) {
|
||||
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||
json request_data = json::parse(req.body);
|
||||
std::string filename = request_data.at("filename");
|
||||
if (!validate_file_name(filename)) {
|
||||
if (!fs_validate_filename(filename)) {
|
||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
||||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
:: build example/main only
|
||||
:: make main
|
||||
|
||||
@@ -3,40 +3,390 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#include <shellapi.h> // For CommandLineToArgvW
|
||||
#endif
|
||||
|
||||
static void print_usage_information(const char * argv0, FILE * stream) {
|
||||
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
||||
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
||||
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
||||
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
||||
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
||||
fprintf(stream, " The possible options are:\n");
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, " -h, --help print this help and exit\n");
|
||||
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) text;
|
||||
(void) user_data;
|
||||
}
|
||||
|
||||
static std::string read_prompt_from_file(const char * filepath, bool & success) {
|
||||
success = false;
|
||||
|
||||
std::ifstream in(filepath, std::ios::binary);
|
||||
if (!in) {
|
||||
fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
|
||||
return std::string();
|
||||
}
|
||||
// do not assume the file is seekable (e.g. /dev/stdin)
|
||||
std::stringstream buffer;
|
||||
buffer << in.rdbuf();
|
||||
if (in.fail()) {
|
||||
fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
|
||||
return std::string();
|
||||
}
|
||||
|
||||
success = true;
|
||||
return buffer.str();
|
||||
}
|
||||
|
||||
//
|
||||
// Function: ingest_args(...) -> vector<string>
|
||||
//
|
||||
// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
|
||||
// strings, as an STL vector<string>.
|
||||
//
|
||||
// In particular, it handles character encoding shenanigans on Windows.
|
||||
//
|
||||
// Note: raw_argc and raw_argv are not actually read at all on Windows.
|
||||
// On Windows we call GetCommandLineW to get the arguments in wchar_t
|
||||
// format, ignoring the regular argc/argv arguments to main().
|
||||
//
|
||||
// TODO: potential opportunity to roll common stuff into common/console.cpp
|
||||
// in relation to Windows wchar_t shenanigans.
|
||||
static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
|
||||
std::vector<std::string> argv;
|
||||
|
||||
// Handle Windows, if given non-ASCII arguments.
|
||||
// We convert wchar_t arguments into UTF-8 char* on this platform.
|
||||
// Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
|
||||
// without throwing tantrums.
|
||||
#if defined(_WIN32)
|
||||
int argc;
|
||||
const LPWSTR cmdline_wargv = GetCommandLineW();
|
||||
LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
|
||||
|
||||
// silence unused arg warnings
|
||||
(void) raw_argc;
|
||||
(void) raw_argv;
|
||||
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
|
||||
char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
|
||||
GGML_ASSERT(output_buf);
|
||||
|
||||
WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
|
||||
output_buf[length_needed] = '\0';
|
||||
|
||||
argv.push_back(output_buf);
|
||||
free(output_buf);
|
||||
}
|
||||
|
||||
LocalFree((HLOCAL) wargv);
|
||||
#else
|
||||
int argc = raw_argc;
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
argv.push_back(raw_argv[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_ASSERT((unsigned int) argc == argv.size());
|
||||
|
||||
return argv;
|
||||
}
|
||||
|
||||
//
|
||||
// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
|
||||
//
|
||||
// writes a string to standard output; taking into account that on Windows
|
||||
// to display correctly you have to use special handling. Works even if the
|
||||
// user has not set a unicode code page on a Windows cmd.exe.
|
||||
//
|
||||
// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
|
||||
// a human-readable is written instead.
|
||||
//
|
||||
// On non-Windows systems, simply printfs() the string.
|
||||
static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
|
||||
invalid_utf8 = false;
|
||||
|
||||
#if defined(_WIN32)
|
||||
// Are we in a console?
|
||||
HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
|
||||
DWORD dwMode = 0;
|
||||
|
||||
// According to Microsoft docs:
|
||||
// "WriteConsole fails if it is used with a standard handle that is redirected to a file."
|
||||
// Also according to the docs, you can use GetConsoleMode to check for that.
|
||||
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
|
||||
printf("%s", str);
|
||||
return;
|
||||
}
|
||||
|
||||
// MultiByteToWideChar reports an error if str is empty, don't report
|
||||
// them as invalid_utf8.
|
||||
if (*str == 0) {
|
||||
return;
|
||||
}
|
||||
int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
|
||||
if (length_needed == 0) {
|
||||
DWORD err = GetLastError();
|
||||
if (err == ERROR_NO_UNICODE_TRANSLATION) {
|
||||
invalid_utf8 = true;
|
||||
int len = strlen(str);
|
||||
printf("<");
|
||||
for (int i = 0; i < len; ++i) {
|
||||
if (i > 0) {
|
||||
printf(" ");
|
||||
}
|
||||
printf("%02x", (uint8_t) str[i]);
|
||||
}
|
||||
printf(">");
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
|
||||
}
|
||||
|
||||
LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
|
||||
GGML_ASSERT(wstr);
|
||||
|
||||
MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
|
||||
WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
|
||||
|
||||
free(wstr);
|
||||
#else
|
||||
// TODO: reporting invalid_utf8 would be useful on non-Windows too.
|
||||
// printf will silently just write bad unicode.
|
||||
printf("%s", str);
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int raw_argc, char ** raw_argv) {
|
||||
const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
|
||||
const int argc = argv.size();
|
||||
|
||||
if (argc <= 1) {
|
||||
print_usage_information(argv[0].c_str(), stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char * model_path = argv[1];
|
||||
const char * prompt = argv[2];
|
||||
//////
|
||||
// Read out all the command line arguments.
|
||||
//////
|
||||
|
||||
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
||||
// variables where to put any arguments we see.
|
||||
bool printing_ids = false;
|
||||
bool no_bos = false;
|
||||
bool disable_logging = false;
|
||||
const char * model_path = NULL;
|
||||
const char * prompt_path = NULL;
|
||||
const char * prompt_arg = NULL;
|
||||
|
||||
// track which arguments were explicitly given
|
||||
// used for sanity checking down the line
|
||||
bool model_path_set = false;
|
||||
bool prompt_path_set = false;
|
||||
bool prompt_set = false;
|
||||
bool stdin_set = false;
|
||||
|
||||
int iarg = 1;
|
||||
for (; iarg < argc; ++iarg) {
|
||||
std::string arg{argv[iarg]};
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
print_usage_information(argv[0].c_str(), stdout);
|
||||
return 0;
|
||||
}
|
||||
else if (arg == "--ids") {
|
||||
printing_ids = true;
|
||||
}
|
||||
else if (arg == "-m" || arg == "--model") {
|
||||
if (model_path_set) {
|
||||
fprintf(stderr, "Error: -m or --model specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
model_path = argv[++iarg].c_str();
|
||||
model_path_set = true;
|
||||
}
|
||||
else if (arg == "--no-bos") {
|
||||
no_bos = true;
|
||||
}
|
||||
else if (arg == "-p" || arg == "--prompt") {
|
||||
if (prompt_set) {
|
||||
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
prompt_arg = argv[++iarg].c_str();
|
||||
prompt_set = true;
|
||||
}
|
||||
else if (arg == "-f" || arg == "--file") {
|
||||
if (prompt_path_set) {
|
||||
fprintf(stderr, "Error: -f or --file specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
prompt_path = argv[++iarg].c_str();
|
||||
prompt_path_set = true;
|
||||
}
|
||||
else if (arg == "--stdin") {
|
||||
stdin_set = true;
|
||||
}
|
||||
else if (arg == "--log-disable") {
|
||||
disable_logging = true;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
//////
|
||||
// Sanity check the command line arguments.
|
||||
//////
|
||||
|
||||
// Check that we have the required stuff set.
|
||||
if (model_path_set && model_path == NULL) {
|
||||
fprintf(stderr, "Error: --model requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
if (!model_path_set) {
|
||||
fprintf(stderr, "Error: must specify --model.\n");
|
||||
return 1;
|
||||
}
|
||||
if (prompt_path_set && prompt_path == NULL) {
|
||||
fprintf(stderr, "Error: --file requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
if (prompt_set && prompt_arg == NULL) {
|
||||
fprintf(stderr, "Error: --prompt requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
|
||||
if (prompts_set > 1) {
|
||||
fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
|
||||
return 1;
|
||||
}
|
||||
// Must have some prompt.
|
||||
if (prompts_set == 0) {
|
||||
fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
GGML_ASSERT(model_path);
|
||||
GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
|
||||
|
||||
//////
|
||||
// Figure out where will the prompt come from.
|
||||
//////
|
||||
|
||||
std::string prompt;
|
||||
if (prompt_path_set) {
|
||||
bool success = false;
|
||||
prompt = read_prompt_from_file(prompt_path, success);
|
||||
if (!success) {
|
||||
return 1;
|
||||
}
|
||||
} else if (prompt_set) {
|
||||
prompt = prompt_arg;
|
||||
} else {
|
||||
GGML_ASSERT(stdin_set);
|
||||
// we read stdin *after* loading model (early exit if model cannot
|
||||
// be loaded, which can be a nicer user experience)
|
||||
}
|
||||
|
||||
//////
|
||||
// Start actually doing the tokenizing stuff.
|
||||
//////
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
disable_logging = true;
|
||||
#endif
|
||||
|
||||
if (disable_logging) {
|
||||
llama_log_set(llama_log_callback_null, NULL);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.vocab_only = true;
|
||||
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
||||
if (!model) {
|
||||
fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "Error: could not create context.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// read entire prompt from stdin?
|
||||
if (stdin_set) {
|
||||
GGML_ASSERT(!prompt_path_set && !prompt_set);
|
||||
|
||||
std::stringstream stdin_buffer;
|
||||
stdin_buffer << std::cin.rdbuf();
|
||||
if (std::cin.fail()) {
|
||||
fprintf(stderr, "Error: could not read the entire standard input.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
prompt = stdin_buffer.str();
|
||||
}
|
||||
|
||||
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
||||
const bool add_bos = model_wants_add_bos && !no_bos;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||
|
||||
tokens = ::llama_tokenize(model, prompt, true, true);
|
||||
if (printing_ids) {
|
||||
printf("[");
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||
if (printing_ids) {
|
||||
printf("%d\n", tokens[i]);
|
||||
if (i > 0) {
|
||||
printf(", ");
|
||||
}
|
||||
printf("%d", tokens[i]);
|
||||
} else {
|
||||
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
|
||||
bool invalid_utf8 = false;
|
||||
printf("%6d -> '", tokens[i]);
|
||||
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
|
||||
if (invalid_utf8) {
|
||||
printf("' (utf-8 decode failure)\n");
|
||||
} else {
|
||||
printf("'\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (printing_ids) {
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
// silence valgrind
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
|
||||
struct ggml_tensor * t16;
|
||||
if (enable_flash_attn) {
|
||||
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
|
||||
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
||||
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
|
||||
} else {
|
||||
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
||||
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
||||
|
||||
Generated
+6
-6
@@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1714641030,
|
||||
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
|
||||
"lastModified": 1715865404,
|
||||
"narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
|
||||
"rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1714635257,
|
||||
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
|
||||
"lastModified": 1716509168,
|
||||
"narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
|
||||
"rev": "bfb7a882678e518398ce9a31a881538679f6f092",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
|
||||
// QK = number of values after dequantization
|
||||
// QK_K = super-block size
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
#define QK_K 64
|
||||
#define K_SCALE_SIZE 4
|
||||
#else
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif // GGML_QKK_64
|
||||
|
||||
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
|
||||
// QR = QK / number of values before dequantization
|
||||
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
|
||||
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
||||
#define QR4_NL 2
|
||||
|
||||
#if QK_K == 64
|
||||
#define QI4_XS QI4_NL
|
||||
#define QR4_XS QR4_NL
|
||||
#else
|
||||
#define QI4_XS (QK_K / (4*QR4_XS))
|
||||
#define QR4_XS 8
|
||||
#endif
|
||||
|
||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||
|
||||
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 3.4375 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[2];
|
||||
ggml_half d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
@@ -244,20 +225,11 @@ typedef struct {
|
||||
ggml_half d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 4-bit quantization
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_half d[2]; // super-block scales/mins
|
||||
uint8_t scales[2]; // 4-bit block scales/mins
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
union {
|
||||
struct {
|
||||
@@ -270,21 +242,11 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 5-bit quantization
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_half d; // super-block scale
|
||||
int8_t scales[QK_K/16]; // 8-bit block scales
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
union {
|
||||
struct {
|
||||
@@ -298,7 +260,6 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 6-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
@@ -356,11 +317,7 @@ typedef struct {
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
// 3.4375 bpw
|
||||
#if QK_K == 64
|
||||
#define IQ3S_N_SCALE 2
|
||||
#else
|
||||
#define IQ3S_N_SCALE QK_K/64
|
||||
#endif
|
||||
typedef struct {
|
||||
ggml_half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
|
||||
typedef struct {
|
||||
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
||||
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
||||
#if QK_K == 64
|
||||
ggml_half d;
|
||||
#endif
|
||||
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
||||
} block_iq1_m;
|
||||
#if QK_K == 64
|
||||
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
||||
#else
|
||||
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
||||
#endif
|
||||
|
||||
// Used by IQ1_M quants
|
||||
typedef union {
|
||||
@@ -406,9 +356,6 @@ typedef struct {
|
||||
} block_iq4_nl;
|
||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||
|
||||
#if QK_K == 64
|
||||
#define block_iq4_xs block_iq4_nl
|
||||
#else
|
||||
typedef struct {
|
||||
ggml_half d;
|
||||
uint16_t scales_h;
|
||||
@@ -416,7 +363,6 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2];
|
||||
} block_iq4_xs;
|
||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||
#endif
|
||||
|
||||
#endif // GGML_COMMON_DECL
|
||||
#endif // GGML_COMMON_DECL
|
||||
|
||||
+24
-9
@@ -119,6 +119,20 @@ int ggml_cuda_get_device() {
|
||||
return id;
|
||||
}
|
||||
|
||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
|
||||
auto res = hipMallocManaged(ptr, size);
|
||||
if (res == hipSuccess) {
|
||||
// if error we "need" to know why...
|
||||
CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||
}
|
||||
return res;
|
||||
#else
|
||||
return cudaMalloc(ptr, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
@@ -271,7 +285,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
@@ -537,7 +551,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
|
||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
||||
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
@@ -798,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
||||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_cuda_set_device(id);
|
||||
char * buf;
|
||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
|
||||
|
||||
// set padding to 0 to avoid possible NaN values
|
||||
if (size > original_size) {
|
||||
@@ -2510,9 +2524,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
|
||||
bool use_cuda_graph = true;
|
||||
bool cuda_graph_update_required = false;
|
||||
// pointer to CUDA cpy kernel, which is required to identify
|
||||
// vector of pointers to CUDA cpy kernels, which are required to identify
|
||||
// kernel parameters which need updated in the graph for each token
|
||||
void * ggml_cuda_cpy_fn_ptr = nullptr;
|
||||
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
|
||||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||
@@ -2588,9 +2602,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
if (node->op == GGML_OP_CPY) {
|
||||
// store the copy op parameter which changes with each token.
|
||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||
if (ggml_cuda_cpy_fn_ptr == nullptr) {
|
||||
// store a pointer to the copy op CUDA kernel to identify it later
|
||||
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
// store a pointer to each copy op CUDA kernel to identify it later
|
||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2720,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
||||
int k = 0;
|
||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||
if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
|
||||
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
|
||||
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
||||
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||
|
||||
@@ -79,13 +79,8 @@
|
||||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||
#define cudaHostUnregister hipHostUnregister
|
||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||
#ifdef GGML_HIP_UMA
|
||||
#define cudaMalloc hipMallocManaged
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
||||
#else
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||
#endif
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||
|
||||
+87
-6
@@ -1,15 +1,68 @@
|
||||
#include "concat.cuh"
|
||||
|
||||
static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
|
||||
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (nidx < ne00) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne00 +
|
||||
blockIdx.z * ne00 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
(nidx - ne00) +
|
||||
blockIdx.y * (ne0 - ne00) +
|
||||
blockIdx.z * (ne0 - ne00) * gridDim.y;
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.y < ne01) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * ne01;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx +
|
||||
(blockIdx.y - ne01) * ne0 +
|
||||
blockIdx.z * ne0 * (gridDim.y - ne01);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.z < ne02) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
@@ -25,25 +78,53 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
|
||||
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
|
||||
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
dim3 gridDim(num_blocks, ne1, ne2);
|
||||
concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
|
||||
if (dim == 0) {
|
||||
concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
|
||||
return;
|
||||
}
|
||||
if (dim == 1) {
|
||||
concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
|
||||
return;
|
||||
}
|
||||
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream);
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_cuda(
|
||||
src0_d + i3 * (src0->nb[3] / 4),
|
||||
src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * ( dst->nb[3] / 4),
|
||||
src0->ne[0], src0->ne[1], src0->ne[2],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
} else {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t n = tid/32;
|
||||
const int64_t l = tid - 32*n;
|
||||
const int64_t is = 8*n + l/16;
|
||||
@@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||
#else
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const int64_t il = tid%16; // 0...15
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
float dall = __low2half(x[i].dm);
|
||||
float dmin = __high2half(x[i].dm);
|
||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
#if QK_K == 256
|
||||
const int64_t r = threadIdx.x/4;
|
||||
const int64_t tid = r/2;
|
||||
const int64_t is0 = r%2;
|
||||
@@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
||||
const uint8_t * hm = x[i].hmask;
|
||||
|
||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||
#else
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const int64_t il = tid%16; // 0...15
|
||||
const int64_t im = il/8; // 0...1
|
||||
const int64_t in = il%8; // 0...7
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
||||
const float d = (float)x[i].d;
|
||||
|
||||
if (is == 0) {
|
||||
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
} else {
|
||||
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||
if (j < 4) {
|
||||
d = q[j] & 63; m = q[j + 4] & 63;
|
||||
@@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
@@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 32 threads
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
@@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||
}
|
||||
#else
|
||||
const int64_t tid = threadIdx.x;
|
||||
const uint8_t * q = x[i].qs;
|
||||
dst_t * y = yy + i*QK_K;
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/16; // il is in 0...3
|
||||
@@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
||||
hm <<= 1;
|
||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
#else
|
||||
const int64_t tid = threadIdx.x;
|
||||
const uint8_t q = x[i].qs[tid];
|
||||
const int64_t im = tid/8; // 0...3
|
||||
const int64_t in = tid%8; // 0...7
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const uint8_t h = x[i].qh[in] >> im;
|
||||
const float d = x[i].d;
|
||||
dst_t * y = yy + i*QK_K + tid;
|
||||
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
||||
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
#if QK_K == 256
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int64_t tid = threadIdx.x;
|
||||
@@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
||||
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||
#else
|
||||
|
||||
// assume 32 threads
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t ip = tid/16; // 0 or 1
|
||||
const int64_t il = tid - 16*ip; // 0...15
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*ip + il;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const uint8_t ql = x[i].ql[16*ip + il];
|
||||
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
||||
const int8_t * sc = x[i].scales;
|
||||
|
||||
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
||||
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
@@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#if QK_K != 64
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
@@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
@@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
#if QK_K == 64
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
|
||||
@@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
|
||||
@@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
tmp += dall * sum1 - dmin * sum2;
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
uint32_t uaux[2];
|
||||
const uint8_t * d = (const uint8_t *)uaux;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
const uint32_t * s = (const uint32_t *)x[i].scales;
|
||||
|
||||
uaux[0] = s[0] & 0x0f0f0f0f;
|
||||
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
const float2 dall = __half22float2(x[i].dm);
|
||||
|
||||
float sum1 = 0, sum2 = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
const uint8_t ql = q[l];
|
||||
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
||||
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
||||
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
||||
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
||||
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
||||
}
|
||||
tmp += dall.x * sum1 - dall.y * sum2;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
@@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
@@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
tmp += d * sum;
|
||||
|
||||
}
|
||||
#else
|
||||
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
||||
const int in = offset/8; // 0 or 1
|
||||
const int im = offset%8; // 0...7
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
const uint8_t * s = x[i].scales;
|
||||
|
||||
const float dall = (float)x[i].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
const uint8_t hl = x[i].hmask[im+l] >> in;
|
||||
const uint8_t ql = q[l];
|
||||
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
||||
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
||||
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
||||
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
@@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
|
||||
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
||||
|
||||
#if QK_K == 256
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
@@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
#endif
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const uint8_t * s = (const uint8_t *)aux16;
|
||||
|
||||
float tmp = 0;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
||||
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
||||
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
||||
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
@@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
@@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
||||
}
|
||||
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
const int im = step/8;
|
||||
const int in = step%8;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const int8_t * s = x[i].scales;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const float d = x[i].d;
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
const uint8_t h = x[i].qh[in+j] >> im;
|
||||
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
||||
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
||||
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
||||
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
|
||||
@@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
|
||||
@@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + step;
|
||||
const uint8_t * ql = x[i].ql + step;
|
||||
const uint8_t * qh = x[i].qh + step;
|
||||
const int8_t * s = x[i].scales;
|
||||
|
||||
const float d = x[i+0].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
||||
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
||||
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
||||
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
||||
}
|
||||
tmp += sum;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
|
||||
|
||||
@@ -83,7 +83,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
@@ -238,6 +238,10 @@ static __global__ void flash_attn_tile_ext_f16(
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
|
||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
|
||||
float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x];
|
||||
float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
|
||||
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
|
||||
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
|
||||
}
|
||||
@@ -237,6 +237,10 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
|
||||
const int j_VKQ = j_VKQ_0 + threadIdx.y;
|
||||
|
||||
if (ic0 + j_VKQ >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
float kqsum_j = kqsum[j_VKQ_0/nwarps];
|
||||
kqsum_j = warp_reduce_sum(kqsum_j);
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||
const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
@@ -212,6 +212,10 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||
break;
|
||||
}
|
||||
|
||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||
|
||||
@@ -223,7 +227,7 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && tid < ncols) {
|
||||
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
#else
|
||||
|
||||
@@ -91,7 +91,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
Q_h2[j][i0/WARP_SIZE] = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||
Q_h2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
|
||||
Q_h2[j][i0/WARP_SIZE].x *= scale;
|
||||
Q_h2[j][i0/WARP_SIZE].y *= scale;
|
||||
}
|
||||
@@ -200,6 +200,10 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
|
||||
break;
|
||||
}
|
||||
|
||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||
|
||||
@@ -211,7 +215,7 @@ static __global__ void flash_attn_vec_ext_f32(
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && tid < ncols) {
|
||||
if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
|
||||
dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -826,11 +826,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
|
||||
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||
|
||||
#if QK_K == 256
|
||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
||||
#else
|
||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
||||
#endif
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@@ -933,9 +929,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
|
||||
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||
|
||||
#if QK_K == 256
|
||||
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
||||
#endif
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
|
||||
@@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
||||
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
|
||||
#ifndef GGML_QKK_64
|
||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
||||
|
||||
int v[2];
|
||||
@@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||
}
|
||||
|
||||
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
||||
|
||||
#else
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
float sumf_m = 0.0f;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const uint8_t * s = (const uint8_t *)aux16;
|
||||
|
||||
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
|
||||
const float dall = bq4_K->dm[0];
|
||||
const float dmin = bq4_K->dm[1];
|
||||
|
||||
const float d8_1 = __low2float(bq8_1[0].ds);
|
||||
const float d8_2 = __low2float(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
||||
|
||||
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
||||
const int v1 = q4[0];
|
||||
const int v2 = q4[4];
|
||||
|
||||
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
||||
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
||||
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
||||
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
||||
|
||||
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
||||
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
||||
|
||||
return dall * sumf_d - dmin * sumf_m;
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
|
||||
#ifndef GGML_QKK_64
|
||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
||||
|
||||
int vl[2];
|
||||
@@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||
}
|
||||
|
||||
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
||||
|
||||
#else
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
||||
|
||||
const int8_t * s = bq5_K->scales;
|
||||
|
||||
const float d = bq5_K->d;
|
||||
|
||||
const float d8_1 = __low2half(bq8_1[0].ds);
|
||||
const float d8_2 = __low2half(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
||||
|
||||
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
||||
const int vl1 = ql[0];
|
||||
const int vl2 = ql[4];
|
||||
|
||||
const int step = 4 * (iqs/2); // 0, 4, 8, 12
|
||||
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
|
||||
const int in = step%8; // 0, 4, 0, 4
|
||||
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
|
||||
|
||||
const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
|
||||
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
|
||||
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
|
||||
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
|
||||
|
||||
const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
|
||||
+ d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
|
||||
|
||||
return d * sumf_d;
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
||||
@@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if QK_K == 256
|
||||
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
|
||||
|
||||
#if QR2_XXS == 8
|
||||
@@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
||||
}
|
||||
return d * (sumi1 + sumi2);
|
||||
#endif
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
@@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
GGML_UNUSED(ksigns64);
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
#else
|
||||
GGML_UNUSED(ksigns64);
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
// TODO
|
||||
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
@@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
|
||||
GGML_UNUSED(ksigns64);
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
#else
|
||||
GGML_UNUSED(ksigns64);
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
@@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
// TODO: don't use lookup table for signs
|
||||
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
@@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if QK_K == 256
|
||||
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
@@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||
const float d = d1q * __low2float (bq8_1[ib32].ds);
|
||||
const float m = d1q * __high2float(bq8_1[ib32].ds);
|
||||
return d * sumi + m * delta;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if QK_K == 256
|
||||
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
@@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
|
||||
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
@@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
|
||||
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
|
||||
#if QK_K == 256
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
|
||||
const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
|
||||
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
|
||||
|
||||
@@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
|
||||
sumi2 = __dp4a(v2, q8[j+4], sumi2);
|
||||
}
|
||||
return d * (sumi1 + sumi2);
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
#else
|
||||
return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
|
||||
#endif
|
||||
|
||||
@@ -144,6 +144,10 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
|
||||
+61
-27
@@ -35,6 +35,10 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_MUL_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_DIV,
|
||||
GGML_METAL_KERNEL_TYPE_DIV_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_F32,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_F16,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_I32,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_I16,
|
||||
GGML_METAL_KERNEL_TYPE_SCALE,
|
||||
GGML_METAL_KERNEL_TYPE_SCALE_4,
|
||||
GGML_METAL_KERNEL_TYPE_CLAMP,
|
||||
@@ -184,9 +188,9 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
|
||||
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
|
||||
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
|
||||
@@ -381,10 +385,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
prep[@"GGML_QKK_64"] = @(1);
|
||||
#endif
|
||||
|
||||
MTLCompileOptions* options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
@@ -489,6 +489,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I16, repeat_i16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
|
||||
@@ -638,9 +642,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
|
||||
@@ -750,6 +754,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_SQR:
|
||||
@@ -774,6 +779,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
if (op->src[0]->ne[0] == 256) {
|
||||
return false;
|
||||
}
|
||||
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
@@ -980,10 +988,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
switch (dst->op) {
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
const int64_t nb = ne00;
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
@@ -1012,7 +1020,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||
[encoder setBytes:&dim length:sizeof(dim) atIndex:27];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
@@ -1022,11 +1030,14 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
|
||||
const size_t offs = 0;
|
||||
|
||||
bool bcast_row = false;
|
||||
|
||||
int64_t nb = ne00;
|
||||
int64_t nb = ne00; // used by the "row" kernels
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
@@ -1095,6 +1106,42 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_REPEAT:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline;
|
||||
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
|
||||
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
|
||||
default: GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ACC:
|
||||
{
|
||||
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||
@@ -1773,11 +1820,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#else
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#endif
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q5_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
@@ -2018,12 +2061,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
#if QK_K == 64
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
|
||||
#else
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
|
||||
#endif
|
||||
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -2088,11 +2126,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#else
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
#endif
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q5_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
@@ -2590,7 +2624,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
|
||||
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
|
||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
|
||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
||||
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
@@ -2603,7 +2637,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
|
||||
switch (ne00) {
|
||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
|
||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
||||
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
|
||||
+69
-409
@@ -168,6 +168,53 @@ kernel void kernel_div(
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_repeat(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3 % ne03;
|
||||
const int64_t i02 = i2 % ne02;
|
||||
const int64_t i01 = i1 % ne01;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
device char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1 ;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
const int i00 = i0 % ne00;
|
||||
*((device T *)(dst_ptr + i0*nb0)) = *((device T *)(src0_ptr + i00*nb00));
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_repeat<float>) kernel_repeat_t;
|
||||
|
||||
template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
|
||||
template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
|
||||
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
|
||||
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
|
||||
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_add_row(
|
||||
@@ -2418,7 +2465,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
|
||||
//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
|
||||
|
||||
template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
|
||||
kernel void kernel_flash_attn_ext_vec_f16(
|
||||
@@ -2696,7 +2743,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
||||
}
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
|
||||
//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
|
||||
|
||||
kernel void kernel_cpy_f16_f16(
|
||||
device const half * src0,
|
||||
@@ -3319,31 +3366,30 @@ kernel void kernel_concat(
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int32_t & dim,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i03 = tgpig.z;
|
||||
const int64_t i02 = tgpig.y;
|
||||
const int64_t i01 = tgpig.x;
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
int64_t o[4] = {0, 0, 0, 0};
|
||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||
|
||||
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
|
||||
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
|
||||
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
|
||||
device const float * x;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
if (i02 < ne02) {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
|
||||
src0_ptr += ntg.x*nb00;
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (device const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
} else {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
|
||||
src1_ptr += ntg.x*nb10;
|
||||
x = (device const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
|
||||
}
|
||||
dst_ptr += ntg.x*nb0;
|
||||
|
||||
device float * y = (device float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3386,7 +3432,6 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||
|
||||
const int step = sizeof(block_q2_K) * nb;
|
||||
|
||||
#if QK_K == 256
|
||||
const int ix = tiisg/8; // 0...3
|
||||
const int it = tiisg%8; // 0...7
|
||||
const int iq = it/4; // 0 or 1
|
||||
@@ -3438,57 +3483,6 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||
|
||||
y4 += 4 * QK_K;
|
||||
}
|
||||
#else
|
||||
const int ix = tiisg/2; // 0...15
|
||||
const int it = tiisg%2; // 0...1
|
||||
|
||||
device const float * y4 = y + ix * QK_K + 8 * it;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += 16) {
|
||||
|
||||
float4 sumy = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
|
||||
yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
|
||||
yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
|
||||
yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
|
||||
}
|
||||
|
||||
device const uint8_t * sc = (device const uint8_t *)x[ib].scales;
|
||||
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
|
||||
device const half * dh = &x[ib].d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
|
||||
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
|
||||
acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
|
||||
acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
|
||||
acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
|
||||
acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
|
||||
acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
|
||||
acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
|
||||
acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
|
||||
}
|
||||
|
||||
float dall = dh[0];
|
||||
float dmin = dh[1];
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
|
||||
(acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
|
||||
(acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
|
||||
(acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
|
||||
dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
|
||||
|
||||
qs += step/2;
|
||||
sc += step;
|
||||
dh += step/2;
|
||||
}
|
||||
|
||||
y4 += 16 * QK_K;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
@@ -3526,7 +3520,6 @@ kernel void kernel_mul_mv_q2_K_f32(
|
||||
kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
void kernel_mul_mv_q3_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -3685,84 +3678,6 @@ void kernel_mul_mv_q3_K_f32_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void kernel_mul_mv_q3_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
const int row = 2 * r0 + sgitg;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
const int ix = tiisg/4;
|
||||
const int il = 4 * (tiisg%4);// 0, 4, 8, 12
|
||||
const int iq = il/8; // 0, 0, 1, 1
|
||||
const int in = il%8; // 0, 4, 0, 4
|
||||
|
||||
float2 sum = {0.f, 0.f};
|
||||
|
||||
for (int i = ix; i < nb; i += 8) {
|
||||
|
||||
const float d_all = (float)(x[i].d);
|
||||
|
||||
device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
|
||||
device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
|
||||
device const uint16_t * s = (device const uint16_t *)(x[i].scales);
|
||||
device const float * y = yy + i * QK_K + il;
|
||||
|
||||
const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
|
||||
const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
|
||||
const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
|
||||
const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
|
||||
|
||||
for (int l = 0; l < 4; l += 2) {
|
||||
const uint16_t hm = h[l/2] >> iq;
|
||||
sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4))
|
||||
+ y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
|
||||
+ y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
|
||||
+ y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
|
||||
sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
|
||||
+ y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
|
||||
+ y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
|
||||
+ y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
|
||||
}
|
||||
|
||||
}
|
||||
const float sumf = sum[0] + sum[1] * 1.f/256.f;
|
||||
|
||||
const float tot = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
[[host_name("kernel_mul_mv_q3_K_f32")]]
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
@@ -3792,7 +3707,6 @@ kernel void kernel_mul_mv_q3_K_f32(
|
||||
kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
void kernel_mul_mv_q4_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -3906,103 +3820,6 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void kernel_mul_mv_q4_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int ix = tiisg/4; // 0...7
|
||||
const int it = tiisg%4; // 0...3
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
const int first_row = r0 * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[8];
|
||||
float yh[8];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int step = sizeof(block_q4_K) * nb / 2;
|
||||
|
||||
device const float * y4 = y + ix * QK_K + 8 * it;
|
||||
|
||||
uint16_t sc16[4];
|
||||
|
||||
for (int ib = ix; ib < nb; ib += 8) {
|
||||
|
||||
float2 sumy = {0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i] = y4[i+ 0]; sumy[0] += yl[i];
|
||||
yh[i] = y4[i+32]; sumy[1] += yh[i];
|
||||
}
|
||||
|
||||
device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
|
||||
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
|
||||
device const half * dh = x[ib].d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
sc16[0] = sc[0] & 0x000f;
|
||||
sc16[1] = sc[0] & 0x0f00;
|
||||
sc16[2] = sc[0] & 0x00f0;
|
||||
sc16[3] = sc[0] & 0xf000;
|
||||
|
||||
float2 acc1 = {0.f, 0.f};
|
||||
float2 acc2 = {0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
|
||||
acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
|
||||
acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
|
||||
acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
|
||||
}
|
||||
|
||||
float dall = dh[0];
|
||||
float dmin = dh[1];
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
|
||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
|
||||
dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
|
||||
|
||||
qs += step;
|
||||
sc += step;
|
||||
dh += step;
|
||||
}
|
||||
|
||||
y4 += 8 * QK_K;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
[[host_name("kernel_mul_mv_q4_K_f32")]]
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
@@ -4070,8 +3887,6 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||
|
||||
const int step = sizeof(block_q5_K) * nb;
|
||||
|
||||
#if QK_K == 256
|
||||
#
|
||||
float yl[16], yh[16];
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
@@ -4154,54 +3969,6 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||
y1 += 4 * QK_K;
|
||||
|
||||
}
|
||||
#else
|
||||
float yl[8], yh[8];
|
||||
|
||||
const int il = 4 * (tiisg/8); // 0, 4, 8, 12
|
||||
const int ix = tiisg%8;
|
||||
const int iq = il/8; // 0, 0, 1, 1
|
||||
const int in = il%8; // 0, 4, 0, 4
|
||||
|
||||
device const float * y = yy + ix*QK_K + il;
|
||||
|
||||
for (int i = ix; i < nb; i += 8) {
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
yl[l+0] = y[l+ 0];
|
||||
yl[l+4] = y[l+16];
|
||||
yh[l+0] = y[l+32];
|
||||
yh[l+4] = y[l+48];
|
||||
}
|
||||
|
||||
device const half * dh = &x[i].d;
|
||||
device const uint8_t * q = x[i].qs + il;
|
||||
device const uint8_t * h = x[i].qh + in;
|
||||
device const int8_t * s = x[i].scales;
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
|
||||
const float d = dh[0];
|
||||
|
||||
float2 acc = {0.f, 0.f};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t hl = h[l] >> iq;
|
||||
acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
|
||||
+ yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
|
||||
acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
|
||||
+ yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
|
||||
}
|
||||
sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
|
||||
|
||||
q += step;
|
||||
h += step;
|
||||
s += step;
|
||||
dh += step/2;
|
||||
|
||||
}
|
||||
|
||||
y += 8 * QK_K;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
@@ -4280,7 +4047,6 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if QK_K == 256
|
||||
const int tid = tiisg/2;
|
||||
const int ix = tiisg%2;
|
||||
const int ip = tid/8; // 0 or 1
|
||||
@@ -4316,30 +4082,6 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
const int ix = tiisg/4;
|
||||
const int il = 4*(tiisg%4);
|
||||
|
||||
for (int i = ix; i < nb; i += 8) {
|
||||
device const float * y = yy + i * QK_K + il;
|
||||
device const uint8_t * ql = x[i].ql + il;
|
||||
device const uint8_t * qh = x[i].qh + il;
|
||||
device const int8_t * s = x[i].scales;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
|
||||
sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
|
||||
sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
|
||||
sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
|
||||
}
|
||||
sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
const float tot = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
|
||||
@@ -5173,9 +4915,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
#if QK_K != 64
|
||||
iq1m_scale_t scale;
|
||||
#endif
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
@@ -5196,10 +4936,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
device const uint16_t * sc = (device const uint16_t *)xr->scales;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
#if QK_K != 64
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
#endif
|
||||
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
|
||||
@@ -5215,14 +4952,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
}
|
||||
const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
#if QK_K == 64
|
||||
const float d = (float) *((device const half *)(sc - 1));
|
||||
sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
|
||||
(sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
|
||||
#else
|
||||
|
||||
sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
|
||||
(sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
|
||||
#endif
|
||||
|
||||
sc += nb*sizeof(block_iq1_m)/2;
|
||||
qs += nb*sizeof(block_iq1_m);
|
||||
@@ -5334,7 +5066,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
||||
}
|
||||
}
|
||||
|
||||
#if QK_K != 64
|
||||
void kernel_mul_mv_iq4_xs_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -5429,7 +5160,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
||||
kernel void kernel_mul_mv_iq1_s_f32(
|
||||
@@ -5542,11 +5272,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
#if QK_K == 64
|
||||
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
#else
|
||||
kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
#endif
|
||||
}
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
@@ -5672,10 +5398,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
|
||||
float dl, ml;
|
||||
uint8_t sc = xb->scales[il];
|
||||
|
||||
#if QK_K == 256
|
||||
q = q + 32*(il/8) + 16*(il&1);
|
||||
il = (il/2)%4;
|
||||
#endif
|
||||
|
||||
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
||||
@@ -5691,7 +5416,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
||||
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
#if QK_K == 256
|
||||
q = q + 32 * (il/8) + 16 * (il&1);
|
||||
h = h + 16 * (il&1);
|
||||
uint8_t m = 1 << (il/2);
|
||||
@@ -5712,17 +5436,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
||||
}
|
||||
#else
|
||||
float kcoef = il&1 ? 1.f/16.f : 1.f;
|
||||
uint16_t kmask = il&1 ? 0xF0 : 0x0F;
|
||||
float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
|
||||
float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
uint8_t m = 1<<(il*2);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
||||
@@ -5734,7 +5447,6 @@ template <typename type4x4>
|
||||
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
|
||||
device const uchar * q = xb->qs;
|
||||
|
||||
#if QK_K == 256
|
||||
short is = (il/4) * 2;
|
||||
q = q + (il/4) * 32 + 16 * (il&1);
|
||||
il = il & 3;
|
||||
@@ -5743,16 +5455,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
#else
|
||||
(void) get_scale_min_k4_just2;
|
||||
|
||||
q = q + 16 * (il&1);
|
||||
device const uint8_t * s = xb->scales;
|
||||
device const half2 * dh = (device const half2 *)xb->d;
|
||||
const float2 d = (float2)dh[0];
|
||||
const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
|
||||
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
|
||||
#endif
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
@@ -5764,7 +5467,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
||||
device const uint8_t * q = xb->qs;
|
||||
device const uint8_t * qh = xb->qh;
|
||||
|
||||
#if QK_K == 256
|
||||
short is = (il/4) * 2;
|
||||
q = q + 32 * (il/4) + 16 * (il&1);
|
||||
qh = qh + 16 * (il&1);
|
||||
@@ -5781,17 +5483,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
||||
}
|
||||
#else
|
||||
q = q + 16 * (il&1);
|
||||
device const int8_t * s = xb->scales;
|
||||
const float dl = xb->d * s[il];
|
||||
uint8_t m = 1<<(il*2);
|
||||
const float coef = il<2 ? 1.f : 1.f/16.f;
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
@@ -5801,15 +5492,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
|
||||
device const uint8_t * qh = (device const uint8_t *)xb->qh;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
#if QK_K == 256
|
||||
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
qh = qh + 32*(il/8) + 16*(il&1);
|
||||
float sc = scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
#else
|
||||
ql = ql + 16 * (il&1);
|
||||
float sc = scales[il];
|
||||
#endif
|
||||
|
||||
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
|
||||
const float coef = il>1 ? 1.f/16.f : 1.f;
|
||||
@@ -5966,20 +5653,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
device const uint16_t * sc = (device const uint16_t *)xb->scales;
|
||||
#if QK_K == 64
|
||||
const float d = xb->d;
|
||||
#else
|
||||
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const float d = scale.f16;
|
||||
#endif
|
||||
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * qh = xb->qh + 2*ib32 + il;
|
||||
#if QK_K == 64
|
||||
const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
|
||||
#else
|
||||
|
||||
const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
|
||||
#endif
|
||||
const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||
@@ -6009,9 +5691,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
||||
#if QK_K == 64
|
||||
dequantize_iq4_nl(xb, il, reg);
|
||||
#else
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
@@ -6028,7 +5707,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
@@ -6533,11 +6211,7 @@ kernel void kernel_mul_mm_id(
|
||||
sgitg);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
#define QK_NL 16
|
||||
#else
|
||||
#define QK_NL 4
|
||||
#endif
|
||||
|
||||
//
|
||||
// get rows
|
||||
@@ -6577,11 +6251,7 @@ template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_r
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
#if QK_K == 64
|
||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
|
||||
#else
|
||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
#endif
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
@@ -6609,11 +6279,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_m
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
#if QK_K == 64
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_xs>;
|
||||
#else
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
#endif
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
@@ -6641,11 +6307,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
#if QK_K == 64
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
|
||||
#else
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
#endif
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
@@ -6854,7 +6516,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
|
||||
template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
|
||||
#if QK_K != 64
|
||||
template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
|
||||
#endif
|
||||
|
||||
|
||||
+1
-1
@@ -1,4 +1,4 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-opencl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
|
||||
+76
-2664
File diff suppressed because it is too large
Load Diff
+74
-57
@@ -6,6 +6,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#ifdef _WIN32
|
||||
@@ -47,6 +48,7 @@ struct socket_t {
|
||||
sockfd_t fd;
|
||||
socket_t(sockfd_t fd) : fd(fd) {}
|
||||
~socket_t() {
|
||||
GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
|
||||
#ifdef _WIN32
|
||||
closesocket(this->fd);
|
||||
#else
|
||||
@@ -97,7 +99,7 @@ static ggml_guid_t ggml_backend_rpc_guid() {
|
||||
}
|
||||
|
||||
struct ggml_backend_rpc_buffer_type_context {
|
||||
std::shared_ptr<socket_t> sock;
|
||||
std::string endpoint;
|
||||
std::string name;
|
||||
size_t alignment;
|
||||
size_t max_size;
|
||||
@@ -106,8 +108,6 @@ struct ggml_backend_rpc_buffer_type_context {
|
||||
struct ggml_backend_rpc_context {
|
||||
std::string endpoint;
|
||||
std::string name;
|
||||
std::shared_ptr<socket_t> sock;
|
||||
ggml_backend_buffer_type_t buft;
|
||||
};
|
||||
|
||||
struct ggml_backend_rpc_buffer_context {
|
||||
@@ -231,14 +231,13 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool parse_endpoint(const char * endpoint, std::string & host, int & port) {
|
||||
std::string str(endpoint);
|
||||
size_t pos = str.find(':');
|
||||
static bool parse_endpoint(const std::string & endpoint, std::string & host, int & port) {
|
||||
size_t pos = endpoint.find(':');
|
||||
if (pos == std::string::npos) {
|
||||
return false;
|
||||
}
|
||||
host = str.substr(0, pos);
|
||||
port = std::stoi(str.substr(pos + 1));
|
||||
host = endpoint.substr(0, pos);
|
||||
port = std::stoi(endpoint.substr(pos + 1));
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -273,6 +272,44 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
|
||||
|
||||
// RPC client-side implementation
|
||||
|
||||
static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
static std::unordered_map<std::string, std::weak_ptr<socket_t>> sockets;
|
||||
static bool initialized = false;
|
||||
|
||||
auto it = sockets.find(endpoint);
|
||||
if (it != sockets.end()) {
|
||||
if (auto sock = it->second.lock()) {
|
||||
return sock;
|
||||
}
|
||||
}
|
||||
std::string host;
|
||||
int port;
|
||||
if (!parse_endpoint(endpoint, host, port)) {
|
||||
return nullptr;
|
||||
}
|
||||
#ifdef _WIN32
|
||||
if (!initialized) {
|
||||
WSADATA wsaData;
|
||||
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
||||
if (res != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
initialized = true;
|
||||
}
|
||||
#else
|
||||
UNUSED(initialized);
|
||||
#endif
|
||||
auto sock = socket_connect(host.c_str(), port);
|
||||
if (sock == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||
sockets[endpoint] = sock;
|
||||
return sock;
|
||||
}
|
||||
|
||||
GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
@@ -442,7 +479,8 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
||||
std::vector<uint8_t> input(input_size, 0);
|
||||
memcpy(input.data(), &size, sizeof(size));
|
||||
std::vector<uint8_t> output;
|
||||
bool status = send_rpc_cmd(buft_ctx->sock, ALLOC_BUFFER, input, output);
|
||||
auto sock = get_socket(buft_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, ALLOC_BUFFER, input, output);
|
||||
GGML_ASSERT(status);
|
||||
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
|
||||
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
|
||||
@@ -453,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
||||
if (remote_ptr != 0) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||
ggml_backend_rpc_buffer_interface,
|
||||
new ggml_backend_rpc_buffer_context{buft_ctx->sock, {}, remote_ptr, "RPC"},
|
||||
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
||||
remote_size);
|
||||
return buffer;
|
||||
} else {
|
||||
@@ -508,7 +546,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend
|
||||
}
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
return buft_ctx->sock == rpc_ctx->sock;
|
||||
return buft_ctx->endpoint == rpc_ctx->endpoint;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
||||
@@ -521,7 +559,6 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
||||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
|
||||
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
|
||||
@@ -530,16 +567,13 @@ GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)rpc_ctx->buft->context;
|
||||
delete buft_ctx;
|
||||
delete rpc_ctx->buft;
|
||||
delete rpc_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
return ctx->buft;
|
||||
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||
@@ -590,7 +624,8 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
|
||||
std::vector<uint8_t> input;
|
||||
serialize_graph(cgraph, input);
|
||||
std::vector<uint8_t> output;
|
||||
bool status = send_rpc_cmd(rpc_ctx->sock, GRAPH_COMPUTE, input, output);
|
||||
auto sock = get_socket(rpc_ctx->endpoint);
|
||||
bool status = send_rpc_cmd(sock, GRAPH_COMPUTE, input, output);
|
||||
GGML_ASSERT(status);
|
||||
GGML_ASSERT(output.size() == 1);
|
||||
return (enum ggml_status)output[0];
|
||||
@@ -624,65 +659,48 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
static std::unordered_map<std::string, ggml_backend_t> instances;
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
||||
return backend != nullptr ? ggml_backend_rpc_get_default_buffer_type(backend) : nullptr;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||
std::string endpoint_str(endpoint);
|
||||
if (instances.find(endpoint_str) != instances.end()) {
|
||||
return instances[endpoint_str];
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
// NOTE: buffer types are allocated and never freed; this is by design
|
||||
static std::unordered_map<std::string, ggml_backend_buffer_type_t> buft_map;
|
||||
auto it = buft_map.find(endpoint);
|
||||
if (it != buft_map.end()) {
|
||||
return it->second;
|
||||
}
|
||||
#ifdef _WIN32
|
||||
{
|
||||
WSADATA wsaData;
|
||||
int res = WSAStartup(MAKEWORD(2, 2), &wsaData);
|
||||
if (res != 0) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
fprintf(stderr, "Connecting to %s\n", endpoint);
|
||||
std::string host;
|
||||
int port;
|
||||
if (!parse_endpoint(endpoint, host, port)) {
|
||||
return nullptr;
|
||||
}
|
||||
auto sock = socket_connect(host.c_str(), port);
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
size_t alignment = get_alignment(sock);
|
||||
size_t max_size = get_max_size(sock);
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = new ggml_backend_rpc_buffer_type_context {
|
||||
/* .sock = */ sock,
|
||||
/* .name = */ "RPC" + std::to_string(sock->fd),
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||
/* .alignment = */ alignment,
|
||||
/* .max_size = */ max_size
|
||||
/* .max_size = */ max_size
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
|
||||
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
||||
/* .context = */ buft_ctx
|
||||
};
|
||||
buft_map[endpoint] = buft;
|
||||
return buft;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .name = */ "RPC" + std::to_string(sock->fd),
|
||||
/* .sock = */ sock,
|
||||
/* .buft = */ buft
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .name = */ "RPC",
|
||||
};
|
||||
|
||||
instances[endpoint] = new ggml_backend {
|
||||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_rpc_guid(),
|
||||
/* .interface = */ ggml_backend_rpc_interface,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
return instances[endpoint];
|
||||
return backend;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||
@@ -706,14 +724,13 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
|
||||
if (backend == nullptr) {
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
return;
|
||||
}
|
||||
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
get_device_memory(ctx->sock, free, total);
|
||||
get_device_memory(sock, free, total);
|
||||
}
|
||||
|
||||
// RPC server-side implementation
|
||||
|
||||
+283
-561
File diff suppressed because it is too large
Load Diff
+652
-583
File diff suppressed because it is too large
Load Diff
+170
-91
@@ -290,6 +290,7 @@ struct vk_op_rope_neox_push_constants {
|
||||
float corr_dims[4];
|
||||
float theta_scale;
|
||||
float inv_ndims;
|
||||
uint32_t has_freq_facs;
|
||||
};
|
||||
|
||||
struct vk_op_soft_max_push_constants {
|
||||
@@ -1522,8 +1523,8 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
|
||||
}
|
||||
@@ -3732,7 +3733,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
|
||||
}
|
||||
|
||||
|
||||
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
|
||||
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_ADD:
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
@@ -3853,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GGML_UNUSED(src2);
|
||||
}
|
||||
|
||||
static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
||||
@@ -3880,12 +3883,15 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
|
||||
}
|
||||
|
||||
template<typename PC>
|
||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
||||
static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
||||
if (src1 != nullptr) {
|
||||
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
||||
}
|
||||
if (src2 != nullptr) {
|
||||
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
|
||||
}
|
||||
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
||||
@@ -3896,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
const uint64_t ne02 = src0->ne[2];
|
||||
const uint64_t ne03 = src0->ne[3];
|
||||
const uint64_t ne0 = ne00 * ne01;
|
||||
|
||||
const bool use_src1 = src1 != nullptr;
|
||||
const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
|
||||
const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
|
||||
@@ -3904,7 +3911,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
const uint64_t ne1 = ne10 * ne11;
|
||||
// const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
|
||||
|
||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
|
||||
const bool use_src2 = src2 != nullptr;
|
||||
const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
|
||||
const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
|
||||
const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
|
||||
const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
|
||||
const uint64_t ne2 = ne20 * ne21;
|
||||
|
||||
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
|
||||
ggml_vk_func_t op_func;
|
||||
|
||||
if (pipeline == nullptr) {
|
||||
@@ -3927,15 +3941,18 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
|
||||
|
||||
vk_buffer d_X = nullptr;
|
||||
size_t x_buf_offset = 0;
|
||||
vk_buffer d_Y = nullptr;
|
||||
size_t y_buf_offset = 0;
|
||||
vk_buffer d_Z = nullptr;
|
||||
size_t z_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
bool src2_uma = false;
|
||||
|
||||
if (ctx->device->uma) {
|
||||
ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
|
||||
@@ -3944,10 +3961,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
|
||||
src1_uma = d_Y != nullptr;
|
||||
}
|
||||
if (use_src2) {
|
||||
ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
|
||||
src2_uma = d_Z != nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
||||
|
||||
vk_buffer d_D = extra->buffer_gpu.lock();
|
||||
@@ -3970,10 +3992,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
y_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Y != nullptr);
|
||||
}
|
||||
if (use_src2 && !src2_uma) {
|
||||
d_Z = extra_src2->buffer_gpu.lock();
|
||||
z_buf_offset = extra_src2->offset;
|
||||
GGML_ASSERT(d_Z != nullptr);
|
||||
}
|
||||
|
||||
if (op_supports_incontiguous) {
|
||||
x_sz = ggml_nbytes(src0);
|
||||
y_sz = use_src1 ? ggml_nbytes(src1) : 0;
|
||||
z_sz = use_src2 ? ggml_nbytes(src2) : 0;
|
||||
d_sz = ggml_nbytes(dst);
|
||||
|
||||
if (x_buf_offset + x_sz >= d_X->size) {
|
||||
@@ -3982,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
|
||||
z_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
if (d_buf_offset + d_sz >= d_D->size) {
|
||||
d_sz = VK_WHOLE_SIZE;
|
||||
}
|
||||
@@ -4021,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
if (use_src1 && y_sz != VK_WHOLE_SIZE) {
|
||||
y_sz *= ne12 * ne13;
|
||||
}
|
||||
if (use_src2 && z_sz != VK_WHOLE_SIZE) {
|
||||
z_sz *= ne22 * ne23;
|
||||
}
|
||||
if (d_sz != VK_WHOLE_SIZE) {
|
||||
d_sz *= ne02 * ne03;
|
||||
}
|
||||
}
|
||||
|
||||
if (op == GGML_OP_SOFT_MAX) {
|
||||
// Empty src1 is possible on soft_max, but the shader needs a buffer
|
||||
// Empty src1 is possible in soft_max, but the shader needs a buffer
|
||||
vk_subbuffer subbuf_y;
|
||||
if (use_src1) {
|
||||
subbuf_y = { d_Y, y_buf_offset, y_sz };
|
||||
@@ -4037,6 +4071,28 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else if (op == GGML_OP_ROPE) {
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
const bool is_neox = mode & 2;
|
||||
|
||||
if (is_neox) {
|
||||
// Empty src2 is possible in rope, but the shader needs a buffer
|
||||
vk_subbuffer subbuf_z;
|
||||
if (use_src2) {
|
||||
subbuf_z = { d_Z, z_buf_offset, z_sz };
|
||||
} else {
|
||||
subbuf_z = { d_X, 0, d_X->size };
|
||||
}
|
||||
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
}
|
||||
} else if (use_src2) {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
} else if (use_src1) {
|
||||
ggml_vk_sync_buffers(subctx);
|
||||
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
|
||||
@@ -4047,6 +4103,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
} else {
|
||||
GGML_ASSERT(op != GGML_OP_SOFT_MAX);
|
||||
GGML_ASSERT(op != GGML_OP_ARGSORT);
|
||||
GGML_ASSERT(!use_src2);
|
||||
|
||||
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
|
||||
|
||||
@@ -4088,7 +4145,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
||||
}
|
||||
|
||||
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -4096,7 +4153,7 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, {
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
@@ -4111,7 +4168,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ADD, {
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
@@ -4126,7 +4183,7 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_MUL, {
|
||||
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
||||
@@ -4141,7 +4198,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4154,7 +4211,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4168,7 +4225,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
|
||||
const uint32_t src0_type_size = ggml_type_size(src0->type);
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4183,7 +4240,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
||||
const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
|
||||
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
|
||||
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
|
||||
(uint32_t)ggml_nelements(src0),
|
||||
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
||||
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
|
||||
@@ -4195,21 +4252,21 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
|
||||
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
float * op_params = (float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
|
||||
}
|
||||
|
||||
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
||||
int32_t * op_params = (int32_t *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
||||
ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
|
||||
}
|
||||
|
||||
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -4228,7 +4285,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, {
|
||||
ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
|
||||
ncols,
|
||||
src1 != nullptr ? nrows_y : (uint32_t)0,
|
||||
scale, max_bias,
|
||||
@@ -4237,11 +4294,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||
});
|
||||
}
|
||||
|
||||
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
#pragma message("TODO: implement phi3 frequency factors support")
|
||||
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
|
||||
GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
|
||||
|
||||
static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||
const int mode = ((int32_t *) dst->op_params)[2];
|
||||
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
||||
@@ -4264,12 +4317,13 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
||||
if (is_neox) {
|
||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
||||
const float inv_ndims = -1.0f / n_dims;
|
||||
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
||||
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
||||
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
|
||||
src2 != nullptr,
|
||||
});
|
||||
} else {
|
||||
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
|
||||
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
|
||||
(uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
|
||||
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
|
||||
});
|
||||
@@ -4292,7 +4346,7 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
|
||||
|
||||
std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
|
||||
|
||||
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, {
|
||||
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
||||
ncols,
|
||||
ncols_pad,
|
||||
op_params[0],
|
||||
@@ -5408,6 +5462,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
|
||||
const ggml_tensor * src0 = node->src[0];
|
||||
const ggml_tensor * src1 = node->src[1];
|
||||
const ggml_tensor * src2 = node->src[2];
|
||||
|
||||
switch (node->op) {
|
||||
case GGML_OP_UNARY:
|
||||
@@ -5524,7 +5579,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
|
||||
ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
|
||||
|
||||
break;
|
||||
case GGML_OP_ARGSORT:
|
||||
@@ -5957,6 +6012,8 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
||||
};
|
||||
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
|
||||
ggml_vk_instance_init();
|
||||
|
||||
#ifdef GGML_VULKAN_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
|
||||
#endif
|
||||
@@ -6500,7 +6557,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
|
||||
for (int j = 0; j < level; j++) {
|
||||
std::cerr << " ";
|
||||
}
|
||||
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
|
||||
std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
|
||||
|
||||
done.push_back(tensor);
|
||||
|
||||
@@ -6550,7 +6607,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
||||
static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
@@ -6561,12 +6618,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
||||
}
|
||||
|
||||
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
||||
if (tensor->src[0] != nullptr) {
|
||||
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
|
||||
std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
|
||||
}
|
||||
if (tensor->src[1] != nullptr) {
|
||||
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
||||
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
||||
}
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
||||
@@ -6577,43 +6634,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||
free(tensor_data);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
return;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float val = 0.0f;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
|
||||
}
|
||||
if (std::isnan(val)) {
|
||||
std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
|
||||
std::cerr << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
|
||||
std::cerr << std::endl;
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void * comp_result;
|
||||
size_t comp_size;
|
||||
size_t comp_nb[GGML_MAX_DIMS];
|
||||
@@ -6637,6 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
ggml_tensor * src0 = tensor->src[0];
|
||||
ggml_tensor * src1 = tensor->src[1];
|
||||
ggml_tensor * src2 = tensor->src[2];
|
||||
|
||||
struct ggml_init_params iparams = {
|
||||
/*.mem_size =*/ 1024*1024*1024,
|
||||
@@ -6666,10 +6692,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
src0_buffer = malloc(src0_size);
|
||||
src0_clone->data = src0_buffer;
|
||||
if (src0->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
if (ggml_backend_buffer_is_host(src0->buffer)) {
|
||||
memcpy(src0_clone->data, src0->data, src0_size);
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
} else if (ggml_backend_buffer_is_vk(src0->buffer)) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
||||
uint64_t offset = extra->offset;
|
||||
@@ -6700,8 +6726,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
ggml_vk_print_tensor(ctx, src0, "src0");
|
||||
}
|
||||
|
||||
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
|
||||
}
|
||||
if (src1 != nullptr) {
|
||||
src1_clone = ggml_dup_tensor(ggml_ctx, src1);
|
||||
@@ -6710,10 +6734,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
src1_buffer = malloc(src1_size);
|
||||
src1_clone->data = src1_buffer;
|
||||
if (src1->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
if (ggml_backend_buffer_is_host(src1->buffer)) {
|
||||
memcpy(src1_clone->data, src1->data, src1_size);
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
} else if (ggml_backend_buffer_is_vk(src1->buffer)) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
||||
uint64_t offset = extra->offset;
|
||||
@@ -6744,12 +6768,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
ggml_vk_print_tensor(ctx, src1, "src1");
|
||||
std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
|
||||
std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
|
||||
std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
|
||||
if (src1->src[0] != nullptr) {
|
||||
std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
|
||||
std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
|
||||
}
|
||||
if (src1->src[1] != nullptr) {
|
||||
std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
|
||||
std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
|
||||
}
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
|
||||
@@ -6760,8 +6784,64 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(src1_clone, done);
|
||||
}
|
||||
}
|
||||
if (src2 != nullptr) {
|
||||
src2_clone = ggml_dup_tensor(ggml_ctx, src2);
|
||||
|
||||
ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
|
||||
src2_size = ggml_nbytes(src2);
|
||||
|
||||
src2_buffer = malloc(src2_size);
|
||||
src2_clone->data = src2_buffer;
|
||||
if (ggml_backend_buffer_is_host(src2->buffer)) {
|
||||
memcpy(src2_clone->data, src2->data, src2_size);
|
||||
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
} else if (ggml_backend_buffer_is_vk(src2->buffer)) {
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
|
||||
vk_buffer buffer_gpu = extra->buffer_gpu.lock();
|
||||
uint64_t offset = extra->offset;
|
||||
if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
|
||||
for (int i3 = 0; i3 < src2->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src2->ne[2]; i2++) {
|
||||
const int idx = i3*src2->ne[2] + i2;
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
src2_clone->nb[0] = src2->nb[0];
|
||||
src2_clone->nb[1] = src2->nb[1];
|
||||
for (int i = 2; i < GGML_MAX_DIMS; i++) {
|
||||
src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
|
||||
}
|
||||
} else {
|
||||
if (offset + src2_size >= buffer_gpu->size) {
|
||||
src2_size = buffer_gpu->size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
|
||||
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
ggml_vk_print_tensor(ctx, src2, "src2");
|
||||
std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
|
||||
std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
|
||||
if (src2->src[0] != nullptr) {
|
||||
std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
|
||||
}
|
||||
if (src2->src[1] != nullptr) {
|
||||
std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
|
||||
}
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
|
||||
std::cerr << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
|
||||
std::cerr << std::endl;
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(src2_clone, done);
|
||||
}
|
||||
}
|
||||
|
||||
if (tensor->op == GGML_OP_MUL_MAT) {
|
||||
@@ -6799,7 +6879,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
float attn_factor = ((float *) tensor->op_params)[8];
|
||||
float beta_fast = ((float *) tensor->op_params)[9];
|
||||
float beta_slow = ((float *) tensor->op_params)[10];
|
||||
tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
} else if (tensor->op == GGML_OP_UNARY) {
|
||||
switch (ggml_get_unary_op(tensor)) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
@@ -6847,7 +6927,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
|
||||
|
||||
ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
|
||||
}
|
||||
@@ -6888,7 +6967,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
@@ -6936,12 +7015,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
|
||||
std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||
if (src0 != nullptr) {
|
||||
std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||
std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||
}
|
||||
if (src1 != nullptr) {
|
||||
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||
std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||
}
|
||||
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
@@ -6977,12 +7056,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
|
||||
std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||
if (src0 != nullptr) {
|
||||
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||
}
|
||||
if (src1 != nullptr) {
|
||||
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||
}
|
||||
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
@@ -7001,12 +7080,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
|
||||
if (avg_err > 0.05 || std::isnan(avg_err)) {
|
||||
std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||
std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
|
||||
if (src0 != nullptr) {
|
||||
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||
std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
|
||||
}
|
||||
if (src1 != nullptr) {
|
||||
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||
std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
|
||||
}
|
||||
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
@@ -7018,14 +7097,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
GGML_ASSERT(false);
|
||||
} else {
|
||||
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
|
||||
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
|
||||
}
|
||||
|
||||
free(comp_result);
|
||||
comp_result = nullptr;
|
||||
comp_size = 0;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
|
||||
if (ggml_backend_buffer_is_vk(tensor->buffer)) {
|
||||
free(tensor_data);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -871,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
},
|
||||
[GGML_TYPE_IQ4_XS] = {
|
||||
.type_name = "iq4_xs",
|
||||
#if QK_K == 64
|
||||
.blck_size = QK4_NL,
|
||||
#else
|
||||
.blck_size = QK_K,
|
||||
#endif
|
||||
.type_size = sizeof(block_iq4_xs),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
||||
.from_float = quantize_row_iq4_xs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||
#if QK_K == 64
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#else
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
#endif
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q8_K] = {
|
||||
@@ -2678,9 +2670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"ARGSORT",
|
||||
"LEAKY_RELU",
|
||||
|
||||
"FLASH_ATTN",
|
||||
"FLASH_ATTN_EXT",
|
||||
"FLASH_FF",
|
||||
"FLASH_ATTN_BACK",
|
||||
"SSM_CONV",
|
||||
"SSM_SCAN",
|
||||
@@ -2706,7 +2696,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
||||
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -2768,9 +2758,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"argsort(x)",
|
||||
"leaky_relu(x)",
|
||||
|
||||
"flash_attn(x)",
|
||||
"flash_attn_ext(x)",
|
||||
"flash_ff(x)",
|
||||
"flash_attn_back(x)",
|
||||
"ssm_conv(x)",
|
||||
"ssm_scan(x)",
|
||||
@@ -2796,7 +2784,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"cross_entropy_loss_back(x,y)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
||||
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -4894,10 +4882,21 @@ struct ggml_tensor * ggml_repeat_back(
|
||||
// ggml_concat
|
||||
|
||||
struct ggml_tensor * ggml_concat(
|
||||
struct ggml_context* ctx,
|
||||
struct ggml_tensor* a,
|
||||
struct ggml_tensor* b) {
|
||||
GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int dim) {
|
||||
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
|
||||
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
|
||||
if (d == dim) {
|
||||
ne[d] = a->ne[d] + b->ne[d];
|
||||
continue;
|
||||
}
|
||||
GGML_ASSERT(a->ne[d] == b->ne[d]);
|
||||
ne[d] = a->ne[d];
|
||||
}
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
@@ -4905,7 +4904,9 @@ struct ggml_tensor * ggml_concat(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
|
||||
|
||||
ggml_set_op_params_i32(result, 0, dim);
|
||||
|
||||
result->op = GGML_OP_CONCAT;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -5025,6 +5026,7 @@ struct ggml_tensor * ggml_leaky_relu(
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
|
||||
|
||||
result->op = GGML_OP_LEAKY_RELU;
|
||||
@@ -6956,38 +6958,6 @@ struct ggml_tensor * ggml_top_k(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
bool masked) {
|
||||
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
||||
// TODO: check if vT can be multiplied by (k*qT)
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (q->grad || k->grad || v->grad) {
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
|
||||
|
||||
int32_t t = masked ? 1 : 0;
|
||||
ggml_set_op_params(result, &t, sizeof(t));
|
||||
|
||||
result->op = GGML_OP_FLASH_ATTN;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = q;
|
||||
result->src[1] = k;
|
||||
result->src[2] = v;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn_ext
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn_ext(
|
||||
@@ -7047,38 +7017,6 @@ void ggml_flash_attn_ext_set_prec(
|
||||
ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
|
||||
}
|
||||
|
||||
// ggml_flash_ff
|
||||
|
||||
struct ggml_tensor * ggml_flash_ff(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b0,
|
||||
struct ggml_tensor * b1,
|
||||
struct ggml_tensor * c0,
|
||||
struct ggml_tensor * c1) {
|
||||
GGML_ASSERT(ggml_can_mul_mat(b0, a));
|
||||
// TODO: more checks
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
|
||||
|
||||
result->op = GGML_OP_FLASH_FF;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b0;
|
||||
result->src[2] = b1;
|
||||
result->src[3] = c0;
|
||||
result->src[4] = c1;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn_back
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn_back(
|
||||
@@ -7088,6 +7026,8 @@ struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * d,
|
||||
bool masked) {
|
||||
GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
|
||||
|
||||
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
||||
// TODO: check if vT can be multiplied by (k*qT)
|
||||
|
||||
@@ -11041,26 +10981,29 @@ static void ggml_compute_forward_concat_f32(
|
||||
GGML_ASSERT(nb00 == sizeof(float));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
||||
|
||||
GGML_ASSERT(dim >= 0 && dim < 4);
|
||||
|
||||
int64_t o[4] = {0, 0, 0, 0};
|
||||
o[dim] = src0->ne[dim];
|
||||
|
||||
const float * x;
|
||||
|
||||
// TODO: smarter multi-theading
|
||||
for (int i3 = 0; i3 < ne3; i3++) {
|
||||
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
||||
if (i2 < ne02) { // src0
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);
|
||||
|
||||
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
|
||||
*y = *x;
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
||||
} else {
|
||||
x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
||||
}
|
||||
}
|
||||
} // src1
|
||||
else {
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
|
||||
|
||||
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
|
||||
*y = *x;
|
||||
}
|
||||
float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11068,7 +11011,7 @@ static void ggml_compute_forward_concat_f32(
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_concat(
|
||||
const struct ggml_compute_params* params,
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor* dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
@@ -15717,400 +15660,6 @@ static void ggml_compute_forward_argsort(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_attn
|
||||
|
||||
static void ggml_compute_forward_flash_attn_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const bool masked,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * q = dst->src[0];
|
||||
const struct ggml_tensor * k = dst->src[1];
|
||||
const struct ggml_tensor * v = dst->src[2];
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t D = neq0;
|
||||
const int64_t N = neq1;
|
||||
const int64_t P = nek1 - N;
|
||||
const int64_t M = P + N;
|
||||
|
||||
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
GGML_ASSERT(ne0 == D);
|
||||
GGML_ASSERT(ne1 == N);
|
||||
GGML_ASSERT(P >= 0);
|
||||
|
||||
GGML_ASSERT(nbq0 == sizeof(float));
|
||||
GGML_ASSERT(nbk0 == sizeof(float));
|
||||
GGML_ASSERT(nbv0 == sizeof(float));
|
||||
|
||||
GGML_ASSERT(neq0 == D);
|
||||
GGML_ASSERT(nek0 == D);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
GGML_ASSERT(neq1 == N);
|
||||
GGML_ASSERT(nek1 == N + P);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by q rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in q
|
||||
const int nr = neq1*neq2*neq3;
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
const float scale = 1.0f/sqrtf(D);
|
||||
|
||||
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// q indices
|
||||
const int iq3 = ir/(neq2*neq1);
|
||||
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
||||
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
||||
|
||||
float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
|
||||
|
||||
for (int i = M; i < Mup; ++i) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
|
||||
const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
|
||||
for (int64_t ic = 0; ic < masked_begin; ++ic) {
|
||||
// k indices
|
||||
const int ik3 = iq3;
|
||||
const int ik2 = iq2 % nek2;
|
||||
const int ik1 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f32(neq0,
|
||||
S + i1, 0,
|
||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
}
|
||||
|
||||
// scale
|
||||
ggml_vec_scale_f32(masked_begin, S, scale);
|
||||
|
||||
for (int64_t i = masked_begin; i < M; i++) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
|
||||
// softmax
|
||||
// exclude known -INF S[..] values from max and loop
|
||||
// dont forget to set their SW values to zero
|
||||
{
|
||||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(masked_begin, &max, S);
|
||||
|
||||
ggml_float sum = 0.0;
|
||||
{
|
||||
#ifdef GGML_SOFT_MAX_ACCELERATE
|
||||
max = -max;
|
||||
vDSP_vsadd(S, 1, &max, S, 1, Mup);
|
||||
vvexpf(S, S, &Mup);
|
||||
ggml_vec_sum_f32(Mup, &sum, S);
|
||||
#else
|
||||
sum = ggml_vec_soft_max_f32(Mup, S, S, max);
|
||||
#endif
|
||||
}
|
||||
|
||||
assert(sum > 0.0);
|
||||
|
||||
sum = 1.0/sum;
|
||||
ggml_vec_scale_f32(masked_begin, S, sum);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < masked_begin; ++i) {
|
||||
assert(!isnan(S[i]));
|
||||
assert(!isinf(S[i]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int64_t ic = 0; ic < nev1; ++ic) {
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// v indices
|
||||
const int iv2 = iq2 % nev2;
|
||||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f32(masked_begin,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||
S, 0, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_flash_attn_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
const bool masked,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * q = dst->src[0];
|
||||
const struct ggml_tensor * k = dst->src[1];
|
||||
const struct ggml_tensor * v = dst->src[2];
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t D = neq0;
|
||||
const int64_t N = neq1;
|
||||
const int64_t P = nek1 - N;
|
||||
const int64_t M = P + N;
|
||||
|
||||
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
GGML_ASSERT(ne0 == D);
|
||||
GGML_ASSERT(ne1 == N);
|
||||
GGML_ASSERT(P >= 0);
|
||||
|
||||
GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
|
||||
|
||||
GGML_ASSERT(neq0 == D);
|
||||
GGML_ASSERT(nek0 == D);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
GGML_ASSERT(neq1 == N);
|
||||
GGML_ASSERT(nek1 == N + P);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by q rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in q
|
||||
const int nr = neq1*neq2*neq3;
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
const float scale = 1.0f/sqrtf(D);
|
||||
|
||||
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// q indices
|
||||
const int iq3 = ir/(neq2*neq1);
|
||||
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
||||
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
||||
|
||||
float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
|
||||
|
||||
for (int i = M; i < Mup; ++i) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
|
||||
if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
|
||||
for (int64_t ic = 0; ic < nek1; ++ic) {
|
||||
// k indices
|
||||
const int ik3 = iq3;
|
||||
const int ik2 = iq2 % nek2;
|
||||
const int ik1 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f16(neq0,
|
||||
S + i1, 0,
|
||||
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
}
|
||||
} else {
|
||||
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
||||
// k indices
|
||||
const int ik3 = iq3;
|
||||
const int ik2 = iq2 % nek2;
|
||||
const int ik1 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f16_unroll(neq0, nbk1,
|
||||
S + i1,
|
||||
((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
||||
}
|
||||
}
|
||||
|
||||
// scale
|
||||
ggml_vec_scale_f32(nek1, S, scale);
|
||||
|
||||
if (masked) {
|
||||
for (int64_t i = P; i < M; i++) {
|
||||
if (i > P + iq1) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// softmax
|
||||
// todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
|
||||
// dont forget to set their S values to zero
|
||||
{
|
||||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(M, &max, S);
|
||||
|
||||
ggml_float sum = 0.0;
|
||||
{
|
||||
#ifdef GGML_SOFT_MAX_ACCELERATE
|
||||
max = -max;
|
||||
vDSP_vsadd(S, 1, &max, S, 1, Mup);
|
||||
vvexpf(S, S, &Mup);
|
||||
ggml_vec_sum_f32(Mup, &sum, S);
|
||||
#else
|
||||
sum = ggml_vec_soft_max_f32(Mup, S, S, max);
|
||||
#endif
|
||||
}
|
||||
|
||||
assert(sum > 0.0);
|
||||
|
||||
sum = 1.0/sum;
|
||||
ggml_vec_scale_f32(M, S, sum);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < M; ++i) {
|
||||
assert(!isnan(S[i]));
|
||||
assert(!isinf(S[i]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
|
||||
|
||||
for (int64_t i = 0; i < M; i++) {
|
||||
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
||||
}
|
||||
|
||||
// todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16).
|
||||
if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
|
||||
for (int64_t ic = 0; ic < nev1; ++ic) {
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// v indices
|
||||
const int iv2 = iq2 % nev2;
|
||||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f16(nev0,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||
S16, 0, 1);
|
||||
}
|
||||
} else {
|
||||
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// v indices
|
||||
const int iv2 = iq2 % nev2;
|
||||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f16_unroll(nev0, nbv1,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
||||
S16);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_flash_attn(
|
||||
const struct ggml_compute_params * params,
|
||||
const bool masked,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * q = dst->src[0];
|
||||
|
||||
switch (q->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_flash_attn_f16(params, masked, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_flash_attn_f32(params, masked, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_attn_ext
|
||||
|
||||
static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
@@ -16344,165 +15893,6 @@ static void ggml_compute_forward_flash_attn_ext(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_ff
|
||||
|
||||
static void ggml_compute_forward_flash_ff_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * a = dst->src[0]; // F16
|
||||
const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
|
||||
const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
|
||||
const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
|
||||
const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, nea, a, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nba, a, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t D = nea0;
|
||||
//const int64_t N = nea1;
|
||||
const int64_t M = neb01;
|
||||
|
||||
GGML_ASSERT(ne0 == nea0);
|
||||
GGML_ASSERT(ne1 == nea1);
|
||||
GGML_ASSERT(ne2 == nea2);
|
||||
|
||||
GGML_ASSERT(nba0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbb10 == sizeof(float));
|
||||
GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbc10 == sizeof(float));
|
||||
|
||||
GGML_ASSERT(neb00 == D);
|
||||
GGML_ASSERT(neb01 == M);
|
||||
GGML_ASSERT(neb10 == M);
|
||||
GGML_ASSERT(neb11 == 1);
|
||||
|
||||
GGML_ASSERT(nec00 == M);
|
||||
GGML_ASSERT(nec01 == D);
|
||||
GGML_ASSERT(nec10 == D);
|
||||
GGML_ASSERT(nec11 == 1);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by a rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in a
|
||||
const int nr = nea1*nea2*nea3;
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// a indices
|
||||
const int ia3 = ir/(nea2*nea1);
|
||||
const int ia2 = (ir - ia3*nea2*nea1)/nea1;
|
||||
const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
|
||||
|
||||
float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
|
||||
|
||||
for (int64_t ic = 0; ic < neb01; ++ic) {
|
||||
// b0 indices
|
||||
const int ib03 = ia3;
|
||||
const int ib02 = ia2;
|
||||
const int ib01 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ib01;
|
||||
|
||||
ggml_vec_dot_f16(nea0,
|
||||
S + i1, 0,
|
||||
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
|
||||
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
|
||||
}
|
||||
|
||||
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
||||
//ggml_vec_gelu_f32(neb01, S, S);
|
||||
|
||||
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
||||
|
||||
for (int64_t i = 0; i < M; i++) {
|
||||
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
||||
}
|
||||
|
||||
ggml_vec_gelu_f16(neb01, S16, S16);
|
||||
|
||||
{
|
||||
// dst indices
|
||||
const int i1 = ia1;
|
||||
const int i2 = ia2;
|
||||
const int i3 = ia3;
|
||||
|
||||
for (int64_t ic = 0; ic < nec01; ++ic) {
|
||||
|
||||
ggml_vec_dot_f16(neb01,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
|
||||
S16, 0, 1);
|
||||
}
|
||||
|
||||
ggml_vec_add_f32(nec01,
|
||||
(float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
(float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
(float *) c1->data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_flash_ff(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * b0 = dst->src[1];
|
||||
|
||||
switch (b0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_flash_ff_f16(params, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_attn_back
|
||||
|
||||
static void ggml_compute_forward_flash_attn_back_f32(
|
||||
@@ -18073,21 +17463,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_leaky_relu(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
const int32_t t = ggml_get_op_params_i32(tensor, 0);
|
||||
GGML_ASSERT(t == 0 || t == 1);
|
||||
const bool masked = t != 0;
|
||||
ggml_compute_forward_flash_attn(params, masked, tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
ggml_compute_forward_flash_ff(params, tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
int32_t t = ggml_get_op_params_i32(tensor, 0);
|
||||
@@ -19094,7 +18473,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
struct ggml_tensor * flash_grad = NULL;
|
||||
@@ -19148,10 +18526,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
zero_table);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
GGML_ASSERT(false); // not supported
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
GGML_ASSERT(false); // not supported
|
||||
@@ -19838,15 +19212,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
@@ -20243,40 +19612,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
|
||||
cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
if (node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_BF16) {
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
const int64_t ne00 = node->src[0]->ne[0]; // D
|
||||
|
||||
cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
if (node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_BF16) {
|
||||
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
const int64_t D = node->src[0]->ne[0];
|
||||
@@ -22117,11 +21458,7 @@ size_t ggml_quantize_chunk(
|
||||
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
#if QK_K == 64
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
#else
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
#endif
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
@@ -23422,6 +22759,16 @@ int ggml_cpu_has_neon(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sve(void) {
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// TODO: Currently, SVE 256 bit is only supported.
|
||||
GGML_ASSERT(svcntb() == QK8_0);
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_arm_fma(void) {
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
return 1;
|
||||
|
||||
@@ -481,9 +481,7 @@ extern "C" {
|
||||
GGML_OP_ARGSORT,
|
||||
GGML_OP_LEAKY_RELU,
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_ATTN_EXT,
|
||||
GGML_OP_FLASH_FF,
|
||||
GGML_OP_FLASH_ATTN_BACK,
|
||||
GGML_OP_SSM_CONV,
|
||||
GGML_OP_SSM_SCAN,
|
||||
@@ -1009,12 +1007,13 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// concat a and b on dim 2
|
||||
// concat a and b along dim
|
||||
// used in stable-diffusion
|
||||
GGML_API struct ggml_tensor * ggml_concat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b,
|
||||
int dim);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_abs(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1761,13 +1760,6 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
int k);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
bool masked);
|
||||
|
||||
#define GGML_KQ_MASK_PAD 32
|
||||
|
||||
// q: [n_embd, n_batch, n_head, 1]
|
||||
@@ -1788,6 +1780,7 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec);
|
||||
|
||||
// TODO: needs to be adapted to ggml_flash_attn_ext
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
@@ -1796,14 +1789,6 @@ extern "C" {
|
||||
struct ggml_tensor * d,
|
||||
bool masked);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_ff(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b0,
|
||||
struct ggml_tensor * b1,
|
||||
struct ggml_tensor * c0,
|
||||
struct ggml_tensor * c1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * s,
|
||||
@@ -2420,6 +2405,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
|
||||
@@ -2609,7 +2609,8 @@ layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||
layout (binding = 1) readonly buffer Y {int data_b[];};
|
||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||
layout (binding = 2) readonly buffer Z {float data_freq_factors[];};
|
||||
layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint ncols;
|
||||
@@ -2622,6 +2623,7 @@ layout (push_constant) uniform parameter {
|
||||
float corr_dims[4];
|
||||
float theta_scale;
|
||||
float inv_ndims;
|
||||
uint has_freq_facs;
|
||||
} p;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const uint i0) {
|
||||
@@ -2671,7 +2673,8 @@ void main() {
|
||||
const float cur_rot = p.inv_ndims * ic - ib;
|
||||
|
||||
const int pos = data_b[i2];
|
||||
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f);
|
||||
const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
|
||||
const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta);
|
||||
|
||||
+89
-13
@@ -33,17 +33,21 @@ class Keys:
|
||||
FILE_TYPE = "general.file_type"
|
||||
|
||||
class LLM:
|
||||
VOCAB_SIZE = "{arch}.vocab_size"
|
||||
CONTEXT_LENGTH = "{arch}.context_length"
|
||||
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
BLOCK_COUNT = "{arch}.block_count"
|
||||
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
VOCAB_SIZE = "{arch}.vocab_size"
|
||||
CONTEXT_LENGTH = "{arch}.context_length"
|
||||
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
BLOCK_COUNT = "{arch}.block_count"
|
||||
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
|
||||
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
||||
EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
|
||||
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
||||
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
|
||||
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -55,6 +59,8 @@ class Keys:
|
||||
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
||||
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
||||
CAUSAL = "{arch}.attention.causal"
|
||||
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
|
||||
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
|
||||
|
||||
class Rope:
|
||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
@@ -64,6 +70,7 @@ class Keys:
|
||||
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
||||
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
||||
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
||||
SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
|
||||
|
||||
class SSM:
|
||||
CONV_KERNEL = "{arch}.ssm.conv_kernel"
|
||||
@@ -139,6 +146,8 @@ class MODEL_ARCH(IntEnum):
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
OLMO = auto()
|
||||
ARCTIC = auto()
|
||||
DEEPSEEK2 = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -167,6 +176,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_ACT = auto()
|
||||
FFN_NORM_EXP = auto()
|
||||
FFN_GATE_EXP = auto()
|
||||
FFN_DOWN_EXP = auto()
|
||||
FFN_UP_EXP = auto()
|
||||
@@ -183,6 +193,12 @@ class MODEL_TENSOR(IntEnum):
|
||||
SSM_A = auto()
|
||||
SSM_D = auto()
|
||||
SSM_OUT = auto()
|
||||
ATTN_Q_A = auto()
|
||||
ATTN_Q_B = auto()
|
||||
ATTN_KV_A_MQA = auto()
|
||||
ATTN_KV_B = auto()
|
||||
ATTN_Q_A_NORM = auto()
|
||||
ATTN_KV_A_NORM = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
@@ -218,6 +234,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
MODEL_ARCH.OLMO: "olmo",
|
||||
MODEL_ARCH.ARCTIC: "arctic",
|
||||
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -251,6 +269,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||
@@ -262,6 +281,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
||||
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
@@ -732,6 +757,54 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.ARCTIC: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_NORM_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_Q_A,
|
||||
MODEL_TENSOR.ATTN_Q_B,
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA,
|
||||
MODEL_TENSOR.ATTN_KV_B,
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM,
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP,
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
||||
MODEL_TENSOR.FFN_UP_SHEXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -765,6 +838,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.DEEPSEEK2: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
}
|
||||
|
||||
#
|
||||
@@ -905,9 +982,8 @@ class GGUFValueType(IntEnum):
|
||||
raise ValueError(f"Unknown type: {type(val)}")
|
||||
|
||||
|
||||
# Note: Does not support GGML_QKK_64
|
||||
QK_K = 256
|
||||
# Items here are (block size, type size)
|
||||
QK_K = 256
|
||||
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||
GGMLQuantizationType.F32: (1, 4),
|
||||
GGMLQuantizationType.F16: (1, 2),
|
||||
|
||||
@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
|
||||
from .quants import quant_shape_to_byte_shape
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
from pathlib import Path
|
||||
@@ -251,6 +253,7 @@ class GGUFReader:
|
||||
tensor_names.add(tensor_name)
|
||||
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
||||
n_elems = int(np.prod(dims))
|
||||
np_dims = tuple(reversed(dims.tolist()))
|
||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
||||
n_bytes = n_elems * type_size // block_size
|
||||
data_offs = int(start_offs + offset_tensor[0])
|
||||
@@ -279,6 +282,7 @@ class GGUFReader:
|
||||
else:
|
||||
item_count = n_bytes
|
||||
item_type = np.uint8
|
||||
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
|
||||
tensors.append(ReaderTensor(
|
||||
name = tensor_name,
|
||||
tensor_type = ggml_type,
|
||||
@@ -286,7 +290,7 @@ class GGUFReader:
|
||||
n_elements = n_elems,
|
||||
n_bytes = n_bytes,
|
||||
data_offset = data_offs,
|
||||
data = self._get(data_offs, item_type, item_count),
|
||||
data = self._get(data_offs, item_type, item_count).reshape(np_dims),
|
||||
field = field,
|
||||
))
|
||||
self.tensors = tensors
|
||||
|
||||
@@ -13,7 +13,6 @@ from string import ascii_letters, digits
|
||||
import numpy as np
|
||||
|
||||
from .constants import (
|
||||
GGML_QUANT_SIZES,
|
||||
GGUF_DEFAULT_ALIGNMENT,
|
||||
GGUF_MAGIC,
|
||||
GGUF_VERSION,
|
||||
@@ -26,6 +25,8 @@ from .constants import (
|
||||
TokenType,
|
||||
)
|
||||
|
||||
from .quants import quant_shape_from_byte_shape
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -229,10 +230,7 @@ class GGUFWriter:
|
||||
else:
|
||||
dtype = raw_dtype
|
||||
if tensor_dtype == np.uint8:
|
||||
block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
|
||||
if tensor_shape[-1] % type_size != 0:
|
||||
raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
|
||||
tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
|
||||
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += self._pack("I", n_dims)
|
||||
for i in range(n_dims):
|
||||
@@ -376,9 +374,15 @@ class GGUFWriter:
|
||||
def add_block_count(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
|
||||
|
||||
def add_leading_dense_block_count(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)
|
||||
|
||||
def add_feed_forward_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_expert_feed_forward_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
def add_parallel_residual(self, use: bool) -> None:
|
||||
self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||
|
||||
@@ -409,6 +413,12 @@ class GGUFWriter:
|
||||
def add_expert_used_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_expert_shared_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_expert_weights_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_layer_norm_eps(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
||||
|
||||
@@ -418,6 +428,12 @@ class GGUFWriter:
|
||||
def add_causal_attention(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
|
||||
|
||||
def add_q_lora_rank(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)
|
||||
|
||||
def add_kv_lora_rank(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)
|
||||
|
||||
def add_pooling_type(self, value: PoolingType) -> None:
|
||||
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
|
||||
|
||||
@@ -442,6 +458,9 @@ class GGUFWriter:
|
||||
def add_rope_scaling_finetuned(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)
|
||||
|
||||
def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
|
||||
self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)
|
||||
|
||||
def add_ssm_conv_kernel(self, value: int) -> None:
|
||||
self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
|
||||
|
||||
|
||||
+15
-1
@@ -1,5 +1,5 @@
|
||||
from __future__ import annotations
|
||||
from typing import Callable
|
||||
from typing import Callable, Sequence
|
||||
|
||||
from numpy.typing import DTypeLike
|
||||
|
||||
@@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
|
||||
import numpy as np
|
||||
|
||||
|
||||
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||
if shape[-1] % block_size != 0:
|
||||
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
||||
return (*shape[:-1], shape[-1] // block_size * type_size)
|
||||
|
||||
|
||||
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||
if shape[-1] % type_size != 0:
|
||||
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
||||
return (*shape[:-1], shape[-1] // type_size * block_size)
|
||||
|
||||
|
||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
||||
n = n.astype(np.float32, copy=False).view(np.int32)
|
||||
|
||||
@@ -244,6 +244,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -255,6 +256,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
|
||||
),
|
||||
|
||||
# AWQ-activation gate
|
||||
@@ -272,6 +274,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
||||
"transformer.h.{bid}.mlp.linear_1", # refact
|
||||
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
@@ -283,6 +286,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
|
||||
),
|
||||
|
||||
# Feed-forward down
|
||||
@@ -306,6 +310,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||
"model.layers.{bid}.residual_mlp.w2", # arctic
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
@@ -317,6 +322,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
@@ -380,6 +386,42 @@ class TensorNameMap:
|
||||
"model.layers.{bid}.out_proj",
|
||||
"backbone.layers.{bid}.mixer.out_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_A: (
|
||||
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_B: (
|
||||
"model.layers.{bid}.self_attn.q_b_proj", # deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA: (
|
||||
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_KV_B: (
|
||||
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM: (
|
||||
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM: (
|
||||
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
|
||||
),
|
||||
}
|
||||
|
||||
# architecture-specific block mappings
|
||||
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
|
||||
MODEL_ARCH.ARCTIC: {
|
||||
MODEL_TENSOR.FFN_NORM: (
|
||||
"model.layers.{bid}.residual_layernorm",
|
||||
),
|
||||
MODEL_TENSOR.FFN_NORM_EXP: (
|
||||
"model.layers.{bid}.post_attention_layernorm",
|
||||
),
|
||||
},
|
||||
}
|
||||
|
||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||
@@ -393,12 +435,14 @@ class TensorNameMap:
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
if arch in self.arch_block_mappings_cfg:
|
||||
self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
|
||||
for bid in range(n_blocks):
|
||||
for tensor, keys in self.block_mappings_cfg.items():
|
||||
if tensor not in MODEL_TENSORS[arch]:
|
||||
continue
|
||||
# TODO: make this configurable
|
||||
n_experts = 60
|
||||
n_experts = 160
|
||||
for xid in range(n_experts):
|
||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
|
||||
@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
|
||||
|
||||
for tensor in reader.tensors:
|
||||
total_bytes += tensor.n_bytes
|
||||
# Dimensions are written in reverse order, so flip them first
|
||||
shape = np.flipud(tensor.shape).tolist()
|
||||
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
||||
writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
||||
|
||||
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
||||
|
||||
|
||||
@@ -85,6 +85,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -264,6 +265,8 @@ extern "C" {
|
||||
bool check_tensors; // validate model tensor data
|
||||
};
|
||||
|
||||
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
||||
// https://github.com/ggerganov/llama.cpp/pull/7544
|
||||
struct llama_context_params {
|
||||
uint32_t seed; // RNG seed, -1 for random
|
||||
uint32_t n_ctx; // text context, 0 = from model
|
||||
@@ -290,14 +293,14 @@ extern "C" {
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
|
||||
enum ggml_type type_k; // data type for K cache
|
||||
enum ggml_type type_v; // data type for V cache
|
||||
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
|
||||
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
|
||||
|
||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
@@ -759,6 +762,12 @@ extern "C" {
|
||||
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
||||
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
||||
|
||||
// Get the number of threads used for generation of a single token.
|
||||
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
|
||||
|
||||
// Get the number of threads used for prompt and batch processing (multiple token).
|
||||
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
|
||||
|
||||
// Set whether to use causal attention or not
|
||||
// If set to true, the model will only attend to the past tokens
|
||||
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
||||
@@ -817,6 +826,9 @@ extern "C" {
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Identify if Token Id is a control token or a render-able token
|
||||
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
|
||||
+17
-11
@@ -1259,22 +1259,26 @@ struct test_im2col : public test_case {
|
||||
// GGML_OP_CONCAT
|
||||
struct test_concat : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const int64_t b_ne2;
|
||||
const std::array<int64_t, 4> ne_a;
|
||||
const int64_t ne_b_d;
|
||||
const int dim;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR3(type, ne, b_ne2);
|
||||
return VARS_TO_STR4(type, ne_a, ne_b_d, dim);
|
||||
}
|
||||
|
||||
test_concat(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
||||
int64_t b_ne2 = 10)
|
||||
: type(type), ne(ne), b_ne2(b_ne2) {}
|
||||
std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
|
||||
int64_t ne_b_d = 10,
|
||||
int dim = 2)
|
||||
: type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
|
||||
ggml_tensor * out = ggml_concat(ctx, a, b);
|
||||
auto ne_b = ne_a;
|
||||
ne_b[dim] = ne_b_d;
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
|
||||
ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
|
||||
ggml_tensor * out = ggml_concat(ctx, a, b, dim);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
@@ -2211,8 +2215,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
|
||||
for (int dim : { 0, 1, 2, 3, }) {
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim));
|
||||
test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim));
|
||||
}
|
||||
|
||||
for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user